diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..ee726bf4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,264 @@ +# History files +.Rhistory +.Rapp.history + +# Session Data files +.RData +.RDataTmp + +# User-specific files +.Ruserdata + +# Example code in package build process +*-Ex.R + +# Output files from R CMD build +/*.tar.gz + +# Output files from R CMD check +/*.Rcheck/ + +# RStudio files +.Rproj.user/ + +# produced vignettes +vignettes/*.html +vignettes/*.pdf + +# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 +.httr-oauth + +# knitr and R markdown default cache directories +*_cache/ +/cache/ + +# Temporary files created by R markdown +*.utf8.md +*.knit.md + +# R Environment Variables +.Renviron + +# pkgdown site +docs/ + +# translation temp files +po/*~ + +# RStudio Connect folder +rsconnect/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/*.code-snippets + +# Local History for Visual Studio Code +.history/ + +# Built Visual Studio Code Extensions +*.vsix + +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/*.code-snippets + +# Local History for Visual Studio Code +.history/ + +# Built Visual Studio Code Extensions +*.vsix diff --git a/Fraser/1.99.3/environment.yml b/Fraser/1.99.3/environment.yml new file mode 100644 index 00000000..73a13da2 --- /dev/null +++ b/Fraser/1.99.3/environment.yml @@ -0,0 +1,7 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::fastqc=0.12.1 + \ No newline at end of file diff --git a/Fraser/1.99.3/fraser.R b/Fraser/1.99.3/fraser.R new file mode 100644 index 00000000..1e9491b3 --- /dev/null +++ b/Fraser/1.99.3/fraser.R @@ -0,0 +1,152 @@ +# load FRASER library +suppressPackageStartupMessages(library(FRASER)) +suppressPackageStartupMessages(library(biomaRt)) +suppressPackageStartupMessages(library(utils)) + +args <- commandArgs() + + +get_st_rows <- function(input){ + if(dir.exists(input)){ + sampleID <- list.files(path=input,pattern="*.bam",full.names=FALSE) + bamFile <- list.files(path=input,pattern="*.bam",full.names=TRUE) + return(data.table(sampleID,bamFile)) + } + else if(file.exists(input)){ + sampleID <- basename(file.path(input)) + bamFile <- file.path(input) + return(data.table(sampleID,bamFile)) + } +} + + +create_sample_table <- function(sample,ref,paired_end){ + sampleTable <- rbind(get_st_rows(sample),get_st_rows(ref)) + sampleTable <- sampleTable[!grepl(".bai",sampleID), ] + sampleTable$group <- c(1:nrow(sampleTable)) + sampleTable$pairedEnd <- TRUE + + message(date(), ": sampleTable") + print(sampleTable) + + return(sampleTable) +} + + +run_fraser <- function(sampleTable){ + # create FRASER object + settings <- FraserDataSet(colData=sampleTable, workingDir="./FRASER_output") + message(date(), ": settings...") + print(settings) + + register(MulticoreParam(workers=min(8, multicoreWorkers()))) + + # count reads + fds <- countRNAData(settings) + + fds <- calculatePSIValues(fds) + + #FRASER vignette/gagneurlab settings: + #Currently, we suggest keeping only junctions which support the following: + #minExpressionInOneSample - 20 - At least one sample has 20 (or more) reads + #quantileMinExpression - 10 - The minimum total read count (N) an intron needs to have at the specified quantile across samples to pass the filter. See quantileForFiltering. + #quantile - 0.95 - Defines at which percentile the quantileMinExpression filter is applied. # nolint + #A value of 0.95 means that at least 5% of the samples need to have a total read count N >= quantileMinExpression to pass the filter. + #minDeltaPsi - 0.05 - The minimal variation (in delta psi) required for an intron to pass the filter. gagneurlab suggests 0.05 + + fds <- filterExpressionAndVariability(fds, minExpressionInOneSample = 20, quantileMinExpression=10, quantile = 0.95, minDeltaPsi=0.05, filter=FALSE) + fds <- saveFraserDataSet(fds, name="fds-filter") + plotheat = "plots_fraser.pdf" + pdf(plotheat,onefile = TRUE) + plotFilterExpression(fds, bins=100) + + message(date(), ": dimension before filtering...") + print(dim(assays(fds)$rawCountsJ)) + print(dim(assays(fds)$rawCountsSS)) + + fds <- fds[mcols(fds, type="j")[,"passed"]] + message(date(), ": dimension after filtering...") + print(dim(assays(fds)$rawCountsJ)) + print(dim(assays(fds)$rawCountsSS)) + + #Finds the optimal encoding dimension by injecting artificial splicing outlier ratios while maximizing the precision-recall curve. + # Get range for latent space dimension, max tested dimensions = 6 + mp <- 6 + a <- 2 + b <- min(ncol(fds), nrow(fds)) / mp # N/mp + + maxSteps <- 12 + if(mp < 6){ + maxSteps <- 15 + } + + Nsteps <- min(maxSteps, b) + pars_q <- unique(round(exp(seq(log(a),log(b),length.out = Nsteps)))) + + message(date(), ": getEncDimRange", pars_q) + + fds <- saveFraserDataSet(fds, name="fds-normfalse") + # Heatmap of the sample correlation + # plotCountCorHeatmap(fds, type="jaccard", logit=FALSE, normalized=FALSE) + # Heatmap of the intron/sample expression + # plotCountCorHeatmap(fds, type="jaccard", logit=FALSE, normalized=FALSE, plotType="junctionSample", topJ=100, minDeltaPsi=0.05) + + #set.seed(as.integer(random)) + # hyperparameter opimization can be used to estimate the dimension q of the latent space of the data. + # It works by artificially injecting outliers into the data and then comparing the AUC of recalling these outliers for different values of q. + fds <- optimHyperParams(fds, type="jaccard", implementation="PCA", + q_param=pars_q, minDeltaPsi=0.05, plot=FALSE) + # retrieve the estimated optimal dimension of the latent space + bestq = bestQ(fds, type="jaccard") + message(date(), "...find best q: " ,bestq) + + # Add verbosity to the FRASER object + verbose(fds) <- 3 + fds <- fit(fds, q=bestq, type="jaccard", implementation="PCA", iterations=15) + + # Heatmap of the sample correlation + # plotCountCorHeatmap(fds, type="jaccard", logit=FALSE, normalized=TRUE) + # Heatmap of the intron/sample expression + # plotCountCorHeatmap(fds, type="jaccard", logit=FALSE, normalized=TRUE, plotType="junctionSample", topJ=100, minDeltaPsi=0.05) + dev.off() + fds <- saveFraserDataSet(fds, name="fds-normtrue") + + #all in one getEncDimRange, optimHyperParams, bestQ, fit + #fds_fraser <- FRASER(fds_filtered, q=c(jaccard=2)) + + fds <- annotateRanges(fds, GRCh = 38) + + # Pvalues + fds <- calculatePvalues(fds, type="jaccard") + # Adjust Pvalues + fds <- calculatePadjValues(fds, type="jaccard") + + fds <- saveFraserDataSet(fds, name="fds-final") + + return(fds) +} + + +write_fraser_output <- function(fraser_ds, prefix){ + # retrieve results with default and recommended cutoffs (padj <= 0.05 (0.1 gagneurlab) and # |deltaPsi| >= 0.3) + fraser_res <- results(fraser_ds, all=TRUE) + message(date(), ": result...") + print(fraser_res) + + write.table(as.data.frame(unname(fraser_res)), paste0("fraser_result_",prefix,".tsv"), sep="\t", quote=FALSE) +} + + +main <- function(args){ + ##Argument.Parser not in docker image.. + sample <- args[6] #: bam + bai sample of interest + refset <- args[7] #: dir to ref set bams + bai + prefix <- args[8] #: prefix + pairedEnd <- args[9] #: pairedEnd (=TRUE) + sampleTable <- create_sample_table(sample, refset, pairedEnd) + fraser_ds <- run_fraser(sampleTable) + write_fraser_output(fraser_ds, prefix) +} + + +main(args) diff --git a/Fraser/1.99.3/main.nf b/Fraser/1.99.3/main.nf new file mode 100644 index 00000000..29ed828b --- /dev/null +++ b/Fraser/1.99.3/main.nf @@ -0,0 +1,26 @@ + +process FRASER { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "quay.io/biocontainers/bioconductor-fraser:1.99.3--r43hf17093f_0" + + input: + tuple val(meta), path(input) + path(refset) + + output: + path("*.tsv"), emit: tsv +// path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + export BIOMART_CACHE=/hpc/diaggen/projects/woc/rna/container + Rscript ${moduleDir}/fraser.R ${input} ${refset} ${prefix} TRUE + """ +} diff --git a/Fraser/1.99.3/meta.yml b/Fraser/1.99.3/meta.yml new file mode 100644 index 00000000..fe11285f --- /dev/null +++ b/Fraser/1.99.3/meta.yml @@ -0,0 +1,25 @@ +name: fraser +description: Run FRASER on a set of bam files +keywords: + - fraser + - abberant splicing +tools: + - fraser: + description: FRASER - Find RAre Splicing Events in RNA-seq Data + homepage: + documentation: https://www.bioconductor.org/packages/release/bioc/vignettes/FRASER/inst/doc/FRASER.pdf + licence: +input: + - bams: + type: path + description: | + path directory where bam files are located +output: + - meta: + type: file + description: | + tsv file with aberrant spling output per sample +authors: + - +maintainers: + - diff --git a/Outrider/1.20.0/Dockerfile b/Outrider/1.20.0/Dockerfile new file mode 100644 index 00000000..d4510780 --- /dev/null +++ b/Outrider/1.20.0/Dockerfile @@ -0,0 +1,19 @@ +################## BASE IMAGE ###################### +FROM --platform=linux/amd64 quay.io/biocontainers/bioconductor-outrider:1.18.1--r43hf17093f_0 + +################## METADATA ###################### +LABEL base_image="outrider:1.18.1--r43hf17093f_0" +LABEL version="1.18.1--r43hf17093f_0" +LABEL software="dependencies_custom_outrider" +LABEL about.summary="" +LABEL about.home="https://github.com/gagneurlab/OUTRIDER" +LABEL about.documentation="https://github.com/gagneurlab/OUTRIDER" +# LABEL extra.binaries="R" + +################## INSTALLATIONS ###################### + +RUN R -e "install.packages('argparse', dependencies=TRUE, repos='http://cran.rstudio.com/')" +RUN R -e "install.packages('tools', dependencies=TRUE, repos='http://cran.rstudio.com/')" +RUN R -e "install.packages('tibble', dependencies=TRUE, repos='http://cran.rstudio.com/')" +RUN R -e "install.packages('readr', dependencies=TRUE, repos='http://cran.rstudio.com/')" +RUN R -e "install.packages('dplyr', dependencies=TRUE, repos='http://cran.rstudio.com/')" diff --git a/Outrider/1.20.0/environment.yml b/Outrider/1.20.0/environment.yml new file mode 100644 index 00000000..f52a53a0 --- /dev/null +++ b/Outrider/1.20.0/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::fastqc=0.12.1 diff --git a/Outrider/1.20.0/main.nf b/Outrider/1.20.0/main.nf new file mode 100644 index 00000000..283a58d3 --- /dev/null +++ b/Outrider/1.20.0/main.nf @@ -0,0 +1,27 @@ +process OUTRIDER { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "ghcr.io/umcugenetics/outrider_custom:0.0.1" + + input: + tuple val(meta), path(counts) + path(refset) + path(genome_gtf) + + output: + path("*.tsv"), emit: tsv + path("*.csv"), emit: csv + path("*.pdf"), emit: pdf +// path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + Rscript ${moduleDir}/outrider.R ${counts} -r ${refset} -p ${prefix} -g ${genome_gtf} + """ +} diff --git a/Outrider/1.20.0/meta.yml b/Outrider/1.20.0/meta.yml new file mode 100644 index 00000000..d5d083ff --- /dev/null +++ b/Outrider/1.20.0/meta.yml @@ -0,0 +1,38 @@ +name: outrider +description: Run Outrider on feature counts +keywords: + - outrider + - abberant expression +tools: + - outrider: + description: | + homepage: + documentation: + licence: +input: + - meta: + type: map + description: | + Groovy Map containing sample information + - counts: + type: file + description: | + Input file to measure aberrant expression + - ref: + type: file + description: | + List of files as a reference to measure aberrant expression to + - feature: + type: value + description: | + Value for the feature exon or gene to be used in the output file +output: + - meta: + type: file + description: | + tsv file with gene or exon values regarding aberrant expression +authors: + - +maintainers: + - + diff --git a/Outrider/1.20.0/outrider.R b/Outrider/1.20.0/outrider.R new file mode 100644 index 00000000..aa964e80 --- /dev/null +++ b/Outrider/1.20.0/outrider.R @@ -0,0 +1,256 @@ +#!/usr/bin/env Rscript + +# Import statements, alphabetic order of main package. +suppressPackageStartupMessages(library("argparse")) +suppressPackageStartupMessages(library("OUTRIDER")) +suppressPackageStartupMessages(library("tools")) +suppressPackageStartupMessages(library("tibble")) +suppressPackageStartupMessages(library("readr")) +suppressPackageStartupMessages(library("dplyr")) +suppressPackageStartupMessages(library("stringr")) + +# Argument parser +parser <- ArgumentParser(description = "Process some integers") +parser$add_argument("query", metavar = "query_input_files", nargs = "+", + help = "Files or directories containing the query input files.") +parser$add_argument("-o","--output_path", metavar = "output_path", nargs = "+", + help = "Path where output of OUTRIDER will be stored.", default="./") +parser$add_argument("-r", "--ref", metavar = "reference_input_files", nargs = "+", + help = "Files or directories containing the reference input files.") +parser$add_argument("-p", "--pref", metavar = "prefix", nargs = "+", + help = "Prefix of file.", default="gene") +parser$add_argument("-g", "--gtf", metavar = "genome_gtf", nargs = "+", + help = "Genome gtf file") +args <- parser$parse_args() + + +read_input_files <- function(input){ + sampleIDs <- c() + count_tables <- lapply(input, function(f) { + input_ext <- tools::file_ext(f) + if(input_ext == "Rds") { + rse <- readRDS(f) # RangedSummarizedExperiment + sampleIDs <- append(sampleIDs, colnames(rse)) + return(as_tibble(rownames_to_column(as.data.frame(assays(rse)$counts), var="rownames"))) + } else if (input_ext %in% c("txt", "tsv")) { + count_table <- read_delim(f, show_col_types=FALSE, skip=1) + # col 1: samples IDs, col 9: counts + ct <- count_table[,c(1,9)] + names(ct)[1] <- "rownames" + sampleIDs <<- append(sampleIDs, colnames(ct)[2]) + return(ct) + } else { + stop("Input file extension is not supported.") + } + }) + return(list("sampleIDs"=sampleIDs, "count_tables"=count_tables)) +} + + +get_input <- function(input){ + # If directories are provided + if(all(sapply(input, function(x) dir.exists(x)))) { + retrieved_files <- sapply(input, function(d){ + list.files(path = d, pattern = "Rds|txt|tsv", full.names = TRUE, recursive = FALSE) + }) + } else if(all(sapply(input, function(x) file.exists(x)))) { # If files are provided. + retrieved_files <- input + } else { # If both dir and files are provided, or different tyes (character, int etc) + stop("Input is neither dir or file.") + } + return(read_input_files(retrieved_files)) +} + + +merge_count_tables <- function(lst_query, lst_ref){ + # merge count tables together. + lst_count_tables <- c(lst_query, lst_ref) + all_counts <- lst_count_tables %>% + Reduce(function(dtf1,dtf2) dplyr::full_join(dtf1, dtf2,by="rownames"), .) + return(all_counts) +} + + +filter_expression <- function(ods, query, prefix, gtf){ + if(grepl("gene", prefix)){ + ##FOR GENE LEVEL + ods <- filterExpression(ods, fpkmCutoff = 1, minCounts = FALSE, filterGenes = FALSE, gtfFile=gtf) + } + else{ + ##FOR EXON LEVEL + ct <- read_delim(query, show_col_types=FALSE, skip=1) + print("identical exon ids:") + print(identical(ct$Geneid,rownames(assays(ods)$counts))) + readD <- apply(assays(ods)$counts, 2, function(x) x / sum(x) * 10^6) + countsFPKM <- readD / ct$Length * 10^3 + perc95e <- apply(countsFPKM, 1, function(x) quantile(x,probs=0.95)) + mcols(ods)$passedFilter <- perc95e>1 + mcols(ods)$basepairs <- ct$Length + + ##FOR EXON RATIO + ##REMOVE ##FOR EXON LEVEL code above + ##UNCOMMENT BELOW STEPS + # print("EXON RATIO") + # ods <- filterExpression(ods, minCounts = TRUE, filterGenes = FALSE) + } + + # display the FPKM distribution of counts. + #ods <- plotFPKM(ods) + message(date(), ": dim before filtering...") + print(dim(assays(ods)$counts)) + + ##SUBSETTING + ods <- ods[mcols(ods)$passedFilter,] + message(date(), ": dim after filtering...") + print(dim(assays(ods)$counts)) + return(ods) +} + + +run_outrider <- function(all_counts, query, prefix, gtf) { + # TODO: change to add to single object (ods) in case OOM, instead of renaming the vars. + all_counts_matrix <- as.matrix(all_counts)[,-1] + mode(all_counts_matrix) <- "integer" + rownames(all_counts_matrix) <- all_counts$rownames + + ods <- OutriderDataSet(countData=all_counts_matrix) + + plotheat = "counts_heatplots.pdf" + pdf(plotheat,onefile = TRUE) + + ods <- filter_expression(ods, query, prefix, gtf) + + # Heatmap of the sample correlation + # it can also annotate the clusters resulting from the dendrogram + ods <- plotCountCorHeatmap(ods, normalized=FALSE) + + # Heatmap of the gene/sample expression + ods <- plotCountGeneSampleHeatmap(ods, normalized=FALSE) + + ods <- estimateSizeFactors(ods) + message(date(), ": sizeFactors...") + print(ods$sizeFactor) + + pars_q <- seq(2, min(100, ncol(ods) - 1, nrow(ods) - 1), 2) + + message(date(), ": findEncodingDim...") + ods <- findEncodingDim(ods, params = pars_q, implementation="autoencoder", BPPARAM=MulticoreParam(8)) + + #find best q (dimension) + opt_q <- getBestQ(ods) + message(date(), ": getbestQ...") + print(opt_q) + + ods <- controlForConfounders(ods, q = opt_q, BPPARAM = MulticoreParam(8)) + + # After controlling for confounders the heatmap should be plotted again. + # If it worked, no batches should be present and the correlations between samples should be reduced and close to zero. [1]* + ods <- plotCountCorHeatmap(ods, normalized=TRUE) + ods <- plotCountGeneSampleHeatmap(ods, normalized=TRUE) + dev.off() + +# if(grepl("^(peer|pca)$", implementation)){ +# message(date(), ": Fitting the data ...") # NOT with autoencoder +# ods <- fit(ods, BPPARAM=MulticoreParam(8)) +# } + + ods <- computePvalues(ods, alternative="two.sided", method="BY", BPPARAM = MulticoreParam(8)) + out <- computeZscores(ods) + +# run full OUTRIDER pipeline (control, fit model, calculate P-values) +# out <- OUTRIDER(ods, BPPARAM=MulticoreParam(8)) + return(out) +} + +##Only necessary for Erasmus app +get_ct_emcapp <- function(query, prefix){ + count_table <- read_delim(query, show_col_types=FALSE, skip=1) + ct <- count_table[,c(2,3,4,8,1)] + colnames(ct)<- c("chr","start","end","gene","geneID") + ct$chr <- str_split_fixed(ct$chr,';',2)[,1] + ct$start <- str_split_fixed(ct$start,';',2)[,1] + ct$end <- gsub("^.*;", "", ct$end) + + ##For gene level: create countdata table Identifier chr_start_end_genename + ct$emcID <- paste(ct$chr,ct$start,ct$end,ct$gene,sep="_") + ##For exon level: create countdata table Identifier: add Ensemble exon_id: chr_start_end_genename-exon_id + if(grepl("exon", prefix)){ + ct$emcID <- paste(ct$emcID,ct$geneID,sep="-") + } + + return(ct) +} + +##Only necessary for Erasmus app +save_res_emcapp <- function(ct, res, prefix, out_path){ + res <- as_tibble(results(res, all=TRUE))[,c(2,1,3:14)] + res_merge <- merge(res, ct, by = "geneID", all.res = TRUE) + res_merge$geneENSG <- res_merge$geneID + res_merge$geneID <- res_merge$emcID + res_merge$sample_name<-NA + res_merge$TIN_mean<-NA + res_merge$link_bam<-NA + + fragment <- "genes" + treatment <- "untreated" + if(grepl("exon",prefix)){ fragment <- "exons"} + if(grepl("chx|CHX",prefix)){ treatment <- "CHX"} + write_csv(res_merge[,c(1:18,20:23)], paste0(out_path, "umcu_rnaseq_fib_",treatment,"_res_outrider_",fragment,"_counts.tsv"), append=FALSE, col_names = TRUE) +} + +##Only necessary for Erasmus app +save_count_meta_emcapp <- function(ct, all_counts, out_path, prefix){ + ct <- ct[,c("emcID","geneID")] + colnames(all_counts)[1]<-c("geneID") + merge_counts <- merge(all_counts, ct, by = "geneID", all.ct = TRUE) + merge_counts$geneID <- merge_counts$emcID + counts_out <- merge_counts[,c(1:ncol(merge_counts)-1)] + colnames(counts_out)[1]<-c("") + + ##countdata output table EMC app + if(grepl("gene", prefix)){ + write_tsv(counts_out, paste0(out_path, "umcu_rnaseq_genes_counts.tsv")) + }else{ + write_tsv(counts_out, paste0(out_path, "umcu_rnaseq_exons_counts.tsv")) + } + + ##Meta output table EMC app + treatment <- "untreated" + if(grepl("chx|CHX",prefix)){ treatment <- "chx"} + metadata<-data.frame(colnames(counts_out[,2:ncol(counts_out)]),treatment,"fib","umcu_rnaseq",0,"") + colnames(metadata)<-c("sample_id","treatment","species","experiment_GS","gender","drop") + if(grepl("gene",prefix)){ + write.csv2(metadata,paste0(out_path, "umcu_rnaseq_metadata_genes.csv"), row.names = FALSE, quote=FALSE) + }else{ + write.csv2(metadata,paste0(out_path, "umcu_rnaseq_metadata_exons.csv"), row.names = FALSE, quote=FALSE) + } +} + +save_output <- function(out_path, out_outrider, ref_samples, prefix, query, padj_thres=0.05, zscore_thres=0, a=TRUE) { +# res <- as_tibble(results(out_outrider, padjCutoff=padj_thres, zScoreCutoff=zscore_thres, all=a)) + ##Reference samples are excluded from final results. +# query_res <- filter(res, !(sampleID %in% ref_samples)) + res <- as_tibble(results(out_outrider, all=a)) + query_res <- res + + # Write output table with aberrant expressed targets. + write_tsv(query_res, paste0(out_path, prefix, ".outrider_result.tsv")) +} + +# TODO: investigate memory usage and if possible reduced / parallel. +main <- function(query, ref, output_path, prefix, gtf){ + query_data <- get_input(query) + ref_data <- get_input(ref) + all_counts <- merge_count_tables(query_data$count_tables, ref_data$count_tables) + + output <- run_outrider(all_counts, query, prefix, gtf) + save_output(output_path, output, ref_data$sampleIDs, prefix, query) + + ##Only necessary for Erasmus app, to get the counts, meta and outrider results tables + ct_emcapp <- get_ct_emcapp(query, prefix) + save_count_meta_emcapp(ct_emcapp, all_counts, output_path, prefix) + save_res_emcapp(ct_emcapp, output, prefix, output_path) +} + + +main(args$query, args$ref, args$output_path, args$pref, args$gtf) \ No newline at end of file