From 4b395e1e57791cca9d70933b07a4e03a71369ed7 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Tue, 9 Dec 2025 12:40:22 +0100 Subject: [PATCH 1/8] old FillMissing functions replaced by new ones in preprocessing folder --- DIMS/FillMissing.R | 25 ++++++----------- DIMS/FillMissing.nf | 2 +- DIMS/preprocessing/fill_missing_functions.R | 31 +++++++++++++++++++++ 3 files changed, 40 insertions(+), 18 deletions(-) create mode 100644 DIMS/preprocessing/fill_missing_functions.R diff --git a/DIMS/FillMissing.R b/DIMS/FillMissing.R index a76c163b..32480d38 100755 --- a/DIMS/FillMissing.R +++ b/DIMS/FillMissing.R @@ -1,25 +1,14 @@ -## adapted from 9-runFillMissing.R - # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) peakgrouplist_file <- cmd_args[1] -scripts_dir <- cmd_args[2] +preprocessing_scripts_dir <- cmd_args[2] thresh <- as.numeric(cmd_args[3]) resol <- as.numeric(cmd_args[4]) ppm <- as.numeric(cmd_args[5]) -outdir <- "./" # load in function scripts -source(paste0(scripts_dir, "replace_zeros.R")) -source(paste0(scripts_dir, "fit_optim.R")) -source(paste0(scripts_dir, "get_fwhm.R")) -source(paste0(scripts_dir, "get_stdev.R")) -source(paste0(scripts_dir, "estimate_area.R")) -source(paste0(scripts_dir, "optimize_gaussfit.R")) -source(paste0(scripts_dir, "identify_noisepeaks.R")) -source(paste0(scripts_dir, "get_element_info.R")) -source(paste0(scripts_dir, "atomic_info.R")) +source(paste0(preprocessing_scripts_dir, "fill_missing_functions.R")) # determine scan mode if (grepl("_pos", peakgrouplist_file)) { @@ -33,12 +22,14 @@ pattern_file <- paste0(scanmode, "_repl_pattern.RData") repl_pattern <- get(load(pattern_file)) # load peak group list and determine output file name -outpgrlist_identified <- get(load(peakgrouplist_file)) - -outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file) +peakgroup_list <- get(load(peakgrouplist_file)) # replace missing values (zeros) with random noise -peakgrouplist_filled <- replace_zeros(outpgrlist_identified, repl_pattern, scanmode, resol, outdir, thresh, ppm) +peakgrouplist_filled <- fill_missing_intensities(peakgroup_list, repl_pattern, thresh) + +# set name of output file +outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file) # save output save(peakgrouplist_filled, file = outputfile_name) + diff --git a/DIMS/FillMissing.nf b/DIMS/FillMissing.nf index b0d18505..50227026 100644 --- a/DIMS/FillMissing.nf +++ b/DIMS/FillMissing.nf @@ -13,6 +13,6 @@ process FillMissing { script: """ - Rscript ${baseDir}/CustomModules/DIMS/FillMissing.R $peakgrouplist_file $params.scripts_dir $params.thresh $params.resolution $params.ppm + Rscript ${baseDir}/CustomModules/DIMS/FillMissing.R $peakgrouplist_file $params.preprocessing_scripts_dir $params.thresh """ } diff --git a/DIMS/preprocessing/fill_missing_functions.R b/DIMS/preprocessing/fill_missing_functions.R new file mode 100644 index 00000000..86a70d9a --- /dev/null +++ b/DIMS/preprocessing/fill_missing_functions.R @@ -0,0 +1,31 @@ +fill_missing_intensities <- function(peakgroup_list, repl_pattern, thresh) { + #' Replace intensities that are zero with random value + #' + #' @param peakgroup_list: Peak groups (matrix) + #' @param repl_pattern: Replication pattern (list of strings) + #' @param thresh: Value for threshold between noise and signal (integer) + #' + #' @return final_outlist: peak groups with filled-in intensities (matrix) + + # replace missing intensities with random values around threshold + if (!is.null(peakgroup_list)) { + for (sample_index in 1:length(names(repl_pattern))) { + sample_peaks <- peakgroup_list[, names(repl_pattern)[sample_index]] + zero_intensity <- which(sample_peaks <= 0) + if (!length(zero_intensity)) { + next + } + for (zero_index in 1:length(zero_intensity)) { + peakgroup_list[zero_intensity[zero_index], names(repl_pattern)[sample_index]] <- rnorm(n = 1, + mean = thresh, + sd = 100) + } + } + + # Add column with average intensity; find intensity columns first + int_cols <- which(colnames(peakgroup_list) %in% names(repl_pattern)) + peakgroup_list <- cbind(peakgroup_list, "avg.int" = apply(peakgroup_list[, int_cols], 1, mean)) + + return(peakgroup_list) + } +} From 1f77b2139096190dead1b3a9e0561715ce0c7a70 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Tue, 9 Dec 2025 12:53:43 +0100 Subject: [PATCH 2/8] removed identification of noise peaks --- DIMS/CollectFilled.R | 10 +++++----- DIMS/Utils/calculate_zscores.R | 2 +- DIMS/Utils/merge_duplicate_rows.R | 4 ---- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/DIMS/CollectFilled.R b/DIMS/CollectFilled.R index 4bd408e0..5735bc56 100755 --- a/DIMS/CollectFilled.R +++ b/DIMS/CollectFilled.R @@ -50,11 +50,11 @@ for (scanmode in scanmodes) { # make a copy of the outlist outlist_ident <- outlist_total - # take care of NAs in theormz_noise - outlist_ident$theormz_noise[which(is.na(outlist_ident$theormz_noise))] <- 0 - outlist_ident$theormz_noise <- as.numeric(outlist_ident$theormz_noise) - outlist_ident$theormz_noise[which(is.na(outlist_ident$theormz_noise))] <- 0 - outlist_ident$theormz_noise <- as.numeric(outlist_ident$theormz_noise) + # select identified peak groups if ppm deviation is within limits + if (z_score == 1) { + outlist_ident$ppmdev <- as.numeric(outlist_ident$ppmdev) + outlist_ident <- outlist_ident[which(outlist_ident["ppmdev"] >= -ppm & outlist_ident["ppmdev"] <= ppm), ] + } # Extra output in Excel-readable format: remove_columns <- c("fq.best", "fq.worst", "mzmin.pgrp", "mzmax.pgrp") diff --git a/DIMS/Utils/calculate_zscores.R b/DIMS/Utils/calculate_zscores.R index d687376c..12365e41 100644 --- a/DIMS/Utils/calculate_zscores.R +++ b/DIMS/Utils/calculate_zscores.R @@ -55,7 +55,7 @@ calculate_zscores <- function(peakgroup_list, adducts) { ppmdev[i] <- NA } } - peakgroup_list <- cbind(peakgroup_list[, 1:6], ppmdev = ppmdev, peakgroup_list[, 7:ncol(peakgroup_list)]) + peakgroup_list <- cbind(peakgroup_list[, 1:4], ppmdev = ppmdev, peakgroup_list[, 5:ncol(peakgroup_list)]) } } diff --git a/DIMS/Utils/merge_duplicate_rows.R b/DIMS/Utils/merge_duplicate_rows.R index 901f1160..25afd312 100644 --- a/DIMS/Utils/merge_duplicate_rows.R +++ b/DIMS/Utils/merge_duplicate_rows.R @@ -38,10 +38,6 @@ merge_duplicate_rows <- function(peakgroup_list) { single_peakgroup[, "assi_HMDB"] <- collapse("assi_HMDB", peakgroup_list, peaklist_index) single_peakgroup[, "iso_HMDB"] <- collapse("iso_HMDB", peakgroup_list, peaklist_index) single_peakgroup[, "HMDB_code"] <- collapse("HMDB_code", peakgroup_list, peaklist_index) - single_peakgroup[, "assi_noise"] <- collapse("assi_noise", peakgroup_list, peaklist_index) - if (single_peakgroup[, "assi_noise"] == ";") single_peakgroup[, "assi_noise"] <- NA - single_peakgroup[, "theormz_noise"] <- collapse("theormz_noise", peakgroup_list, peaklist_index) - if (single_peakgroup[, "theormz_noise"] == "0;0") single_peakgroup[, "theormz_noise"] <- NA single_peakgroup[, "all_hmdb_ids"] <- collapse("all_hmdb_ids", peakgroup_list, peaklist_index) single_peakgroup[, "sec_hmdb_ids"] <- collapse("sec_hmdb_ids", peakgroup_list, peaklist_index) if (single_peakgroup[, "sec_hmdb_ids"] == ";") single_peakgroup[, "sec_hmdb_ids"] < NA From d0aff6cf2791fa2b9b1d361d321c6800735ebbbe Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Tue, 9 Dec 2025 12:55:18 +0100 Subject: [PATCH 3/8] added as.data.frame to avoid error of non-numeric argument --- DIMS/GenerateQCOutput.R | 1 + 1 file changed, 1 insertion(+) diff --git a/DIMS/GenerateQCOutput.R b/DIMS/GenerateQCOutput.R index 4e3394d0..bdc14f86 100644 --- a/DIMS/GenerateQCOutput.R +++ b/DIMS/GenerateQCOutput.R @@ -329,6 +329,7 @@ if (length(sst_colnrs) > 0) { sst_list_intensities <- sst_list[, intensity_col_ids] } for (col_nr in seq_len(ncol(sst_list_intensities))) { + sst_list_intensities <- as.data.frame(sst_list_intensities) sst_list_intensities[, col_nr] <- as.numeric(sst_list_intensities[, col_nr]) if (grepl("Zscore", colnames(sst_list_intensities)[col_nr])) { sst_list_intensities[, col_nr] <- round(sst_list_intensities[, col_nr], 2) From 519c5b71b0a46503851bb30707e05ad0fb213263 Mon Sep 17 00:00:00 2001 From: mraves2 Date: Tue, 9 Dec 2025 14:57:39 +0100 Subject: [PATCH 4/8] linting modifications --- DIMS/FillMissing.R | 1 - DIMS/preprocessing/fill_missing_functions.R | 13 +++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/DIMS/FillMissing.R b/DIMS/FillMissing.R index 32480d38..0e8a26c7 100755 --- a/DIMS/FillMissing.R +++ b/DIMS/FillMissing.R @@ -32,4 +32,3 @@ outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file) # save output save(peakgrouplist_filled, file = outputfile_name) - diff --git a/DIMS/preprocessing/fill_missing_functions.R b/DIMS/preprocessing/fill_missing_functions.R index 86a70d9a..e1c455d5 100644 --- a/DIMS/preprocessing/fill_missing_functions.R +++ b/DIMS/preprocessing/fill_missing_functions.R @@ -1,4 +1,4 @@ -fill_missing_intensities <- function(peakgroup_list, repl_pattern, thresh) { +fill_missing_intensities <- function(peakgroup_list, repl_pattern, thresh, not_random = FALSE) { #' Replace intensities that are zero with random value #' #' @param peakgroup_list: Peak groups (matrix) @@ -7,16 +7,21 @@ fill_missing_intensities <- function(peakgroup_list, repl_pattern, thresh) { #' #' @return final_outlist: peak groups with filled-in intensities (matrix) + # for unit test, turn off randomness + if (not_random) { + set.seed(123) + } + # replace missing intensities with random values around threshold if (!is.null(peakgroup_list)) { - for (sample_index in 1:length(names(repl_pattern))) { + for (sample_index in seq_along(names(repl_pattern))) { sample_peaks <- peakgroup_list[, names(repl_pattern)[sample_index]] zero_intensity <- which(sample_peaks <= 0) if (!length(zero_intensity)) { next } - for (zero_index in 1:length(zero_intensity)) { - peakgroup_list[zero_intensity[zero_index], names(repl_pattern)[sample_index]] <- rnorm(n = 1, + for (zero_index in seq_along(zero_intensity)) { + peakgroup_list[zero_intensity[zero_index], names(repl_pattern)[sample_index]] <- rnorm(n = 1, mean = thresh, sd = 100) } From 4bbccd867085bea356a9199de373cf72816025a9 Mon Sep 17 00:00:00 2001 From: mraves2 Date: Tue, 9 Dec 2025 14:58:26 +0100 Subject: [PATCH 5/8] added unit test for FillMissing --- DIMS/tests/testthat/test_fill_missing.R | 45 +++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 DIMS/tests/testthat/test_fill_missing.R diff --git a/DIMS/tests/testthat/test_fill_missing.R b/DIMS/tests/testthat/test_fill_missing.R new file mode 100644 index 00000000..1926b632 --- /dev/null +++ b/DIMS/tests/testthat/test_fill_missing.R @@ -0,0 +1,45 @@ +# unit tests for FillMissing +# function: fill_missing_intensities +source("../../preprocessing/fill_missing_functions.R") + +# test fill_missing_intensities +testthat::test_that("missing values are corretly filled with random values", { + # create peakgroup_list to test on in diagnostics setting + test_peakgroup_list <- data.frame(matrix(NA, nrow = 4, ncol = 23)) + colnames(test_peakgroup_list) <- c("mzmed.pgrp", "nrsamples", "ppmdev", "assi_HMDB", "all_hmdb_names", + "iso_HMDB", "HMDB_code", "all_hmdb_ids", "sec_hmdb_ids", "theormz_HMDB", + "C101.1", "C102.1", "P2.1", "P3.1", + "avg.int", "assi_noise", "theormz_noise", "avg.ctrls", "sd.ctrls", + "C101.1_Zscore", "C102.1_Zscore", "P2.1_Zscore", "P3.1_Zscore") + test_peakgroup_list[, c(1)] <- 300 + runif(4) + test_peakgroup_list[, c(2, 3)] <- runif(8) + test_peakgroup_list[, "HMDB_code"] <- c("HMDB1234567", "HMDB1234567_1", "HMDB1234567_2", "HMDB1234567_7") + test_peakgroup_list[, "all_hmdb_ids"] <- paste(test_peakgroup_list[, "HMDB_code"], + test_peakgroup_list[, "HMDB_code"], sep = ";") + test_peakgroup_list[, "all_hmdb_names"] <- paste(test_peakgroup_list[, "assi_HMDB"], + test_peakgroup_list[, "assi_HMDB"], sep = ";") + test_peakgroup_list[, grep("C", colnames(test_peakgroup_list))] <- 1000 * (1:16) + test_peakgroup_list[, grep("P", colnames(test_peakgroup_list))] <- 0 + test_repl_pattern <- c(list(1), list(2), list(3), list(4)) + names(test_repl_pattern) <- c("C101.1", "C102.1", "P2.1", "P3.1") + test_thresh <- 2000 + + # create a large peak group list to test for negative values + test_large_peakgroup_list <- rbind(test_peakgroup_list, test_peakgroup_list) + for (index in 1:15) { + test_large_peakgroup_list <- rbind(test_large_peakgroup_list, test_large_peakgroup_list) + } + # for the sake of time, leave only one intensity column with zeros + test_large_peakgroup_list$P2.1 <- 1 + + expect_equal(round(fill_missing_intensities(test_peakgroup_list, test_repl_pattern, test_thresh, not_random = TRUE)$P2.1), + c(1944, 1977, 2156, 2007), TRUE, tolerance = 0.1) + # fill_missing_intensities should not produce any negative values, even if a large quantity of numbers are filled in + start.time <- Sys.time() + expect_gt(min(fill_missing_intensities(test_large_peakgroup_list, test_repl_pattern, test_thresh, not_random = FALSE)$P3.1), + 0, TRUE) + end.time <- Sys.time() + time.taken <- end.time - start.time + time.taken + +}) From 7ea64f0ab240490301803482102ca8836f41dabc Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Mon, 22 Dec 2025 13:14:50 +0100 Subject: [PATCH 6/8] removed unused variables --- DIMS/FillMissing.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/DIMS/FillMissing.R b/DIMS/FillMissing.R index 32480d38..27390376 100755 --- a/DIMS/FillMissing.R +++ b/DIMS/FillMissing.R @@ -4,8 +4,6 @@ cmd_args <- commandArgs(trailingOnly = TRUE) peakgrouplist_file <- cmd_args[1] preprocessing_scripts_dir <- cmd_args[2] thresh <- as.numeric(cmd_args[3]) -resol <- as.numeric(cmd_args[4]) -ppm <- as.numeric(cmd_args[5]) # load in function scripts source(paste0(preprocessing_scripts_dir, "fill_missing_functions.R")) From c386ea37877f8f646f1d3cfde8581cfbc3c8b8a1 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Mon, 22 Dec 2025 13:55:58 +0100 Subject: [PATCH 7/8] moved CollectFilled functions to preprocessing folder --- DIMS/CollectFilled.R | 11 ++-- DIMS/CollectFilled.nf | 2 +- DIMS/preprocessing/collect_filled_functions.R | 57 +++++++++++++++++++ 3 files changed, 62 insertions(+), 8 deletions(-) create mode 100644 DIMS/preprocessing/collect_filled_functions.R diff --git a/DIMS/CollectFilled.R b/DIMS/CollectFilled.R index 5735bc56..69f9e04c 100755 --- a/DIMS/CollectFilled.R +++ b/DIMS/CollectFilled.R @@ -1,14 +1,11 @@ -## adapted from 10-collectSamplesFilled.R - # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -scripts_dir <- cmd_args[1] +preprocessing_scripts_dir <- cmd_args[1] ppm <- as.numeric(cmd_args[2]) z_score <- as.numeric(cmd_args[3]) -source(paste0(scripts_dir, "merge_duplicate_rows.R")) -source(paste0(scripts_dir, "calculate_zscores.R")) +source(paste0(preprocessing_scripts_dir, "collect_filled_functions.R")) # for each scan mode, collect all filled peak group lists scanmodes <- c("positive", "negative") @@ -17,7 +14,7 @@ for (scanmode in scanmodes) { filled_files <- list.files("./", full.names = TRUE, pattern = paste0(scanmode, "_identified_filled")) # load files and combine into one object outlist_total <- NULL - for (file_nr in 1:length(filled_files)) { + for (file_nr in seq_along(filled_files)) { peakgrouplist_filled <- get(load(filled_files[file_nr])) outlist_total <- rbind(outlist_total, peakgrouplist_filled) } @@ -47,7 +44,7 @@ for (scanmode in scanmodes) { outlist_stats_more <- cbind(outlist_stats_more, tmp) outlist_total <- outlist_stats_more } - + # make a copy of the outlist outlist_ident <- outlist_total # select identified peak groups if ppm deviation is within limits diff --git a/DIMS/CollectFilled.nf b/DIMS/CollectFilled.nf index 53b19676..b0025286 100644 --- a/DIMS/CollectFilled.nf +++ b/DIMS/CollectFilled.nf @@ -14,6 +14,6 @@ process CollectFilled { script: """ - Rscript ${baseDir}/CustomModules/DIMS/CollectFilled.R $params.scripts_dir $params.ppm $params.zscore + Rscript ${baseDir}/CustomModules/DIMS/CollectFilled.R $params.preprocessing_scripts_dir $params.ppm $params.zscore """ } diff --git a/DIMS/preprocessing/collect_filled_functions.R b/DIMS/preprocessing/collect_filled_functions.R new file mode 100644 index 00000000..2f3ca38b --- /dev/null +++ b/DIMS/preprocessing/collect_filled_functions.R @@ -0,0 +1,57 @@ +merge_duplicate_rows <- function(peakgroup_list) { + #' Merge identification info for peak groups with the same mass + #' + #' @param peakgroup_list: Peak group list (matrix) + #' + #' @return peakgroup_list_dedup: de-duplicated peak group list (matrix) + + collapse <- function(column_label, peakgroup_list, index_dup) { + #' Collapse identification info for peak groups with the same mass + #' + #' @param column_label: Name of column in peakgroup_list (string) + #' @param peakgroup_list: Peak group list (matrix) + #' @param index_dup: Index of duplicate peak group (integer) + #' + #' @return collapsed_items: Semicolon-separated list of info (string) + # get the item(s) that need to be collapsed + list_items <- as.vector(peakgroup_list[index_dup, column_label]) + # remove NA + if (length(which(is.na(list_items))) > 0) list_items <- list_items[-which(is.na(list_items))] + collapsed_items <- paste(list_items, collapse = ";") + return(collapsed_items) + } + + options(digits = 16) + collect <- NULL + remove <- NULL + + # check for peak groups with identical mass + index_dup <- which(duplicated(peakgroup_list[, "mzmed.pgrp"])) + + while (length(index_dup) > 0) { + # get the index for the peak group which is double + peaklist_index <- which(peakgroup_list[, "mzmed.pgrp"] == peakgroup_list[index_dup[1], "mzmed.pgrp"]) + single_peakgroup <- peakgroup_list[peaklist_index[1], , drop = FALSE] + + # use function collapse to concatenate info + single_peakgroup[, "assi_HMDB"] <- collapse("assi_HMDB", peakgroup_list, peaklist_index) + single_peakgroup[, "iso_HMDB"] <- collapse("iso_HMDB", peakgroup_list, peaklist_index) + single_peakgroup[, "HMDB_code"] <- collapse("HMDB_code", peakgroup_list, peaklist_index) + single_peakgroup[, "all_hmdb_ids"] <- collapse("all_hmdb_ids", peakgroup_list, peaklist_index) + single_peakgroup[, "sec_hmdb_ids"] <- collapse("sec_hmdb_ids", peakgroup_list, peaklist_index) + if (single_peakgroup[, "sec_hmdb_ids"] == ";") single_peakgroup[, "sec_hmdb_ids"] < NA + + # keep track of deduplicated entries + collect <- rbind(collect, single_peakgroup) + remove <- c(remove, peaklist_index) + + # remove current entry from index + index_dup <- index_dup[-which(peakgroup_list[index_dup, "mzmed.pgrp"] == peakgroup_list[index_dup[1], "mzmed.pgrp"])] + } + + # remove duplicate entries + if (!is.null(remove)) peakgroup_list <- peakgroup_list[-remove, ] + # append deduplicated entries + peakgroup_list_dedup <- rbind(peakgroup_list, collect) + return(peakgroup_list_dedup) +} From 1ff7e0ee8bec66bac9799d5892fac36c728fb25f Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Mon, 22 Dec 2025 15:05:04 +0100 Subject: [PATCH 8/8] refactored collect_filled_functions --- DIMS/preprocessing/collect_filled_functions.R | 96 +++++++++++++++---- 1 file changed, 79 insertions(+), 17 deletions(-) diff --git a/DIMS/preprocessing/collect_filled_functions.R b/DIMS/preprocessing/collect_filled_functions.R index 2f3ca38b..6741076f 100644 --- a/DIMS/preprocessing/collect_filled_functions.R +++ b/DIMS/preprocessing/collect_filled_functions.R @@ -1,3 +1,23 @@ +# CollectFilled functions + +collapse <- function(column_label, peakgroup_list, index_dup) { + #' Collapse identification info for peak groups with the same mass + #' + #' @param column_label: Name of column in peakgroup_list (string) + #' @param peakgroup_list: Peak group list (matrix) + #' @param index_dup: Index of duplicate peak group (integer) + #' + #' @return collapsed_items: Semicolon-separated list of info (string) + # get the item(s) that need to be collapsed + list_items <- as.vector(peakgroup_list[index_dup, column_label]) + # remove NA + if (length(which(is.na(list_items))) > 0) { + list_items <- list_items[-which(is.na(list_items))] + } + collapsed_items <- paste(list_items, collapse = ";") + return(collapsed_items) +} + merge_duplicate_rows <- function(peakgroup_list) { #' Merge identification info for peak groups with the same mass #' @@ -5,22 +25,6 @@ merge_duplicate_rows <- function(peakgroup_list) { #' #' @return peakgroup_list_dedup: de-duplicated peak group list (matrix) - collapse <- function(column_label, peakgroup_list, index_dup) { - #' Collapse identification info for peak groups with the same mass - #' - #' @param column_label: Name of column in peakgroup_list (string) - #' @param peakgroup_list: Peak group list (matrix) - #' @param index_dup: Index of duplicate peak group (integer) - #' - #' @return collapsed_items: Semicolon-separated list of info (string) - # get the item(s) that need to be collapsed - list_items <- as.vector(peakgroup_list[index_dup, column_label]) - # remove NA - if (length(which(is.na(list_items))) > 0) list_items <- list_items[-which(is.na(list_items))] - collapsed_items <- paste(list_items, collapse = ";") - return(collapsed_items) - } - options(digits = 16) collect <- NULL remove <- NULL @@ -50,8 +54,66 @@ merge_duplicate_rows <- function(peakgroup_list) { } # remove duplicate entries - if (!is.null(remove)) peakgroup_list <- peakgroup_list[-remove, ] + if (!is.null(remove)) { + peakgroup_list <- peakgroup_list[-remove, ] + } # append deduplicated entries peakgroup_list_dedup <- rbind(peakgroup_list, collect) return(peakgroup_list_dedup) } + +calculate_zscores <- function(peakgroup_list) { + #' Calculate Z-scores for peak groups based on average and standard deviation of controls + #' + #' @param peakgroup_list: Peak group list (matrix) + #' @param sort_col: Column to sort on (string) + #' @param adducts: Parameter indicating whether there are adducts in the list (boolean) + #' + #' @return peakgroup_list_dedup: de-duplicated peak group list (matrix) + + case_label <- "P" + control_label <- "C" + # get index for new column names + startcol <- ncol(peakgroup_list) + 3 + + # calculate mean and standard deviation for Control group + ctrl_cols <- grep(control_label, colnames(peakgroup_list), fixed = TRUE) + case_cols <- grep(case_label, colnames(peakgroup_list), fixed = TRUE) + int_cols <- c(ctrl_cols, case_cols) + # set all zeros to NA + peakgroup_list[, int_cols][peakgroup_list[, int_cols] == 0] <- NA + ctrl_ints <- peakgroup_list[, ctrl_cols, drop = FALSE] + peakgroup_list$avg.ctrls <- apply(ctrl_ints, 1, function(x) mean(as.numeric(x), na.rm = TRUE)) + peakgroup_list$sd.ctrls <- apply(ctrl_ints, 1, function(x) sd(as.numeric(x), na.rm = TRUE)) + + # set new column names and calculate Z-scores + colnames_zscores <- NULL + for (col_index in int_cols) { + col_name <- colnames(peakgroup_list)[col_index] + colnames_zscores <- c(colnames_zscores, paste0(col_name, "_Zscore")) + zscores_1col <- (as.numeric(as.vector(unlist(peakgroup_list[, col_index]))) - + peakgroup_list$avg.ctrls) / peakgroup_list$sd.ctrls + peakgroup_list <- cbind(peakgroup_list, zscores_1col) + } + + # apply new column names to columns at end plus avg and sd columns + colnames(peakgroup_list)[startcol:ncol(peakgroup_list)] <- colnames_zscores + + # add ppm deviation column + zscore_cols <- grep("Zscore", colnames(peakgroup_list), fixed = TRUE) + # calculate ppm deviation + for (row_index in seq_len(nrow(peakgroup_list))) { + if (!is.na(peakgroup_list$theormz_HMDB[row_index]) && + !is.null(peakgroup_list$theormz_HMDB[row_index]) && + (peakgroup_list$theormz_HMDB[row_index] != "")) { + peakgroup_list$ppmdev[row_index] <- 10^6 * (as.numeric(as.vector(peakgroup_list$mzmed.pgrp[row_index])) - + as.numeric(as.vector(peakgroup_list$theormz_HMDB[row_index]))) / + as.numeric(as.vector(peakgroup_list$theormz_HMDB[row_index])) + } else { + peakgroup_list$ppmdev[row_index] <- NA + } + } + + return(peakgroup_list) +} +