From 4b395e1e57791cca9d70933b07a4e03a71369ed7 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Tue, 9 Dec 2025 12:40:22 +0100
Subject: [PATCH 1/8] old FillMissing functions replaced by new ones in
 preprocessing folder

---
 DIMS/FillMissing.R                          | 25 ++++++-----------
 DIMS/FillMissing.nf                         |  2 +-
 DIMS/preprocessing/fill_missing_functions.R | 31 +++++++++++++++++++++
 3 files changed, 40 insertions(+), 18 deletions(-)
 create mode 100644 DIMS/preprocessing/fill_missing_functions.R

diff --git a/DIMS/FillMissing.R b/DIMS/FillMissing.R
index a76c163b..32480d38 100755
--- a/DIMS/FillMissing.R
+++ b/DIMS/FillMissing.R
@@ -1,25 +1,14 @@
-## adapted from 9-runFillMissing.R
-
 # define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
 peakgrouplist_file <- cmd_args[1]
-scripts_dir <- cmd_args[2]
+preprocessing_scripts_dir <- cmd_args[2]
 thresh <- as.numeric(cmd_args[3])
 resol <- as.numeric(cmd_args[4])
 ppm <- as.numeric(cmd_args[5])
-outdir <- "./"
 
 # load in function scripts
-source(paste0(scripts_dir, "replace_zeros.R"))
-source(paste0(scripts_dir, "fit_optim.R"))
-source(paste0(scripts_dir, "get_fwhm.R"))
-source(paste0(scripts_dir, "get_stdev.R"))
-source(paste0(scripts_dir, "estimate_area.R"))
-source(paste0(scripts_dir, "optimize_gaussfit.R"))
-source(paste0(scripts_dir, "identify_noisepeaks.R"))
-source(paste0(scripts_dir, "get_element_info.R"))
-source(paste0(scripts_dir, "atomic_info.R"))
+source(paste0(preprocessing_scripts_dir, "fill_missing_functions.R"))
 
 # determine scan mode
 if (grepl("_pos", peakgrouplist_file)) {
@@ -33,12 +22,14 @@ pattern_file <- paste0(scanmode, "_repl_pattern.RData")
 repl_pattern <- get(load(pattern_file))
 
 # load peak group list and determine output file name
-outpgrlist_identified <- get(load(peakgrouplist_file))
-
-outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file)
+peakgroup_list <- get(load(peakgrouplist_file))
 
 # replace missing values (zeros) with random noise
-peakgrouplist_filled <- replace_zeros(outpgrlist_identified, repl_pattern, scanmode, resol, outdir, thresh, ppm)
+peakgrouplist_filled <- fill_missing_intensities(peakgroup_list, repl_pattern, thresh)
+
+# set name of output file
+outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file)
 
 # save output
 save(peakgrouplist_filled, file = outputfile_name)
+
diff --git a/DIMS/FillMissing.nf b/DIMS/FillMissing.nf
index b0d18505..50227026 100644
--- a/DIMS/FillMissing.nf
+++ b/DIMS/FillMissing.nf
@@ -13,6 +13,6 @@ process FillMissing {
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/FillMissing.R $peakgrouplist_file $params.scripts_dir $params.thresh $params.resolution $params.ppm
+        Rscript ${baseDir}/CustomModules/DIMS/FillMissing.R $peakgrouplist_file $params.preprocessing_scripts_dir $params.thresh
         """
 }
diff --git a/DIMS/preprocessing/fill_missing_functions.R b/DIMS/preprocessing/fill_missing_functions.R
new file mode 100644
index 00000000..86a70d9a
--- /dev/null
+++ b/DIMS/preprocessing/fill_missing_functions.R
@@ -0,0 +1,31 @@
+fill_missing_intensities <- function(peakgroup_list, repl_pattern, thresh) {
+  #' Replace intensities that are zero with random value
+  #'
+  #' @param peakgroup_list: Peak groups (matrix)
+  #' @param repl_pattern: Replication pattern (list of strings)
+  #' @param thresh: Value for threshold between noise and signal (integer)
+  #'
+  #' @return final_outlist: peak groups with filled-in intensities (matrix)
+
+  # replace missing intensities with random values around threshold
+  if (!is.null(peakgroup_list)) {
+    for (sample_index in 1:length(names(repl_pattern))) {
+      sample_peaks <- peakgroup_list[, names(repl_pattern)[sample_index]]
+      zero_intensity <- which(sample_peaks <= 0)
+      if (!length(zero_intensity)) {
+        next
+      }
+      for (zero_index in 1:length(zero_intensity)) {
+        peakgroup_list[zero_intensity[zero_index], names(repl_pattern)[sample_index]] <- rnorm(n = 1, 
+                                                                                               mean = thresh,
+                                                                                               sd = 100)
+      }
+    }
+
+    # Add column with average intensity; find intensity columns first
+    int_cols <- which(colnames(peakgroup_list) %in% names(repl_pattern))
+    peakgroup_list <- cbind(peakgroup_list, "avg.int" = apply(peakgroup_list[, int_cols], 1, mean))
+
+    return(peakgroup_list)
+  }
+}

From 1f77b2139096190dead1b3a9e0561715ce0c7a70 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Tue, 9 Dec 2025 12:53:43 +0100
Subject: [PATCH 2/8] removed identification of noise peaks

---
 DIMS/CollectFilled.R              | 10 +++++-----
 DIMS/Utils/calculate_zscores.R    |  2 +-
 DIMS/Utils/merge_duplicate_rows.R |  4 ----
 3 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/DIMS/CollectFilled.R b/DIMS/CollectFilled.R
index 4bd408e0..5735bc56 100755
--- a/DIMS/CollectFilled.R
+++ b/DIMS/CollectFilled.R
@@ -50,11 +50,11 @@ for (scanmode in scanmodes) {
   
   # make a copy of the outlist
   outlist_ident <- outlist_total
-  # take care of NAs in theormz_noise
-  outlist_ident$theormz_noise[which(is.na(outlist_ident$theormz_noise))] <- 0
-  outlist_ident$theormz_noise <- as.numeric(outlist_ident$theormz_noise)
-  outlist_ident$theormz_noise[which(is.na(outlist_ident$theormz_noise))] <- 0
-  outlist_ident$theormz_noise <- as.numeric(outlist_ident$theormz_noise)
+  # select identified peak groups if ppm deviation is within limits
+  if (z_score == 1) {
+    outlist_ident$ppmdev <- as.numeric(outlist_ident$ppmdev)
+    outlist_ident <- outlist_ident[which(outlist_ident["ppmdev"] >= -ppm & outlist_ident["ppmdev"] <= ppm), ]
+  }
 
   # Extra output in Excel-readable format:
   remove_columns <- c("fq.best", "fq.worst", "mzmin.pgrp", "mzmax.pgrp")
diff --git a/DIMS/Utils/calculate_zscores.R b/DIMS/Utils/calculate_zscores.R
index d687376c..12365e41 100644
--- a/DIMS/Utils/calculate_zscores.R
+++ b/DIMS/Utils/calculate_zscores.R
@@ -55,7 +55,7 @@ calculate_zscores <- function(peakgroup_list, adducts) {
           ppmdev[i] <- NA
         }
       }
-      peakgroup_list <- cbind(peakgroup_list[, 1:6], ppmdev = ppmdev, peakgroup_list[, 7:ncol(peakgroup_list)])
+      peakgroup_list <- cbind(peakgroup_list[, 1:4], ppmdev = ppmdev, peakgroup_list[, 5:ncol(peakgroup_list)])
     }
   }
 
diff --git a/DIMS/Utils/merge_duplicate_rows.R b/DIMS/Utils/merge_duplicate_rows.R
index 901f1160..25afd312 100644
--- a/DIMS/Utils/merge_duplicate_rows.R
+++ b/DIMS/Utils/merge_duplicate_rows.R
@@ -38,10 +38,6 @@ merge_duplicate_rows <- function(peakgroup_list) {
     single_peakgroup[, "assi_HMDB"] <- collapse("assi_HMDB", peakgroup_list, peaklist_index)
     single_peakgroup[, "iso_HMDB"] <- collapse("iso_HMDB", peakgroup_list, peaklist_index)
     single_peakgroup[, "HMDB_code"] <- collapse("HMDB_code", peakgroup_list, peaklist_index)
-    single_peakgroup[, "assi_noise"] <- collapse("assi_noise", peakgroup_list, peaklist_index)
-    if (single_peakgroup[, "assi_noise"] == ";") single_peakgroup[, "assi_noise"] <- NA
-    single_peakgroup[, "theormz_noise"] <- collapse("theormz_noise", peakgroup_list, peaklist_index)
-    if (single_peakgroup[, "theormz_noise"] == "0;0") single_peakgroup[, "theormz_noise"] <- NA
     single_peakgroup[, "all_hmdb_ids"] <- collapse("all_hmdb_ids", peakgroup_list, peaklist_index)
     single_peakgroup[, "sec_hmdb_ids"] <- collapse("sec_hmdb_ids", peakgroup_list, peaklist_index)
     if (single_peakgroup[, "sec_hmdb_ids"] == ";") single_peakgroup[, "sec_hmdb_ids"] < NA

From d0aff6cf2791fa2b9b1d361d321c6800735ebbbe Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Tue, 9 Dec 2025 12:55:18 +0100
Subject: [PATCH 3/8] added as.data.frame to avoid error of non-numeric
 argument

---
 DIMS/GenerateQCOutput.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/DIMS/GenerateQCOutput.R b/DIMS/GenerateQCOutput.R
index 4e3394d0..bdc14f86 100644
--- a/DIMS/GenerateQCOutput.R
+++ b/DIMS/GenerateQCOutput.R
@@ -329,6 +329,7 @@ if (length(sst_colnrs) > 0) {
   sst_list_intensities <- sst_list[, intensity_col_ids]
 }
 for (col_nr in seq_len(ncol(sst_list_intensities))) {
+  sst_list_intensities <- as.data.frame(sst_list_intensities)
   sst_list_intensities[, col_nr] <- as.numeric(sst_list_intensities[, col_nr])
   if (grepl("Zscore", colnames(sst_list_intensities)[col_nr])) {
     sst_list_intensities[, col_nr] <- round(sst_list_intensities[, col_nr], 2)

From 519c5b71b0a46503851bb30707e05ad0fb213263 Mon Sep 17 00:00:00 2001
From: mraves2 <M.L.Raves-2@umcutrecht.nl>
Date: Tue, 9 Dec 2025 14:57:39 +0100
Subject: [PATCH 4/8] linting modifications

---
 DIMS/FillMissing.R                          |  1 -
 DIMS/preprocessing/fill_missing_functions.R | 13 +++++++++----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/DIMS/FillMissing.R b/DIMS/FillMissing.R
index 32480d38..0e8a26c7 100755
--- a/DIMS/FillMissing.R
+++ b/DIMS/FillMissing.R
@@ -32,4 +32,3 @@ outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file)
 
 # save output
 save(peakgrouplist_filled, file = outputfile_name)
-
diff --git a/DIMS/preprocessing/fill_missing_functions.R b/DIMS/preprocessing/fill_missing_functions.R
index 86a70d9a..e1c455d5 100644
--- a/DIMS/preprocessing/fill_missing_functions.R
+++ b/DIMS/preprocessing/fill_missing_functions.R
@@ -1,4 +1,4 @@
-fill_missing_intensities <- function(peakgroup_list, repl_pattern, thresh) {
+fill_missing_intensities <- function(peakgroup_list, repl_pattern, thresh, not_random = FALSE) {
   #' Replace intensities that are zero with random value
   #'
   #' @param peakgroup_list: Peak groups (matrix)
@@ -7,16 +7,21 @@ fill_missing_intensities <- function(peakgroup_list, repl_pattern, thresh) {
   #'
   #' @return final_outlist: peak groups with filled-in intensities (matrix)
 
+  # for unit test, turn off randomness
+  if (not_random) {
+    set.seed(123)
+  }
+
   # replace missing intensities with random values around threshold
   if (!is.null(peakgroup_list)) {
-    for (sample_index in 1:length(names(repl_pattern))) {
+    for (sample_index in seq_along(names(repl_pattern))) {
       sample_peaks <- peakgroup_list[, names(repl_pattern)[sample_index]]
       zero_intensity <- which(sample_peaks <= 0)
       if (!length(zero_intensity)) {
         next
       }
-      for (zero_index in 1:length(zero_intensity)) {
-        peakgroup_list[zero_intensity[zero_index], names(repl_pattern)[sample_index]] <- rnorm(n = 1, 
+      for (zero_index in seq_along(zero_intensity)) {
+        peakgroup_list[zero_intensity[zero_index], names(repl_pattern)[sample_index]] <- rnorm(n = 1,
                                                                                                mean = thresh,
                                                                                                sd = 100)
       }

From 4bbccd867085bea356a9199de373cf72816025a9 Mon Sep 17 00:00:00 2001
From: mraves2 <M.L.Raves-2@umcutrecht.nl>
Date: Tue, 9 Dec 2025 14:58:26 +0100
Subject: [PATCH 5/8] added unit test for FillMissing

---
 DIMS/tests/testthat/test_fill_missing.R | 45 +++++++++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 DIMS/tests/testthat/test_fill_missing.R

diff --git a/DIMS/tests/testthat/test_fill_missing.R b/DIMS/tests/testthat/test_fill_missing.R
new file mode 100644
index 00000000..1926b632
--- /dev/null
+++ b/DIMS/tests/testthat/test_fill_missing.R
@@ -0,0 +1,45 @@
+# unit tests for FillMissing
+# function: fill_missing_intensities
+source("../../preprocessing/fill_missing_functions.R")
+
+# test fill_missing_intensities
+testthat::test_that("missing values are corretly filled with random values", {
+  # create peakgroup_list to test on in diagnostics setting
+  test_peakgroup_list <- data.frame(matrix(NA, nrow = 4, ncol = 23))
+  colnames(test_peakgroup_list) <- c("mzmed.pgrp", "nrsamples", "ppmdev", "assi_HMDB", "all_hmdb_names",
+                                     "iso_HMDB", "HMDB_code", "all_hmdb_ids", "sec_hmdb_ids", "theormz_HMDB",
+                                     "C101.1", "C102.1", "P2.1", "P3.1",
+                                     "avg.int", "assi_noise", "theormz_noise", "avg.ctrls", "sd.ctrls",
+                                     "C101.1_Zscore", "C102.1_Zscore", "P2.1_Zscore", "P3.1_Zscore")
+  test_peakgroup_list[, c(1)] <- 300 + runif(4)
+  test_peakgroup_list[, c(2, 3)] <- runif(8)
+  test_peakgroup_list[, "HMDB_code"] <- c("HMDB1234567", "HMDB1234567_1", "HMDB1234567_2", "HMDB1234567_7")
+  test_peakgroup_list[, "all_hmdb_ids"] <- paste(test_peakgroup_list[, "HMDB_code"],
+                                                 test_peakgroup_list[, "HMDB_code"], sep = ";")
+  test_peakgroup_list[, "all_hmdb_names"] <- paste(test_peakgroup_list[, "assi_HMDB"],
+                                                   test_peakgroup_list[, "assi_HMDB"], sep = ";")
+  test_peakgroup_list[, grep("C", colnames(test_peakgroup_list))] <- 1000 * (1:16)
+  test_peakgroup_list[, grep("P", colnames(test_peakgroup_list))] <- 0
+  test_repl_pattern <- c(list(1), list(2), list(3), list(4))
+  names(test_repl_pattern) <- c("C101.1", "C102.1", "P2.1", "P3.1")
+  test_thresh <- 2000
+
+  # create a large peak group list to test for negative values
+  test_large_peakgroup_list <- rbind(test_peakgroup_list, test_peakgroup_list)
+  for (index in 1:15) {
+    test_large_peakgroup_list <- rbind(test_large_peakgroup_list, test_large_peakgroup_list)
+  }
+  # for the sake of time, leave only one intensity column with zeros
+  test_large_peakgroup_list$P2.1 <- 1
+
+  expect_equal(round(fill_missing_intensities(test_peakgroup_list, test_repl_pattern, test_thresh, not_random = TRUE)$P2.1),
+               c(1944, 1977, 2156, 2007), TRUE, tolerance = 0.1)
+  # fill_missing_intensities should not produce any negative values, even if a large quantity of numbers are filled in
+  start.time <- Sys.time()
+  expect_gt(min(fill_missing_intensities(test_large_peakgroup_list, test_repl_pattern, test_thresh, not_random = FALSE)$P3.1),
+            0, TRUE)
+  end.time <- Sys.time()
+  time.taken <- end.time - start.time
+  time.taken
+
+})

From 7ea64f0ab240490301803482102ca8836f41dabc Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Mon, 22 Dec 2025 13:14:50 +0100
Subject: [PATCH 6/8] removed unused variables

---
 DIMS/FillMissing.R | 2 --
 1 file changed, 2 deletions(-)

diff --git a/DIMS/FillMissing.R b/DIMS/FillMissing.R
index 32480d38..27390376 100755
--- a/DIMS/FillMissing.R
+++ b/DIMS/FillMissing.R
@@ -4,8 +4,6 @@ cmd_args <- commandArgs(trailingOnly = TRUE)
 peakgrouplist_file <- cmd_args[1]
 preprocessing_scripts_dir <- cmd_args[2]
 thresh <- as.numeric(cmd_args[3])
-resol <- as.numeric(cmd_args[4])
-ppm <- as.numeric(cmd_args[5])
 
 # load in function scripts
 source(paste0(preprocessing_scripts_dir, "fill_missing_functions.R"))

From c386ea37877f8f646f1d3cfde8581cfbc3c8b8a1 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Mon, 22 Dec 2025 13:55:58 +0100
Subject: [PATCH 7/8] moved CollectFilled functions to preprocessing folder

---
 DIMS/CollectFilled.R                          | 11 ++--
 DIMS/CollectFilled.nf                         |  2 +-
 DIMS/preprocessing/collect_filled_functions.R | 57 +++++++++++++++++++
 3 files changed, 62 insertions(+), 8 deletions(-)
 create mode 100644 DIMS/preprocessing/collect_filled_functions.R

diff --git a/DIMS/CollectFilled.R b/DIMS/CollectFilled.R
index 5735bc56..69f9e04c 100755
--- a/DIMS/CollectFilled.R
+++ b/DIMS/CollectFilled.R
@@ -1,14 +1,11 @@
-## adapted from 10-collectSamplesFilled.R
-
 # define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
-scripts_dir <- cmd_args[1]
+preprocessing_scripts_dir <- cmd_args[1]
 ppm <- as.numeric(cmd_args[2])
 z_score <- as.numeric(cmd_args[3])
 
-source(paste0(scripts_dir, "merge_duplicate_rows.R"))
-source(paste0(scripts_dir, "calculate_zscores.R"))
+source(paste0(preprocessing_scripts_dir, "collect_filled_functions.R"))
 
 # for each scan mode, collect all filled peak group lists
 scanmodes <- c("positive", "negative")
@@ -17,7 +14,7 @@ for (scanmode in scanmodes) {
   filled_files <- list.files("./", full.names = TRUE, pattern = paste0(scanmode, "_identified_filled"))
   # load files and combine into one object
   outlist_total <- NULL
-  for (file_nr in 1:length(filled_files)) {
+  for (file_nr in seq_along(filled_files)) {
     peakgrouplist_filled <- get(load(filled_files[file_nr]))
     outlist_total <- rbind(outlist_total, peakgrouplist_filled)
   }
@@ -47,7 +44,7 @@ for (scanmode in scanmodes) {
     outlist_stats_more <- cbind(outlist_stats_more, tmp)
     outlist_total <- outlist_stats_more
   }
-  
+
   # make a copy of the outlist
   outlist_ident <- outlist_total
   # select identified peak groups if ppm deviation is within limits
diff --git a/DIMS/CollectFilled.nf b/DIMS/CollectFilled.nf
index 53b19676..b0025286 100644
--- a/DIMS/CollectFilled.nf
+++ b/DIMS/CollectFilled.nf
@@ -14,6 +14,6 @@ process CollectFilled {
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/CollectFilled.R $params.scripts_dir $params.ppm $params.zscore
+        Rscript ${baseDir}/CustomModules/DIMS/CollectFilled.R $params.preprocessing_scripts_dir $params.ppm $params.zscore
         """
 }
diff --git a/DIMS/preprocessing/collect_filled_functions.R b/DIMS/preprocessing/collect_filled_functions.R
new file mode 100644
index 00000000..2f3ca38b
--- /dev/null
+++ b/DIMS/preprocessing/collect_filled_functions.R
@@ -0,0 +1,57 @@
+merge_duplicate_rows <- function(peakgroup_list) {
+  #' Merge identification info for peak groups with the same mass
+  #'
+  #' @param peakgroup_list: Peak group list (matrix)
+  #'
+  #' @return peakgroup_list_dedup: de-duplicated peak group list (matrix)
+
+  collapse <- function(column_label, peakgroup_list, index_dup) {
+    #' Collapse identification info for peak groups with the same mass
+    #'
+    #' @param column_label: Name of column in peakgroup_list (string)
+    #' @param peakgroup_list: Peak group list (matrix)
+    #' @param index_dup: Index of duplicate peak group (integer)
+    #'
+    #' @return collapsed_items: Semicolon-separated list of info (string)
+    # get the item(s) that need to be collapsed
+    list_items <- as.vector(peakgroup_list[index_dup, column_label])
+    # remove NA
+    if (length(which(is.na(list_items))) > 0) list_items <- list_items[-which(is.na(list_items))]
+    collapsed_items <- paste(list_items, collapse = ";")
+    return(collapsed_items)
+  }
+
+  options(digits = 16)
+  collect <- NULL
+  remove <- NULL
+
+  # check for peak groups with identical mass
+  index_dup <- which(duplicated(peakgroup_list[, "mzmed.pgrp"]))
+
+  while (length(index_dup) > 0) {
+    # get the index for the peak group which is double
+    peaklist_index <- which(peakgroup_list[, "mzmed.pgrp"] == peakgroup_list[index_dup[1], "mzmed.pgrp"])
+    single_peakgroup <- peakgroup_list[peaklist_index[1], , drop = FALSE]
+
+    # use function collapse to concatenate info
+    single_peakgroup[, "assi_HMDB"] <- collapse("assi_HMDB", peakgroup_list, peaklist_index)
+    single_peakgroup[, "iso_HMDB"] <- collapse("iso_HMDB", peakgroup_list, peaklist_index)
+    single_peakgroup[, "HMDB_code"] <- collapse("HMDB_code", peakgroup_list, peaklist_index)
+    single_peakgroup[, "all_hmdb_ids"] <- collapse("all_hmdb_ids", peakgroup_list, peaklist_index)
+    single_peakgroup[, "sec_hmdb_ids"] <- collapse("sec_hmdb_ids", peakgroup_list, peaklist_index)
+    if (single_peakgroup[, "sec_hmdb_ids"] == ";") single_peakgroup[, "sec_hmdb_ids"] < NA
+
+    # keep track of deduplicated entries
+    collect <- rbind(collect, single_peakgroup)
+    remove <- c(remove, peaklist_index)
+
+    # remove current entry from index
+    index_dup <- index_dup[-which(peakgroup_list[index_dup, "mzmed.pgrp"] == peakgroup_list[index_dup[1], "mzmed.pgrp"])]
+  }
+
+  # remove duplicate entries
+  if (!is.null(remove)) peakgroup_list <- peakgroup_list[-remove, ]
+  # append deduplicated entries
+  peakgroup_list_dedup <- rbind(peakgroup_list, collect)
+  return(peakgroup_list_dedup)
+}

From 1ff7e0ee8bec66bac9799d5892fac36c728fb25f Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Mon, 22 Dec 2025 15:05:04 +0100
Subject: [PATCH 8/8] refactored collect_filled_functions

---
 DIMS/preprocessing/collect_filled_functions.R | 96 +++++++++++++++----
 1 file changed, 79 insertions(+), 17 deletions(-)

diff --git a/DIMS/preprocessing/collect_filled_functions.R b/DIMS/preprocessing/collect_filled_functions.R
index 2f3ca38b..6741076f 100644
--- a/DIMS/preprocessing/collect_filled_functions.R
+++ b/DIMS/preprocessing/collect_filled_functions.R
@@ -1,3 +1,23 @@
+# CollectFilled functions
+
+collapse <- function(column_label, peakgroup_list, index_dup) {
+  #' Collapse identification info for peak groups with the same mass
+  #'
+  #' @param column_label: Name of column in peakgroup_list (string)
+  #' @param peakgroup_list: Peak group list (matrix)
+  #' @param index_dup: Index of duplicate peak group (integer)
+  #'
+  #' @return collapsed_items: Semicolon-separated list of info (string)
+  # get the item(s) that need to be collapsed
+  list_items <- as.vector(peakgroup_list[index_dup, column_label])
+  # remove NA
+  if (length(which(is.na(list_items))) > 0) {
+    list_items <- list_items[-which(is.na(list_items))]
+  }
+  collapsed_items <- paste(list_items, collapse = ";")
+  return(collapsed_items)
+}
+
 merge_duplicate_rows <- function(peakgroup_list) {
   #' Merge identification info for peak groups with the same mass
   #'
@@ -5,22 +25,6 @@ merge_duplicate_rows <- function(peakgroup_list) {
   #'
   #' @return peakgroup_list_dedup: de-duplicated peak group list (matrix)
 
-  collapse <- function(column_label, peakgroup_list, index_dup) {
-    #' Collapse identification info for peak groups with the same mass
-    #'
-    #' @param column_label: Name of column in peakgroup_list (string)
-    #' @param peakgroup_list: Peak group list (matrix)
-    #' @param index_dup: Index of duplicate peak group (integer)
-    #'
-    #' @return collapsed_items: Semicolon-separated list of info (string)
-    # get the item(s) that need to be collapsed
-    list_items <- as.vector(peakgroup_list[index_dup, column_label])
-    # remove NA
-    if (length(which(is.na(list_items))) > 0) list_items <- list_items[-which(is.na(list_items))]
-    collapsed_items <- paste(list_items, collapse = ";")
-    return(collapsed_items)
-  }
-
   options(digits = 16)
   collect <- NULL
   remove <- NULL
@@ -50,8 +54,66 @@ merge_duplicate_rows <- function(peakgroup_list) {
   }
 
   # remove duplicate entries
-  if (!is.null(remove)) peakgroup_list <- peakgroup_list[-remove, ]
+  if (!is.null(remove)) {
+    peakgroup_list <- peakgroup_list[-remove, ]
+  }
   # append deduplicated entries
   peakgroup_list_dedup <- rbind(peakgroup_list, collect)
   return(peakgroup_list_dedup)
 }
+
+calculate_zscores <- function(peakgroup_list) {
+  #' Calculate Z-scores for peak groups based on average and standard deviation of controls
+  #'
+  #' @param peakgroup_list: Peak group list (matrix)
+  #' @param sort_col: Column to sort on (string)
+  #' @param adducts: Parameter indicating whether there are adducts in the list (boolean)
+  #'
+  #' @return peakgroup_list_dedup: de-duplicated peak group list (matrix)
+
+  case_label <- "P"
+  control_label <- "C"
+  # get index for new column names
+  startcol <- ncol(peakgroup_list) + 3
+
+  # calculate mean and standard deviation for Control group
+  ctrl_cols <- grep(control_label, colnames(peakgroup_list), fixed = TRUE)
+  case_cols <- grep(case_label, colnames(peakgroup_list), fixed = TRUE)
+  int_cols <- c(ctrl_cols, case_cols)
+  # set all zeros to NA
+  peakgroup_list[, int_cols][peakgroup_list[, int_cols] == 0] <- NA
+  ctrl_ints <- peakgroup_list[, ctrl_cols, drop = FALSE]
+  peakgroup_list$avg.ctrls <- apply(ctrl_ints, 1, function(x) mean(as.numeric(x), na.rm = TRUE))
+  peakgroup_list$sd.ctrls <- apply(ctrl_ints, 1, function(x) sd(as.numeric(x), na.rm = TRUE))
+
+  # set new column names and calculate Z-scores
+  colnames_zscores <- NULL
+  for (col_index in int_cols) {
+    col_name <- colnames(peakgroup_list)[col_index]
+    colnames_zscores <- c(colnames_zscores, paste0(col_name, "_Zscore"))
+    zscores_1col <- (as.numeric(as.vector(unlist(peakgroup_list[, col_index]))) -
+                     peakgroup_list$avg.ctrls) / peakgroup_list$sd.ctrls
+    peakgroup_list <- cbind(peakgroup_list, zscores_1col)
+  }
+
+  # apply new column names to columns at end plus avg and sd columns
+  colnames(peakgroup_list)[startcol:ncol(peakgroup_list)] <- colnames_zscores
+
+  # add ppm deviation column
+  zscore_cols <- grep("Zscore", colnames(peakgroup_list), fixed = TRUE)
+  # calculate ppm deviation
+  for (row_index in seq_len(nrow(peakgroup_list))) {
+    if (!is.na(peakgroup_list$theormz_HMDB[row_index]) &&
+        !is.null(peakgroup_list$theormz_HMDB[row_index]) &&
+        (peakgroup_list$theormz_HMDB[row_index] != "")) {
+      peakgroup_list$ppmdev[row_index] <- 10^6 * (as.numeric(as.vector(peakgroup_list$mzmed.pgrp[row_index])) -
+                                                  as.numeric(as.vector(peakgroup_list$theormz_HMDB[row_index]))) /
+                                                  as.numeric(as.vector(peakgroup_list$theormz_HMDB[row_index]))
+    } else {
+      peakgroup_list$ppmdev[row_index] <- NA
+    }
+  }
+
+  return(peakgroup_list)
+}
+