diff --git a/.github/workflows/dims_test.yml b/.github/workflows/dims_test.yml index fa2a8aa2..e8c4078c 100644 --- a/.github/workflows/dims_test.yml +++ b/.github/workflows/dims_test.yml @@ -23,7 +23,7 @@ jobs: uses: actions/checkout@v4 - name: Install dependencies - run: Rscript -e "install.packages(c('testthat', 'withr'))" + run: Rscript -e "install.packages(c('testthat', 'withr', 'vdiffr', 'pdftools'))" - name: Run tests run: Rscript tests/testthat.R diff --git a/DIMS/GenerateQCOutput.R b/DIMS/GenerateQCOutput.R index 4e3394d0..1bc3e65c 100644 --- a/DIMS/GenerateQCOutput.R +++ b/DIMS/GenerateQCOutput.R @@ -328,6 +328,7 @@ if (length(sst_colnrs) > 0) { } else { sst_list_intensities <- sst_list[, intensity_col_ids] } +sst_list_intensities <- as.data.frame(sst_list_intensities) for (col_nr in seq_len(ncol(sst_list_intensities))) { sst_list_intensities[, col_nr] <- as.numeric(sst_list_intensities[, col_nr]) if (grepl("Zscore", colnames(sst_list_intensities)[col_nr])) { diff --git a/DIMS/GenerateViolinPlots.R b/DIMS/GenerateViolinPlots.R index 263c7201..38c0a105 100644 --- a/DIMS/GenerateViolinPlots.R +++ b/DIMS/GenerateViolinPlots.R @@ -1,14 +1,3 @@ -# For untargeted metabolomics, this tool calculates probability scores for -# metabolic disorders. In addition, it provides visual support with violin plots -# of the DIMS measurements for the lab specialists. -# Input needed: -# 1. Excel file in which metabolites are listed with their intensities for -# controls (with C in samplename) and patients (with P in samplename) and their -# corresponding Z-scores. -# 2. All files from github: https://github.com/UMCUGenetics/DIMS - -## adapted from 15-dIEM_violin.R - # load packages suppressPackageStartupMessages(library("dplyr")) library(reshape2) @@ -21,462 +10,208 @@ library(stringr) cmd_args <- commandArgs(trailingOnly = TRUE) run_name <- cmd_args[1] -scripts_dir <- cmd_args[2] -z_score <- as.numeric(cmd_args[3]) -path_metabolite_groups <- cmd_args[4] -file_ratios_metabolites <- cmd_args[5] -file_expected_biomarkers_iem <- cmd_args[6] -file_explanation <- cmd_args[7] -file_isomers <- cmd_args[8] - -if (z_score == 1){ - # path: output folder for dIEM and violin plots - output_dir <- "./" - - file.copy(file_isomers, output_dir) - - # load functions - source(paste0(scripts_dir, "check_same_samplename.R")) - source(paste0(scripts_dir, "prepare_data.R")) - source(paste0(scripts_dir, "prepare_data_perpage.R")) - source(paste0(scripts_dir, "prepare_toplist.R")) - source(paste0(scripts_dir, "create_violin_plots.R")) - source(paste0(scripts_dir, "prepare_alarmvalues.R")) - source(paste0(scripts_dir, "output_helix.R")) - source(paste0(scripts_dir, "get_patient_data_to_helix.R")) - source(paste0(scripts_dir, "add_lab_id_and_onderzoeksnummer.R")) - source(paste0(scripts_dir, "is_diagnostic_patient.R")) - - # number of diseases that score highest in algorithm to plot - top_nr_iem <- 5 - # probability score cut-off for plotting the top diseases - threshold_iem <- 5 - # z-score cutoff of axis on the left for top diseases - ratios_cutoff <- -5 - # number of violin plots per page in PDF - nr_plots_perpage <- 20 - - # binary variable: run function, yes(1) or no(0) - if (z_score == 1) { - algorithm <- ratios <- violin <- 1 - } else { - algorithm <- ratios <- violin <- 0 - } - # are the sample names headers on row 1 or row 2 in the DIMS excel? (default 1) - header_row <- 1 - # column name where the data starts (default B) - col_start <- "B" - zscore_cutoff <- 5 - xaxis_cutoff <- 20 - protocol_name <- "DIMS_PL_DIAG" - - #### STEP 1: Preparation #### - # in: run_name, path_dims_file, header_row ||| out: output_dir, DIMS - - # load outlist instead of excel file - load("outlist.RData") - - # save outlist as dims_xls, will be changed during refactor - dims_xls <- outlist - rm(outlist) - - #### STEP 2: Edit DIMS data ##### - # in: dims_xls ||| out: Data, nr_contr, nr_pat - # Input: the xlsx file that comes out of the pipeline with format: - # [plots] [C] [P] [summary columns] [C_Zscore] [P_Zscore] - # Output: "_CSV.csv" file that is suited for the algorithm in shiny. - - # Determine the number of Contols and Patients in column names: - nr_contr <- length(grep("C", names(dims_xls))) / 2 - nr_pat <- length(grep("P", names(dims_xls))) / 2 - # total number of samples - nrsamples <- nr_contr + nr_pat - # check whether the number of intensity columns equals the number of Zscore columns - if (nr_contr + nr_pat != length(grep("_Zscore", names(dims_xls)))) { - cat("\n**** Error: there aren't as many intensities listed as Zscores") - } - cat(paste0("\n\n------------\n", nr_contr, " controls \n", nr_pat, " patients\n------------\n\n")) - - # Move the columns HMDB_code and HMDB_name to the beginning. - hmdb_info_cols <- c(which(colnames(dims_xls) == "HMDB_code"), which(colnames(dims_xls) == "HMDB_name")) - other_cols <- seq_along(1:ncol(dims_xls))[-hmdb_info_cols] - dims_xls_copy <- dims_xls[, c(hmdb_info_cols, other_cols)] - # Remove the columns from 'name' to 'pathway' - from_col <- which(colnames(dims_xls_copy) == "name") - to_col <- which(colnames(dims_xls_copy) == "pathway") - dims_xls_copy <- dims_xls_copy[, -c(from_col:to_col)] - # in case the excel had an empty "plots" column, remove it - if ("plots" %in% colnames(dims_xls_copy)) { - dims_xls_copy <- dims_xls_copy[, -grep("plots", colnames(dims_xls_copy))] - } - # Rename columns - names(dims_xls_copy) <- gsub("avg.ctrls", "Mean_controls", names(dims_xls_copy)) - names(dims_xls_copy) <- gsub("sd.ctrls", "SD_controls", names(dims_xls_copy)) - names(dims_xls_copy) <- gsub("HMDB_code", "HMDB.code", names(dims_xls_copy)) - names(dims_xls_copy) <- gsub("HMDB_name", "HMDB.name", names(dims_xls_copy)) - - # intensity columns and mean and standard deviation of controls - numeric_cols <- c(3:ncol(dims_xls_copy)) - # make sure all values are numeric - dims_xls_copy[, numeric_cols] <- sapply(dims_xls_copy[, numeric_cols], as.numeric) - - if (exists("dims_xls_copy") & (length(dims_xls_copy) < length(dims_xls))) { - cat("\n### Step 2 # Edit dims data is done.\n") - } else { - cat("\n**** Error: Could not execute step 2 \n") - } - - #### STEP 3: Calculate ratios of intensities for metabolites #### - # in: ratios, file_ratios_metabolites, dims_xls_copy, nr_contr, nr_pat ||| out: Zscore (+file) - # This script loads the file with Ratios (file_ratios_metabolites) and calculates - # the ratios of the intensities of the given metabolites. It also calculates - # Zs-cores based on the avg and sd of the ratios of the controls. - - # Input: dataframe with intenstities and Zscores of controls and patients: - # [HMDB.code] [HMDB.name] [C] [P] [Mean_controls] [SD_controls] [C_Zscore] [P_Zscore] - - # Output: "_CSV.csv" file that is suited for the algorithm, with format: - # "_Ratios_CSV.csv" file, same file as above, but with ratio rows added. - - if (ratios == 1) { - cat(paste0("\nloading ratios file:\n -> ", file_ratios_metabolites, "\n")) - ratio_input <- read.csv(file_ratios_metabolites, sep = ";", stringsAsFactors = FALSE) - - # Prepare empty data frame to fill with ratios - ratio_list <- setNames(data.frame(matrix( - ncol = ncol(dims_xls_copy), - nrow = nrow(ratio_input) - )), colnames(dims_xls_copy)) - ratio_list <- as.data.frame(ratio_list) - - # put HMDB info into first two columns of ratio_list - ratio_list[, 1:2] <- ratio_input[, 1:2] - - # look for intensity columns (exclude Zscore columns) - control_cols <- grep("C", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")]) - patient_cols <- grep("P", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")]) - intensity_cols <- c(control_cols, patient_cols) - # calculate each of the ratios of intensities - for (ratio_index in 1:nrow(ratio_input)) { - ratio_numerator <- ratio_input[ratio_index, "HMDB_numerator"] - ratio_numerator <- strsplit(ratio_numerator, "plus")[[1]] - ratio_denominator <- ratio_input[ratio_index, "HMDB_denominator"] - ratio_denominator <- strsplit(ratio_denominator, "plus")[[1]] - # find these HMDB IDs in dataset. Could be a sum of multiple metabolites - sel_denominator <- sel_numerator <- c() - for (numerator_index in 1:length(ratio_numerator)) { - sel_numerator <- c(sel_numerator, which(dims_xls_copy[, "HMDB.code"] == ratio_numerator[numerator_index])) - } - for (denominator_index in 1:length(ratio_denominator)) { - # special case for sum of metabolites (dividing by one) - if (ratio_denominator[denominator_index] != "one") { - sel_denominator <- c(sel_denominator, which(dims_xls_copy[, "HMDB.code"] == ratio_denominator[denominator_index])) - } - } - # calculate ratio - if (ratio_denominator[denominator_index] != "one") { - ratio_list[ratio_index, intensity_cols] <- apply(dims_xls_copy[sel_numerator, intensity_cols], 2, sum) / - apply(dims_xls_copy[sel_denominator, intensity_cols], 2, sum) - } else { - # special case for sum of metabolites (dividing by one) - ratio_list[ratio_index, intensity_cols] <- apply(dims_xls_copy[sel_numerator, intensity_cols], 2, sum) - } - # calculate log of ratio - ratio_list[ratio_index, intensity_cols] <- log2(ratio_list[ratio_index, intensity_cols]) - } - - # Calculate means and SD's of the calculated ratios for Controls - ratio_list[, "Mean_controls"] <- apply(ratio_list[, control_cols], 1, mean) - ratio_list[, "SD_controls"] <- apply(ratio_list[, control_cols], 1, sd) - - # Calc z-scores with the means and SD's of Controls - zscore_cols <- grep("Zscore", colnames(ratio_list)) - for (sample_index in 1:length(zscore_cols)) { - zscore_col <- zscore_cols[sample_index] - # matching intensity column - int_col <- intensity_cols[sample_index] - # test on column names - if (check_same_samplename(colnames(ratio_list)[int_col], colnames(ratio_list)[zscore_col])) { - # calculate Z-scores - ratio_list[, zscore_col] <- (ratio_list[, int_col] - ratio_list[, "Mean_controls"]) / ratio_list[, "SD_controls"] - } +export_scripts_dir <- cmd_args[2] +path_metabolite_groups <- cmd_args[3] +file_ratios_metabolites <- cmd_args[4] +file_expected_biomarkers_iem <- cmd_args[5] +file_explanation <- cmd_args[6] + +# load functions +source(paste0(export_scripts_dir, "generate_violin_plots_functions.R")) +# load dataframe with intensities and Z-scores for all samples +intensities_zscore_df <- get(load("outlist.RData")) +# read input files +ratios_metabs_df <- read.csv(file_ratios_metabolites, sep = ";", stringsAsFactors = FALSE) +expected_biomarkers_df <- read.csv(file_expected_biomarkers_iem, sep = ";", stringsAsFactors = FALSE) +explanation_violin_plot <- readLines(file_explanation) + + +## Set global variables +output_dir <- "./" # path: output folder for dIEM and violin plots +top_number_iem_diseases <- 5 # number of diseases that score highest in algorithm to plot +threshold_iem <- 5 # probability score cut-off for plotting the top diseases +ratios_cutoff <- -5 # z-score cutoff of axis on the left for top diseases +nr_plots_perpage <- 20 # number of violin plots per page in PDF +zscore_cutoff <- 5 +xaxis_cutoff <- 20 +protocol_name <- "DIMS_PL_DIAG" + +# Remove columns, move HMDB_code & HMDB_name column to the front, change intensity columns to numeric +intensities_zscore_df <- intensities_zscore_df %>% + select(-c(plots, HMDB_name_all, HMDB_ID_all, sec_HMDB_ID, HMDB_key, sec_HMBD_ID_rlvnc, name, + relevance, descr, origin, fluids, tissue, disease, pathway, nr_ctrls)) %>% + relocate(c(HMDB_code, HMDB_name)) %>% + rename(mean_controls = avg_ctrls, sd_controls = sd_ctrls) %>% + mutate(across(!c(HMDB_name, HMDB_code), as.numeric)) + +# Get the controls and patient IDs, select the intensity columns +controls <- colnames(intensities_zscore_df)[grepl("^C", colnames(intensities_zscore_df)) & + !grepl("_Zscore$", colnames(intensities_zscore_df))] +control_intensities_cols_index <- which(colnames(intensities_zscore_df) %in% controls) +nr_of_controls <- length(controls) + +patients <- colnames(intensities_zscore_df)[grepl("^P", colnames(intensities_zscore_df)) & + !grepl("_Zscore$", colnames(intensities_zscore_df))] +patient_intensities_cols_index <- which(colnames(intensities_zscore_df) %in% patients) +nr_of_patients <- length(patients) + +intensity_cols_index <- c(control_intensities_cols_index, patient_intensities_cols_index) +intensity_cols <- colnames(intensities_zscore_df)[intensity_cols_index] + +#### Calculate ratios of intensities for metabolites #### +# Prepare empty data frame to fill with ratios +ratio_zscore_df <- data.frame(matrix( + ncol = ncol(intensities_zscore_df), + nrow = nrow(ratios_metabs_df) +)) +colnames(ratio_zscore_df) <- colnames(intensities_zscore_df) + +# put HMDB info into first two columns of ratio_zscore_df +ratio_zscore_df$HMDB_code <- ratios_metabs_df$HMDB.code +ratio_zscore_df$HMDB_name <- ratios_metabs_df$Ratio_name + +for (row_index in seq_len(nrow(ratios_metabs_df))) { + numerator_intensities <- get_intentities_for_ratios(ratios_metabs_df, row_index, + intensities_zscore_df, "HMDB_numerator", intensity_cols) + denominator_intensities <- get_intentities_for_ratios(ratios_metabs_df, row_index, + intensities_zscore_df, "HMDB_denominator", intensity_cols) + # calculate intensity ratios + ratio_zscore_df[row_index, intensity_cols_index] <- log2(numerator_intensities / denominator_intensities) +} +# Calculate means and SD's of the calculated ratios for Controls +ratio_zscore_df[, "mean_controls"] <- apply(ratio_zscore_df[, control_intensities_cols_index], 1, mean) +ratio_zscore_df[, "sd_controls"] <- apply(ratio_zscore_df[, control_intensities_cols_index], 1, sd) + +# Calculate Zscores for the ratios +samples_zscore_columns <- get_zscore_columns(colnames(intensities_zscore_df), intensity_cols) +ratio_zscore_df[, samples_zscore_columns] <- (ratio_zscore_df[, intensity_cols] - ratio_zscore_df[, "mean_controls"]) / + ratio_zscore_df[, "sd_controls"] + +intensities_zscore_ratios_df <- rbind(intensities_zscore_df, ratio_zscore_df) + +# for debugging: +save(intensities_zscore_ratios_df, file = paste0(output_dir, "/outlist_with_ratios.RData")) + +# Select only the cols with zscores of the patients +zscore_patients_df <- intensities_zscore_ratios_df %>% select(HMDB_code, HMDB_name, any_of(paste0(patients, "_Zscore"))) +zscore_controls_df <- intensities_zscore_ratios_df %>% select(HMDB_code, HMDB_name, any_of(paste0(controls, "_Zscore"))) + +#### Make violin plots ##### +# preparation +colnames(zscore_patients_df) <- gsub("_Zscore", "", colnames(zscore_patients_df)) +colnames(zscore_controls_df) <- gsub("_Zscore", "", colnames(zscore_controls_df)) + +expected_biomarkers_df <- expected_biomarkers_df %>% rename(HMDB_code = HMDB.code, HMDB_name = Metabolite) + +expected_biomarkers_info <- expected_biomarkers_df %>% + select(c(Disease, HMDB_code, HMDB_name)) %>% + distinct(Disease, HMDB_code, .keep_all = TRUE) + +metabolite_dirs <- list.files(path = path_metabolite_groups, full.names = FALSE, recursive = FALSE) +for (metabolite_dir in metabolite_dirs) { + # create a directory for the output PDFs + pdf_dir <- paste(output_dir, metabolite_dir, sep = "/") + dir.create(pdf_dir, showWarnings = FALSE) + + metab_list_all <- get_list_metabolites(paste(path_metabolite_groups, metabolite_dir, sep = "/")) + + # prepare list of metabolites; max nr_plots_perpage on one page + metab_interest_sorted <- combine_metab_info_zscores(metab_list_all, zscore_patients_df) + metab_interest_controls <- combine_metab_info_zscores(metab_list_all, zscore_controls_df) + metab_perpage <- prepare_data_perpage(metab_interest_sorted, metab_interest_controls, + nr_plots_perpage, nr_of_patients, nr_of_controls) + + # for Diagnostics metabolites to be saved in Helix + if (grepl("Diagnost", pdf_dir)) { + # get table that combines DIMS results with stofgroepen/Helix table + dims_helix_table <- get_patient_data_to_helix(metab_interest_sorted, metab_list_all) + + # check if run contains Diagnostics patients (e.g. "P2024M"), not for research runs + if (any(is_diagnostic_patient(dims_helix_table$Sample))) { + # get output file for Helix + output_helix <- output_for_helix(protocol_name, dims_helix_table) + # write output to file + path_helixfile <- paste0(output_dir, "output_Helix_", run_name, ".csv") + write.csv(output_helix, path_helixfile, quote = FALSE, row.names = FALSE) } - - # Add rows of the ratio hmdb codes to the data of zscores from the pipeline. - dims_xls_ratios <- rbind(ratio_list, dims_xls_copy) - - # Edit the DIMS output Zscores of all patients in format: - # HMDB_code patientname1 patientname2 - names(dims_xls_ratios) <- gsub("HMDB.code", "HMDB_code", names(dims_xls_ratios)) - names(dims_xls_ratios) <- gsub("HMDB.name", "HMDB_name", names(dims_xls_ratios)) - - # for debugging: - write.table(dims_xls_ratios, file = paste0(output_dir, "/ratios.txt"), sep = "\t") - - # Select only the cols with zscores of the patients - zscore_patients <- dims_xls_ratios[, c(1, 2, zscore_cols[grep("P", colnames(dims_xls_ratios)[zscore_cols])])] - # Select only the cols with zscores of the controls - zscore_controls <- dims_xls_ratios[, c(1, 2, zscore_cols[grep("C", colnames(dims_xls_ratios)[zscore_cols])])] - } - #### STEP 4: Run the IEM algorithm ######### - # in: algorithm, file_expected_biomarkers_iem, zscore_patients ||| out: prob_score (+file) - # algorithm taken from DOI: 10.3390/ijms21030979 - - if (algorithm == 1) { - # Load data - cat(paste0("\nloading expected file:\n -> ", file_expected_biomarkers_iem, "\n")) - expected_biomarkers <- read.csv(file_expected_biomarkers_iem, sep = ";", stringsAsFactors = FALSE) - # modify column names - names(expected_biomarkers) <- gsub("HMDB.code", "HMDB_code", names(expected_biomarkers)) - names(expected_biomarkers) <- gsub("Metabolite", "HMDB_name", names(expected_biomarkers)) - - # prepare dataframe scaffold rank_patients - rank_patients <- zscore_patients - # Fill df rank_patients with the ranks for each patient - for (patient_index in 3:ncol(zscore_patients)) { - # number of positive zscores in patient - pos <- sum(zscore_patients[, patient_index] > 0) - # sort the column on zscore; NB: this sorts the entire object, not just one column - rank_patients <- rank_patients[order(-rank_patients[patient_index]), ] - # Rank all positive zscores highest to lowest - rank_patients[1:pos, patient_index] <- as.numeric(ordered(-rank_patients[1:pos, patient_index])) - # Rank all negative zscores lowest to highest - rank_patients[(pos + 1):nrow(rank_patients), patient_index] <- as.numeric(ordered(rank_patients[(pos + 1): - nrow(rank_patients), patient_index])) - } - - # Calculate metabolite score, using the dataframes with only values, and later add the cols without values (1&2). - expected_zscores <- merge(x = expected_biomarkers, y = zscore_patients, by.x = c("HMDB_code"), by.y = c("HMDB_code")) - expected_zscores_original <- expected_zscores - - # determine which columns contain Z-scores and which contain disease info - select_zscore_cols <- grep("_Zscore", colnames(expected_zscores)) - select_info_cols <- 1:(min(select_zscore_cols) - 1) - # set some zscores to zero - select_incr_indisp <- which(expected_zscores$Change == "Increase" & expected_zscores$Dispensability == "Indispensable") - expected_zscores[select_incr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_incr_indisp, - select_zscore_cols], function(x) ifelse (x <= 1.6, 0, x)) - select_decr_indisp <- which(expected_zscores$Change == "Decrease" & expected_zscores$Dispensability == "Indispensable") - expected_zscores[select_decr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_decr_indisp, - select_zscore_cols], function(x) ifelse (x >= -1.2, 0, x)) - - # calculate rank score: - expected_ranks <- merge(x = expected_biomarkers, y = rank_patients, by.x = c("HMDB_code"), by.y = c("HMDB_code")) - rank_scores <- expected_zscores[order(expected_zscores$HMDB_code), select_zscore_cols] / - (expected_ranks[order(expected_ranks$HMDB_code), select_zscore_cols] * 0.9) - # combine disease info with rank scores - expected_metabscore <- cbind(expected_ranks[order(expected_zscores$HMDB_code), select_info_cols], rank_scores) - - # multiply weight score and rank score - weight_score <- expected_zscores - weight_score[, select_zscore_cols] <- expected_metabscore$Total_Weight * expected_metabscore[, select_zscore_cols] - - # sort table on Disease and Absolute_Weight - weight_score <- weight_score[order(weight_score$Disease, weight_score$Absolute_Weight, decreasing = TRUE), ] - - # select columns to check duplicates - dup <- weight_score[, c("Disease", "M.z")] - uni <- weight_score[!duplicated(dup) | !duplicated(dup, fromLast = FALSE), ] - - # calculate probability score - prob_score <- aggregate(uni[, select_zscore_cols], uni["Disease"], sum) - - # list of all diseases that have at least one metabolite Zscore at 0 - for (patient_index in 2:ncol(prob_score)) { - patient_zscore_colname <- colnames(prob_score)[patient_index] - matching_colname_expected <- which(colnames(expected_zscores) == patient_zscore_colname) - # determine which Zscores are 0 for this patient - zscores_zero <- which(expected_zscores[, matching_colname_expected] == 0) - # get Disease for these - disease_zero <- unique(expected_zscores[zscores_zero, "Disease"]) - # set the probability score of these diseases to 0 - prob_score[which(prob_score$Disease %in% disease_zero), patient_index] <- 0 - } - - # determine disease rank per patient - disease_rank <- prob_score - # rank diseases in decreasing order - disease_rank[2:ncol(disease_rank)] <- lapply(2:ncol(disease_rank), function(x) - as.numeric(ordered(-disease_rank[1:nrow(disease_rank), x]))) - # modify column names, Zscores have now been converted to probability scores - colnames(prob_score) <- gsub("_Zscore", "_prob_score", colnames(prob_score)) - colnames(disease_rank) <- gsub("_Zscore", "", colnames(disease_rank)) - - # Create conditional formatting for output Excel sheet. Colors according to values. - wb <- createWorkbook() - addWorksheet(wb, "Probability Scores") - writeData(wb, "Probability Scores", prob_score) - conditionalFormatting(wb, "Probability Scores", cols = 2:ncol(prob_score), rows = 1:nrow(prob_score), - type = "colourScale", style = c("white", "#FFFDA2", "red"), rule = c(1, 10, 100)) - saveWorkbook(wb, file = paste0(output_dir, "/dIEM_algoritme_output_", run_name, ".xlsx"), overwrite = TRUE) - # check whether prob_score df exists and has expected dimensions. - if (exists("expected_biomarkers") & (length(disease_rank) == length(prob_score))) { - cat("\n### Step 4 # Running the IEM algorithm is done.\n\n") + # make violin plots per patient + for (patient_id in patients) { + # for category Diagnostics, make list of metabolites that exceed alarm values for this patient + # for category Other, make list of top highest and lowest Z-scores for this patient + if (grepl("Diagnost", pdf_dir)) { + top_metabs_patient <- prepare_alarmvalues(patient_id, dims_helix_table) } else { - cat("\n**** Error: Could not run IEM algorithm. Check if path to expected_biomarkers csv-file is correct. \n") + top_metabs_patient <- prepare_toplist(patient_id, zscore_patients_df) } - rm(wb) + # generate normal violin plots + create_pdf_violin_plots(pdf_dir, patient_id, metab_perpage, top_metabs_patient, explanation_violin_plot) } - #### STEP 5: Make violin plots ##### - # in: algorithm / zscore_patients, violin, nr_contr, nr_pat, Data, path_textfiles, zscore_cutoff, xaxis_cutoff, - # top_diseases, top_metab, output_dir ||| out: pdf file, Helix csv file - - if (violin == 1) { - - # preparation - zscore_patients_copy <- zscore_patients - colnames(zscore_patients) <- gsub("_Zscore", "", colnames(zscore_patients)) - colnames(zscore_controls) <- gsub("_Zscore", "", colnames(zscore_controls)) - - # Make patient list for violin plots - patient_list <- names(zscore_patients)[-c(1, 2)] - - # from table expected_biomarkers, choose selected columns - select_columns <- c("Disease", "HMDB_code", "HMDB_name") - #select_col_nrs <- which(colnames(expected_biomarkers) %in% select_columns) - expected_biomarkers_select <- expected_biomarkers %>% select(all_of(select_columns)) - # remove duplicates - expected_biomarkers_select <- expected_biomarkers_select[!duplicated(expected_biomarkers_select[, c(1, 2)]), ] - - # load file with explanatory information to be included in PDF. - explanation <- readLines(file_explanation) - - # first step: normal violin plots - # Find all text files in the given folder, which contain metabolite lists of which - # each file will be a page in the pdf with violin plots. - # Make a PDF file for each of the categories in metabolite_dirs - metabolite_dirs <- list.files(path = path_metabolite_groups, full.names = FALSE, recursive = FALSE) - for (metabolite_dir in metabolite_dirs) { - # create a directory for the output PDFs - pdf_dir <- paste(output_dir, metabolite_dir, sep = "/") - dir.create(pdf_dir, showWarnings = FALSE) - cat("making plots in category:", metabolite_dir, "\n") - - # get a list of all metabolite files - metabolite_files <- list.files(path = paste(path_metabolite_groups, metabolite_dir, sep = "/"), - pattern = "*.txt", full.names = FALSE, recursive = FALSE) - # put all metabolites into one list - metab_list_all <- list() - metab_list_names <- c() - cat("making plots from the input files:") - # open the text files and add each to a list of dataframes (metab_list_all) - for (file_index in seq_along(metabolite_files)) { - infile <- metabolite_files[file_index] - metab_list <- read.table(paste(path_metabolite_groups, metabolite_dir, infile, sep = "/"), - sep = "\t", header = TRUE, quote = "") - # put into list of all lists - metab_list_all[[file_index]] <- metab_list - metab_list_names <- c(metab_list_names, strsplit(infile, ".txt")[[1]][1]) - cat(paste0("\n", infile)) - } - # include list of classes in metabolite list - names(metab_list_all) <- metab_list_names - - # prepare list of metabolites; max nr_plots_perpage on one page - metab_interest_sorted <- prepare_data(metab_list_all, zscore_patients) - metab_interest_controls <- prepare_data(metab_list_all, zscore_controls) - metab_perpage <- prepare_data_perpage(metab_interest_sorted, metab_interest_controls, nr_plots_perpage, nr_pat, nr_contr) - - # for Diagnostics metabolites to be saved in Helix - if(grepl("Diagnost", pdf_dir)) { - # get table that combines DIMS results with stofgroepen/Helix table - dims_helix_table <- get_patient_data_to_helix(metab_interest_sorted, metab_list_all) - - # check if run contains Diagnostics patients (e.g. "P2024M"), not for research runs - if(any(is_diagnostic_patient(dims_helix_table$Patient))){ - # get output file for Helix - output_helix <- output_for_helix(protocol_name, dims_helix_table) - # write output to file - path_helixfile <- paste0(output_dir, "/output_Helix_", run_name,".csv") - write.csv(output_helix, path_helixfile, quote = F, row.names = F) - } - } - - # make violin plots per patient - for (pt_nr in 1:length(patient_list)) { - pt_name <- patient_list[pt_nr] - # for category Diagnostics, make list of metabolites that exceed alarm values for this patient - # for category Other, make list of top highest and lowest Z-scores for this patient - if (grepl("Diagnost", pdf_dir)) { - top_metab_pt <- prepare_alarmvalues(pt_name, dims_helix_table) - } else { - top_metab_pt <- prepare_toplist(pt_name, zscore_patients) - } - - # generate normal violin plots - create_violin_plots(pdf_dir, pt_name, metab_perpage, top_metab_pt) - - } - - } - - # Second step: dIEM plots in separate directory - diem_plot_dir <- paste(output_dir, "dIEM_plots", sep = "/") - dir.create(diem_plot_dir) - - # Select the metabolites that are associated with the top highest scoring IEM, for each patient - # disease_rank is from step 4: the dIEM algorithm. The lower the value, the more likely. - for (pt_nr in 1:length(patient_list)) { - pt_name <- patient_list[pt_nr] - # get top diseases for this patient - pt_colnr <- which(colnames(disease_rank) == pt_name) - pt_top_indices <- which(disease_rank[, pt_colnr] <= top_nr_iem) - pt_iems <- disease_rank[pt_top_indices, "Disease"] - pt_top_iems <- pt_prob_score_top_iems <- c() - for (single_iem in pt_iems) { - # get the probability score - prob_score_iem <- prob_score[which(prob_score$Disease == single_iem), pt_colnr] - # use only diseases for which probability score is above threshold - if (prob_score_iem >= threshold_iem) { - pt_top_iems <- c(pt_top_iems, single_iem) - pt_prob_score_top_iems <- c(pt_prob_score_top_iems, prob_score_iem) - } - } - - # prepare data for plotting dIEM violin plots - # If prob_score_top_iem is an empty list, don't make a plot - if (length(pt_top_iems) > 0) { - # Sorting from high to low, both prob_score_top_iems and pt_top_iems. - pt_prob_score_order <- order(-pt_prob_score_top_iems) - pt_prob_score_top_iems <- round(pt_prob_score_top_iems, 1) - pt_prob_score_top_iem_sorted <- pt_prob_score_top_iems[pt_prob_score_order] - pt_top_iem_sorted <- pt_top_iems[pt_prob_score_order] - # getting metabolites for each top_iem disease exactly like in metab_list_all - metab_iem_all <- list() - metab_iem_names <- c() - for (single_iem_index in 1:length(pt_top_iem_sorted)) { - single_iem <- pt_top_iem_sorted[single_iem_index] - single_prob_score <- pt_prob_score_top_iem_sorted[single_iem_index] - select_rows <- which(expected_biomarkers_select$Disease == single_iem) - metab_list <- expected_biomarkers_select[select_rows, ] - metab_iem_names <- c(metab_iem_names, paste0(single_iem, ", probability score ", single_prob_score)) - metab_list <- metab_list[, -1] - metab_iem_all[[single_iem_index]] <- metab_list - } - # put all metabolites into one list - names(metab_iem_all) <- metab_iem_names - - # get Zscore information from zscore_patients_copy, similar to normal violin plots - metab_iem_sorted <- prepare_data(metab_iem_all, zscore_patients_copy) - metab_iem_controls <- prepare_data(metab_iem_all, zscore_controls) - # make sure every page has 20 metabolites - diem_metab_perpage <- prepare_data_perpage(metab_iem_sorted, metab_iem_controls, nr_plots_perpage, nr_pat) - # add table of metabolites with increased or decreased Z-scores - top_metab_pt <- prepare_toplist(pt_name, zscore_patients) - - # generate dIEM violin plots - create_violin_plots(diem_plot_dir, pt_name, diem_metab_perpage, top_metab_pt) - - } else { - cat(paste0("\n\n**** This patient had no prob_scores higher than ", threshold_iem, ". - Therefore, this pdf was not made:\t ", pt_name, "_iem \n")) - } +} +#### Run the IEM algorithm ######### +diem_probability_score <- run_diem_algorithm(expected_biomarkers_df, zscore_patients_df, patients) + +save_prob_scores_to_excel(diem_probability_score, output_dir, run_name) + + +#### Generate dIEM plots ######### +diem_plot_dir <- paste(output_dir, "dIEM_plots", sep = "/") +dir.create(diem_plot_dir) + +colnames(diem_probability_score) <- gsub("_Zscore", "", colnames(diem_probability_score)) +patient_no_iem <- c() + +for (patient_id in patients) { + # Select the top IEMs and filter on the IEM threshold + patient_top_iems_probs <- diem_probability_score %>% + select(c(Disease, !!sym(patient_id))) %>% + arrange(desc(!!sym(patient_id))) %>% + slice(1:top_number_iem_diseases) %>% + filter(!!sym(patient_id) >= threshold_iem) + + if (nrow(patient_top_iems_probs) > 0) { + top_iems <- patient_top_iems_probs %>% pull(Disease) + # Get the metabolites for each IEM and their probability + metabs_iems <- list() + metabs_iems_names <- c() + for (iem in top_iems) { + iem_probablity <- patient_top_iems_probs %>% filter(Disease == iem) %>% pull(!!sym(patient_id)) + metabs_iems_names <- c(metabs_iems_names, paste0(iem, ", probability score ", iem_probablity)) + metab_iem_df <- expected_biomarkers_df %>% filter(Disease == iem) %>% select(HMDB_code, HMDB_name) + metabs_iems[[iem]] <- metab_iem_df } + # metabs_iems <- lapply(top_iems, function(iem) { + # iem_probablity <- patient_top_iems_probs %>% filter(Disease == iem) %>% pull(!!sym(patient_id)) + # metabs_iems_names <- c(metabs_iems_names, paste0(iem, ", probability score ", iem_probablity)) + # metab_iem <- expected_biomarkers_df %>% filter(Disease == iem) %>% select(HMDB_code, HMDB_name) + # return(metab_iem) + # }) + # names(metabs_iems) <- metabs_iems_names + + # Get the Z-scores with metabolite information + metab_iem_sorted <- combine_metab_info_zscores(metabs_iems, zscore_patients_df) + metab_iem_controls <- combine_metab_info_zscores(metabs_iems, zscore_controls_df) + # Get a list of dataframes for each IEM + diem_metab_perpage <- prepare_data_perpage(metab_iem_sorted, metab_iem_controls, + nr_plots_perpage, nr_of_patients, nr_of_controls) + # Get a dataframe of the top metabolites + top_metabs_patient <- prepare_toplist(patient_id, zscore_patients_df) + + # Generate and save dIEM violin plots + create_pdf_violin_plots(diem_plot_dir, patient_id, diem_metab_perpage, top_metabs_patient, explanation_violin_plot) + } else { + patient_no_iem <- c(patient_no_iem, patient_id) } } + +if (length(patient_no_iem) > 0) { + patient_no_iem <- c(paste0("The following patient(s) did not have dIEM probability scores higher than ", + threshold_iem, " :"), + patient_no_iem) + write(file = paste0(output_dir, "missing_probability_scores.txt"), patient_no_iem) +} diff --git a/DIMS/GenerateViolinPlots.nf b/DIMS/GenerateViolinPlots.nf index 1c4b532d..ec65a2e9 100755 --- a/DIMS/GenerateViolinPlots.nf +++ b/DIMS/GenerateViolinPlots.nf @@ -18,11 +18,10 @@ process GenerateViolinPlots { script: """ - Rscript ${baseDir}/CustomModules/DIMS/GenerateViolinPlots.R $analysis_id $params.scripts_dir $params.zscore \ + Rscript ${baseDir}/CustomModules/DIMS/GenerateViolinPlots.R $analysis_id $params.export_scripts_dir \ $params.path_metabolite_groups \ $params.file_ratios_metabolites \ $params.file_expected_biomarkers_IEM \ - $params.file_explanation \ - $params.file_isomers + $params.file_explanation """ } diff --git a/DIMS/export/generate_violin_plots_functions.R b/DIMS/export/generate_violin_plots_functions.R new file mode 100644 index 00000000..0ae7edff --- /dev/null +++ b/DIMS/export/generate_violin_plots_functions.R @@ -0,0 +1,571 @@ +#' Getting the intensities for calculating ratio Z-scores +#' +#' @param ratios_metabs_df: dataframe with HMDB codes for the ratios (dataframe) +#' @param row_index: index of the row in the ratios_metabs_df (integer) +#' @param intensities_zscore_df: dataframe with intensities for each sample (dataframe) +#' @param fraction_side: either numerator or denominator, which side of the fraction (string) +#' @param intensity_cols: names of the columns that contain the intensities (string) +#' +#' @returns fraction_side_intensity: a vector of intensities (vector of integers) +get_intentities_for_ratios <- function(ratios_metabs_df, row_index, intensities_zscore_df, fraction_side, intensity_cols) { + fraction_side_hmdb_ids <- ratios_metabs_df[row_index, fraction_side] + if (grepl("plus", fraction_side_hmdb_ids)) { + fraction_side_hmdb_id_list <- strsplit(fraction_side_hmdb_ids, "plus")[[1]] + fraction_side_intensity_list <- intensities_zscore_df %>% + filter(HMDB_code %in% fraction_side_hmdb_id_list) %>% + select(any_of(intensity_cols)) + fraction_side_intensity <- apply(fraction_side_intensity_list, 2, sum) + } else if (fraction_side_hmdb_ids == "one") { + fraction_side_intensity <- 1 + } else { + fraction_side_intensity <- intensities_zscore_df %>% + filter(HMDB_code == fraction_side_hmdb_ids) %>% + select(any_of(intensity_cols)) + } + return(fraction_side_intensity) +} + +#' Get the sample IDs for columns that have Z-score and intensities +#' +#' @param colnames_zscore: vector of sample IDs from the dataframe containing Z-scores (vector of strings) +#' @param intensity_cols: vector of sample IDs form the dataframe containing intensities (vector of strings) +#' +#' @returns: vector of sample IDs that are in both input vectors (vector of strings) +get_zscore_columns <- function(colnames_zscore, intensity_cols) { + sample_intersect <- intersect(paste0(intensity_cols, "_Zscore"), grep("_Zscore", colnames_zscore, value = TRUE)) + return(sample_intersect) +} + +#' Get a list with dataframes for all off the metabolite group in a directory +#' +#' @param metab_group_dir: directory containing txt files with metabolites per group (string) +#' +#' @returns: list with dataframes with info on metabolites (list of dataframes) +get_list_metabolites <- function(metab_group_dir) { + # get a list of all metabolite files + metabolite_files <- list.files(metab_group_dir, pattern = "*.txt", full.names = FALSE, recursive = FALSE) + # put all metabolites into one list + metab_list_all <- lapply(paste(metab_group_dir, metabolite_files, sep = "/"), + read.table, sep = "\t", header = TRUE, quote = "") + names(metab_list_all) <- gsub(".txt", "", metabolite_files) + + return(metab_list_all) +} + +#' Combine patient Z-scores with metabolite info +#' +#' @param metab_list_all: list of dataframes with metabolite information for different stofgroepen (list) +#' @param zscore_df: dataframe with metabolite Z-scores for all patient +#' +#' @return: list of dataframes for each stofgroep with data for each metabolite and patient/control per row +combine_metab_info_zscores <- function(metab_list_all, zscore_df) { + # remove HMDB_name column and "_Zscore" from column (patient) names + zscore_df <- zscore_df %>% + select(-HMDB_name) %>% + rename_with(~ str_remove(.x, "_Zscore"), .cols = contains("_Zscore")) + + # put data into pages, max 20 violin plots per page in PDF + metab_interest_sorted <- list() + + for (metab_class in names(metab_list_all)) { + metab_df <- metab_list_all[[metab_class]] + # Select HMDB_code and HMDB_name columns + metab_df <- metab_df %>% select(HMDB_code, HMDB_name) + + # Change the HMDB_name column so all names have 45 characters + metab_df <- metab_df %>% mutate(HMDB_name = case_when( + str_length(HMDB_name) > 45 ~ str_c(str_sub(HMDB_name, 1, 42), "..."), + str_length(HMDB_name) < 45 ~ str_pad(HMDB_name, 45, side = "right", pad = " "), + TRUE ~ HMDB_name + )) + + # Join metabolite info with the Z-score dataframe + metab_interest <- metab_df %>% inner_join(zscore_df, by = "HMDB_code") %>% select(-HMDB_code) + + # put the data frame in long format + metab_interest_melt <- reshape2::melt(metab_interest, id.vars = "HMDB_name", variable.name = "Sample", + value.name = "Z_score") + # Add the dataframe sorted on HMDB_name to a list + metab_interest_sorted[[metab_class]] <- metab_interest_melt + } + + return(metab_interest_sorted) +} + +#' Combine patient and control data for each page of the violinplot pdf +#' +#' @param metab_interest_sorted: list of dataframes with data for each metabolite and patient (list) +#' @param metab_interest_contr: list of dataframes with data for each metabolite and control (list) +#' @param nr_plots_perpage: number of plots per page in the violinplot pdf (integer) +#' @param nr_pat: number of patients (integer) +#' @param nr_contr: number of controls (integer) +#' +#' @return: list of dataframes with metabolite Z-scores for each patient and control, +#' the length of list is the number of pages for the violinplot pdf (list) +prepare_data_perpage <- function(metab_interest_sorted, metab_interest_contr, nr_plots_perpage, nr_pat, nr_contr) { + metab_perpage <- list() + metab_category <- c() + + for (metab_class in names(metab_interest_sorted)) { + # Get the data for patients and controls for the metab_interest_sorted list + metab_sort_patients_df <- metab_interest_sorted[[metab_class]] + metab_sort_controls_df <- metab_interest_contr[[metab_class]] + + # Calculate the number of pages + nr_pages <- ceiling(length(unique(metab_sort_patients_df$HMDB_name)) / nr_plots_perpage) + + # Get all metabolites and create list with HMDB naames of max nr_plots_perpage long + metabolites <- unique(metab_sort_patients_df$HMDB_name) + metabolites_in_chunks <- split(metabolites, ceiling(seq_along(metabolites) / nr_plots_perpage)) + nr_chunks <- length(metabolites_in_chunks) + + current_perpage <- lapply(metabolites_in_chunks, function(metab_name) { + patients_df <- metab_sort_patients_df %>% filter(HMDB_name %in% metab_name) + controls_df <- metab_sort_controls_df %>% filter(HMDB_name %in% metab_name) + + # Combine both dataframes + combined_df <- rbind(patients_df, controls_df) + + # Add empty dummy's to extend the number of metabs to the nr_plots_perpage + n_missing <- nr_plots_perpage - length(metab_name) + if (n_missing > 0) { + dummy_names <- paste0(" ", strrep(" ", seq_len(n_missing))) + metab_order <- c(metab_name, dummy_names) + } else { + metab_order <- metab_name + } + attr(combined_df, "y_order") <- rev(metab_order) + + return(combined_df) + }) + # Add new items to main list + metab_perpage <- append(metab_perpage, current_perpage) + # create list of page headers + metab_category <- c(metab_category, paste(metab_class, seq(nr_chunks), sep = "_")) + } + # add page headers to list + names(metab_perpage) <- metab_category + + return(metab_perpage) +} + +#' Get patient data to be uploaded to Helix +#' +#' @param metab_interest_sorted: list of dataframes with metabolite Z-scores for each sample/patient (list) +#' @param metab_list_all: list of tables with metabolites for Helix and violin plots (list) +#' +#' @return: dataframe with patient data with only metabolites for Helix and violin plots +#' with Helix name, high/low Z-score cutoffs +get_patient_data_to_helix <- function(metab_interest_sorted, metab_list_all) { + # Combine Z-scores of metab groups together + df_all_metabs_zscores <- bind_rows(metab_interest_sorted) + + # Change the Sample column to characters, trim HMDB_name and split HMDB_name in new column + df_all_metabs_zscores <- df_all_metabs_zscores %>% + mutate(Sample = as.character(Sample), + HMDB_name = str_trim(HMDB_name, "right"), + HMDB_name_split = str_split_fixed(HMDB_name, "nitine;", 2)[, 1]) + + # Combine stofgroepen + dims_helix_table <- bind_rows(metab_list_all) + + # Filter for Helix metabolites and split HMDB_name column for matching with df_all_metabs_zscores + dims_helix_table <- dims_helix_table %>% + filter(Helix == "ja") %>% + mutate(HMDB_name_split = str_split_fixed(HMDB_name, "nitine;", 2)[, 1]) %>% + select(HMDB_name_split, Helix_naam, high_zscore, low_zscore) + + # Filter DIMS results for metabolites for Helix and combine Helix info + df_metabs_helix <- df_all_metabs_zscores %>% + filter(HMDB_name_split %in% dims_helix_table$HMDB_name_split) %>% + left_join(dims_helix_table, by = join_by(HMDB_name_split)) %>% + select(HMDB_name, Sample, Z_score, Helix_naam, high_zscore, low_zscore) + + return(df_metabs_helix) +} + +#' Check for Diagnostics patients with correct patient number (e.g. starting with "P2024M") +#' +#' @param patient_column: a column from dataframe with IDs (character vector) +#' +#' @return: a logical vector with TRUE or FALSE for each element (vector) +is_diagnostic_patient <- function(patient_column) { + diagnostic_patients <- grepl("^P[0-9]{4}M", patient_column) + + return(diagnostic_patients) +} + +#' Get the output dataframe for Helix +#' +#' @param protocol_name: protocol name (string) +#' @param df_metabs_helix: dataframe with metabolite Z-scores for patients (dataframe) +#' +#' @return: dataframe with patient metabolite Z-scores in correct format for Helix +output_for_helix <- function(protocol_name, df_metabs_helix) { + # Remove positive controls + df_metabs_helix <- df_metabs_helix %>% filter(is_diagnostic_patient(Sample)) + + # Add 'Vial' column, each patient has unique ID + df_metabs_helix <- df_metabs_helix %>% + group_by(Sample) %>% + mutate(Vial = cur_group_id()) %>% + ungroup() + + # Split patient number into labnummer and Onderzoeksnummer + df_metabs_helix <- add_lab_id_and_onderzoeksnr(df_metabs_helix) + + # Add column with protocol name + df_metabs_helix$Protocol <- protocol_name + + # Change name Z_score and Helix_naam columns to Amount and Name + change_columns <- c(Amount = "Z_score", Name = "Helix_naam") + df_metabs_helix <- df_metabs_helix %>% rename(all_of(change_columns)) + + # Select only necessary columns and set them in correct order + df_metabs_helix <- df_metabs_helix %>% + select(c(Vial, labnummer, Onderzoeksnummer, Protocol, Name, Amount)) + + # Remove duplicate patient-metabolite combinations ("leucine + isoleucine + allo-isoleucin_Z-score" is added 3 times) + df_metabs_helix <- df_metabs_helix %>% + group_by(Onderzoeksnummer, Name) %>% + distinct() %>% + ungroup() + + return(df_metabs_helix) +} + +#' Adding labnummer and Onderzoeksnummer to a dataframe +#' +#' @param df_metabs_helix: dataframe with patient data to be uploaded to Helix +#' +#' @return: dataframe with added labnummer and Onderzoeksnummer columns +add_lab_id_and_onderzoeksnr <- function(df_metabs_helix) { + # Split patient number into labnummer and Onderzoeksnummer + for (row in seq_len(nrow(df_metabs_helix))) { + df_metabs_helix[row, "labnummer"] <- gsub("^P|\\.[0-9]*", "", df_metabs_helix[row, "Sample"]) + labnummer_split <- strsplit(as.character(df_metabs_helix[row, "labnummer"]), "M")[[1]] + df_metabs_helix[row, "Onderzoeksnummer"] <- paste0("MB", labnummer_split[1], "/", labnummer_split[2]) + } + return(df_metabs_helix) +} + +#' Create a dataframe with all metabolites that exceed the min and max Z-score cutoffs +#' +#' @param patient_name: patient code (string) +#' @param dims_helix_table: dataframe with metabolite Z-scores for each patient and Helix info (dataframe) +#' +#' @return: dataframe with metabolites that exceed the min and max Z-score cutoffs for the selected patient +prepare_alarmvalues <- function(patient_name, dims_helix_table) { + # extract data for patient of interest (patient_name) + patient_metabs_helix <- dims_helix_table %>% + filter(Sample == patient_name) %>% + mutate(Z_score = round(Z_score, 2)) + + patient_high_df <- patient_metabs_helix %>% filter(Z_score > high_zscore) + patient_low_df <- patient_metabs_helix %>% filter(Z_score < low_zscore) + + if (nrow(patient_high_df) > 0 || nrow(patient_low_df) > 0) { + # sort tables on zscore + patient_high_df <- patient_high_df %>% arrange(desc(Z_score)) %>% select(c(HMDB_name, Z_score)) + patient_low_df <- patient_low_df %>% arrange(Z_score) %>% select(c(HMDB_name, Z_score)) + } + # add lines for increased, decreased + extra_line1 <- c("Increased", "") + extra_line2 <- c("Decreased", "") + + # combine the two lists + top_metab_patient <- rbind(extra_line1, patient_high_df, extra_line2, patient_low_df) + + # remove row names + rownames(top_metab_patient) <- NULL + # change column names for display + colnames(top_metab_patient) <- c("Metabolite", "Z-score") + + return(top_metab_patient) +} + +#' Create a dataframe with the top 20 highest and top 10 lowest metabolites per patient +#' +#' @param pt_name: patient code (string) +#' @param zscore_patients: dataframe with metabolite Z-scores per patient (dataframe) +#' @param top_highest: the number of metabolites with the highest Z-score to display in the table (numeric) +#' @param top_lowest: the number of metabolites with the lowest Z-score to display in the table (numeric) +#' +#' @return: dataframe with 30 metabolites and Z-scores (dataframe) +prepare_toplist <- function(patient_id, zscore_patients) { + top_highest <- 20 + top_lowest <- 10 + patient_df <- zscore_patients %>% + select(HMDB_code, HMDB_name, !!sym(patient_id)) %>% + arrange(!!sym(patient_id)) + + # Get lowest Zscores + patient_df_low <- patient_df[1:top_lowest, ] + patient_df_low <- patient_df_low %>% mutate(across(!!sym(patient_id), ~ round(.x, 2))) + + # Get highest Zscores + patient_df_high <- patient_df[nrow(patient_df):(nrow(patient_df) - top_highest + 1), ] + patient_df_high <- patient_df_high %>% mutate(across(!!sym(patient_id), ~ round(.x, 2))) + + # add lines for increased, decreased + extra_line1 <- c("Increased", "", "") + extra_line2 <- c("Decreased", "", "") + top_metab_pt <- rbind(extra_line1, patient_df_high, extra_line2, patient_df_low) + # remove row names + rownames(top_metab_pt) <- NULL + + # change column names for display + colnames(top_metab_pt) <- c("HMDB_ID", "Metabolite", "Z-score") + + return(top_metab_pt) +} + +#' Create a pdf with table with metabolites and violin plots +#' +#' @param pdf_dir: location where to save the pdf file (string) +#' @param patient_id: patient id (string) +#' @param metab_perpage: list of dataframes, each dataframe contains data for a page in de pdf (list) +#' @param top_metab_pt: dataframe with increased and decreased metabolites for this patient (dataframe) +#' @param explanation: text that explains the violin plots and the pipeline version (string) +create_pdf_violin_plots <- function(pdf_dir, patient_id, metab_perpage, top_metab_pt, explanation) { + # set parameters for plots + plot_height <- 9.6 + plot_width <- 6 + + # patient plots, create the PDF device + patient_id_sub <- patient_id + suffix <- "" + if (grepl("Diagnostics", pdf_dir) && is_diagnostic_patient(patient_id)) { + prefix <- "MB" + suffix <- "_DIMS_PL_DIAG" + # substitute P and M in P2020M00001 into right format for Helix + patient_id_sub <- gsub("[PM]", "", patient_id) + patient_id_sub <- gsub("\\..*", "", patient_id_sub) + } else if (grepl("Diagnostics", pdf_dir)) { + prefix <- "Dx_" + } else if (grepl("IEM", pdf_dir)) { + prefix <- "IEM_" + } else { + prefix <- "R_" + } + + pdf(paste0(pdf_dir, "/", prefix, patient_id_sub, suffix, ".pdf"), + onefile = TRUE, + width = plot_width, + height = plot_height) + + # page headers: + page_headers <- names(metab_perpage) + + # put table into PDF file, if not empty + if (!is.null(dim(top_metab_pt))) { + max_rows_per_page <- 35 + total_rows <- nrow(top_metab_pt) + number_of_pages <- ceiling(total_rows / max_rows_per_page) + + # get the names and numbers in the table aligned + table_theme <- ttheme_default(core = list(fg_params = list(hjust = 0, x = 0.05, fontsize = 6)), + colhead = list(fg_params = list(fontsize = 8, fontface = "bold"))) + + for (page in seq(number_of_pages)) { + start_row <- (page - 1) * max_rows_per_page + 1 + end_row <- min(page * max_rows_per_page, total_rows) + page_data <- top_metab_pt[start_row:end_row, ] + + table_grob <- tableGrob(page_data, theme = table_theme, rows = NULL) + + grid.arrange( + table_grob, + top = paste0("Top deviating metabolites for patient: ", patient_id) + ) + } + } + + # violin plots + for (metab_class in names(metab_perpage)) { + # extract list of metabolites to plot on a page + metab_zscores_df <- metab_perpage[[metab_class]] + # extract original data for patient of interest (pt_name) before cut-offs + patient_zscore_df <- metab_zscores_df %>% filter(Sample == patient_id) + + # Remove patient column and change Z-score. If under -5 to -5 and if above 20 to 20. + metab_zscores_df <- metab_zscores_df %>% + filter(Sample != patient_id) %>% + mutate(Z_score = pmin(pmax(Z_score, -5), 20)) + + # subtitle per page + sub_perpage <- gsub("_", " ", metab_class) + # for IEM plots, put subtitle on two lines + sub_perpage <- gsub("probability", "\nprobability", sub_perpage) + + # draw violin plot. + ggplot_object <- create_violin_plot(metab_zscores_df, patient_zscore_df, sub_perpage, patient_id) + + suppressWarnings(print(ggplot_object)) + } + + # add explanation of violin plots, version number etc. + plot(NA, xlim = c(0, 5), ylim = c(0, 5), bty = "n", xaxt = "n", yaxt = "n", xlab = "", ylab = "") + if (length(explanation) > 0) { + text(0.2, 5, explanation[1], pos = 4, cex = 0.8) + for (line_index in 2:length(explanation)) { + text_y_position <- 5 - (line_index * 0.2) + text(-0.2, text_y_position, explanation[line_index], pos = 4, cex = 0.5) + } + } + + # close the PDF file + dev.off() +} + +#' Create violin plots +#' +#' @param metab_zscores_df: dataframe with Z-scores for all samples (dataframe) +#' @param patient_zscore_df: dataframe with Z-scores for the specified patient (dataframe) +#' @param sub_perpage: subtitle of the page (string) +#' @param patient_id: the patient id of the selected patient (string) +#' +#' @returns ggpplot_object: a violin plot of metabolites that highlights the selected patient (ggplot object) +create_violin_plot <- function(metab_zscores_df, patient_zscore_df, sub_perpage, patient_id) { + fontsize <- 1 + circlesize <- 0.8 + # Set colors for the violinplot: green, blue, blue/purple, purple, orange, red + colors_plot <- c("#22E4AC", "#00B0F0", "#504FFF", "#A704FD", "#F36265", "#DA0641") + + y_order <- attr(metab_zscores_df, "y_order") + metab_zscores_df$HMDB_name <- rev(factor(metab_zscores_df$HMDB_name, levels = rev(y_order))) + patient_zscore_df$HMDB_name <- rev(factor(patient_zscore_df$HMDB_name, levels = rev(y_order))) + + ggplot_object <- ggplot(metab_zscores_df, aes(x = Z_score, y = HMDB_name)) + + # Make violin plots + geom_violin(scale = "width", na.rm = TRUE) + + # Add Z-score for the selected patient, shape=22 gives square for patient of interest + geom_point(data = patient_zscore_df, aes(color = Z_score), + size = 3.5 * circlesize, shape = 22, fill = "white", na.rm = TRUE) + + # Add the Z-score at the right side of the plot + geom_text(data = patient_zscore_df, + aes(16, label = paste0("Z=", round(Z_score, 2))), + hjust = "left", vjust = +0.2, size = 3, na.rm = TRUE) + + # Set colour for the Z-score of the selected patient + scale_fill_gradientn(colors = colors_plot, values = NULL, space = "Lab", + na.value = "grey50", guide = "colourbar", aesthetics = "colour") + + # Add labels to the axis + labs(x = "Z-scores", y = "Metabolites", subtitle = sub_perpage, color = "z-score") + + # Add a title to the page + ggtitle(label = paste0("Results for patient ", patient_id)) + + # Set theme: size and font type of y-axis labels, remove legend and make the + theme(axis.text.y = element_text(family = "Courier", size = 6), + legend.position = "none", + plot.caption = element_text(size = rel(fontsize))) + + # Set y-axis to set order + scale_y_discrete(limits = y_order) + + # Limit the x-axis to between -5 and 20 + xlim(-5, 20) + + # Set grey vertical lines at -2 and 2 + geom_vline(xintercept = c(-2, 2), col = "grey", lwd = 0.5, lty = 2) + + + return(ggplot_object) +} + +#' Run the dIEM algorithm (DOI: 10.3390/ijms21030979) +#' +#' @param expected_biomarkers_df: table with information for HMDB codes about IEMs (dataframe) +#' @param zscore_patients: dataframe containing Z-scores for patient (dataframe) +#' +#' @returns probability_score: a dataframe with probability scores for IEMs for each patient (dataframe) +run_diem_algorithm <- function(expected_biomarkers_df, zscore_patients_df, sample_cols) { + # Rank the metabolites for each patient individually + ranking_patients <- zscore_patients_df %>% + mutate(across(-c(HMDB_code, HMDB_name), rank_patient_zscores)) + + ranking_patients <- merge(x = expected_biomarkers_df, y = ranking_patients, + by.x = c("HMDB_code"), by.y = c("HMDB_code")) + + zscore_expected_df <- merge(x = expected_biomarkers_df, y = zscore_patients_df, + by.x = c("HMDB_code"), by.y = c("HMDB_code")) + + # Change Z-score to zero for specific cases + zscore_expected_df <- zscore_expected_df %>% mutate(across( + all_of(sample_cols), + ~ case_when( + Change == "Increase" & Dispensability == "Indispensable" & .x <= 1.6 ~ 0, + Change == "Decrease" & Dispensability == "Indispensable" & .x >= -1.2 ~ 0, + TRUE ~ .x + ) + )) + + # Sort both dataframes on HMDB_code for calculating the metabolite score + zscore_expected_df <- zscore_expected_df[order(zscore_expected_df$HMDB_code), ] + ranking_patients <- ranking_patients[order(ranking_patients$HMDB_code), ] + + # Set up dataframe for the metabolite score, copy zscore_expected_df for biomarker info + metabolite_score_info <- zscore_expected_df + # Calculate metabolite score: Z-score/(Rank * 0.9) + metabolite_score_info[sample_cols] <- zscore_expected_df[sample_cols] / (ranking_patients[sample_cols] * 0.9) + + # Calculate the weighted score: metabolite_score * Total_Weight + metabolite_weight_score <- metabolite_score_info %>% + mutate(across( + all_of(sample_cols), + ~ .x * Total_Weight + )) %>% + arrange(desc(Disease), desc(Absolute_Weight)) + + # Calculate the probability score for each disease - Mz combination + probability_score <- metabolite_weight_score %>% + filter(!duplicated(select(., Disease, M.z)) | + !duplicated(select(., Disease, M.z), fromLast = FALSE)) %>% + group_by(Disease) %>% + summarise(across(all_of(sample_cols), sum), .groups = "drop") + + # Set probability score to 0 for Z-scores == 0 + for (sample_col in sample_cols) { + # Get indexes of Zscore that equal 0 + zscores_zero_idx <- which(zscore_expected_df[[sample_col]] == 0) + # Get diseases that have a Zscore of 0 + diseases_zero <- unique(zscore_expected_df[zscores_zero_idx, "Disease"]) + # Set probabilty of these diseases to 0 + probability_score[probability_score$Disease %in% diseases_zero, sample_col] <- 0 + } + + colnames(probability_score) <- gsub("_Zscore", "_prob_score", colnames(probability_score)) + + return(probability_score) +} + +#' Ranking Z-scores for a patient, separate for positive and negative Z-scores +#' +#' @param zscore_col: vector with Z-scores for a single patient (vector of integers) +#' +#' @returns ranking: a vector of the ranking of the Z-scores (vector of integers) +rank_patient_zscores <- function(zscore_col) { + # Create ranking column with default NA values + ranking <- rep(NA_real_, length(zscore_col)) + + # Get indexes for negative and positive rows + neg_indexes <- which(zscore_col <= 0) + pos_indexes <- which(zscore_col > 0) + + # Rank the negative and positive Zscores + ranking[neg_indexes] <- dense_rank(zscore_col[neg_indexes]) + ranking[pos_indexes] <- dense_rank(-zscore_col[pos_indexes]) + + return(ranking) +} + +#' Save the probability score dataframe as an Excel file +#' +#' @param probability_score: a dataframe containing probability scores for each patient (dataframe) +#' @param output_dir: location where to save the Excel file (string) +#' @param run_name: name of the run, for the file name (string) +save_prob_scores_to_excel <- function(probability_score, output_dir, run_name) { + # Create conditional formatting for output Excel sheet. Colors according to values. + wb <- createWorkbook() + addWorksheet(wb, "Probability Scores") + writeData(wb, "Probability Scores", probability_score) + conditionalFormatting(wb, "Probability Scores", cols = 2:ncol(probability_score), rows = seq_len(nrow(probability_score)), + type = "colourScale", style = c("white", "#FFFDA2", "red"), rule = c(1, 10, 100)) + saveWorkbook(wb, file = paste0(output_dir, "/dIEM_algoritme_output_", run_name, ".xlsx"), overwrite = TRUE) + rm(wb) +} diff --git a/DIMS/tests/testthat.R b/DIMS/tests/testthat.R index 3f13a069..6e5a4962 100644 --- a/DIMS/tests/testthat.R +++ b/DIMS/tests/testthat.R @@ -1,6 +1,8 @@ # Run all unit tests library(testthat) library(withr) +library(vdiffr) +library(pdftools) # enable snapshots local_edition(3) diff --git a/DIMS/tests/testthat/_snaps/generate_violin_plots.md b/DIMS/tests/testthat/_snaps/generate_violin_plots.md new file mode 100644 index 00000000..5df67e7a --- /dev/null +++ b/DIMS/tests/testthat/_snaps/generate_violin_plots.md @@ -0,0 +1,21 @@ +# Create a pdf with a table of top metabolites and violin plots + + Code + content_pdf_violinplots + Output + [1] "Top deviating metabolites for patient: P2025M1\n Metabolite Z.score\n Increased\n metab1 2.45\n Decreased\n metab11 −1.51\n" + [2] " Results for patient P2025M1\n test acyl carnitines\n metab1 Z=2.34\nMetabolites\n metab3 Z=0.31\n −5 0 5 10 15 20\n Z−scores\n" + [3] " Results for patient P2025M1\n test crea gua\n metab4 Z=−0.46\nMetabolites\n metab11 Z=0.84\n −5 0 5 10 15 20\n Z−scores\n" + [4] " Unit test Generate Violin Plots\nUnit test Generate Violin Plots\n" + +# Saving the probability score dataframe as an Excel file + + Disease P2025M1 P2025M2 P2025M3 P2025M4 + 1 Disease A 10.900 -10.9 49.90 -49.9 + 2 Disease B 0.953 0.0 2.29 0.0 + 3 Disease C 12.100 0.0 0.00 12.1 + 4 Disease D 0.000 -12.5 0.00 18.2 + 5 Disease E 44.300 0.0 0.00 28.1 + 6 Disease F 0.000 -77.4 -77.40 0.0 + 7 Disease G -38.700 38.7 38.70 -38.7 + diff --git a/DIMS/tests/testthat/_snaps/generate_violin_plots/violin-plot-p2025m1.svg b/DIMS/tests/testthat/_snaps/generate_violin_plots/violin-plot-p2025m1.svg new file mode 100644 index 00000000..fee3d1d9 --- /dev/null +++ b/DIMS/tests/testthat/_snaps/generate_violin_plots/violin-plot-p2025m1.svg @@ -0,0 +1,74 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Z=0.31 +Z=2.34 + + + + +metab3 +metab1 + + + + + + + + +-5 +0 +5 +10 +15 +20 +Z-scores +Metabolites +test acyl carnitines +Results for patient P2025M1 + + diff --git a/DIMS/tests/testthat/fixtures/make_test_data_GenerateViolinPlots.R b/DIMS/tests/testthat/fixtures/make_test_data_GenerateViolinPlots.R new file mode 100644 index 00000000..0664e00b --- /dev/null +++ b/DIMS/tests/testthat/fixtures/make_test_data_GenerateViolinPlots.R @@ -0,0 +1,290 @@ +### Functions used to create mock dataframes used for unit testing of GenerateViolinPlots ### + +make_intensities_zscore_df <- function() { + test_intensities_zscore_df <- data.frame( + HMDB_code = c("HMDB001", "HMDB002", "HMDB003", "HMDB004", "HMDB011", "HMDB012"), + HMDB_name = c("metab1", "metab2", "metab3", "metab4", "metab5", "metab6"), + C101.1 = c(1000, 1200, 1300, 1400, 1500, 1600), + C102.1 = c(1100, 1700, 925, 1125, 1200, 1050), + C103.1 = c(1300, 750, 1000, 1220, 1100, 1200), + C104.1 = c(1650, 925, 1600, 1650, 1025, 1150), + C105.1 = c(180000, 1950, 750, 15050, 1100, 1300), + P2025M1 = c(1000, 1200, 1300, 1400, 1100, 975), + P2025M2 = c(1100, 1700, 925, 1125, 1050, 1175), + P2025M3 = c(1300, 750, 1000, 1220, 975, 1100), + P2025M4 = c(1650, 925, 1600, 1650, 1700, 1750), + P2025M5 = c(180000, 1950, 750, 15050, 10000, 1500), + mean_control = c(37010, 1305, 1115, 4089, 1185, 1260), + sd_control = c(79934.23, 508.80, 336.15, 6130.65, 186.75, 210.36), + C101.1_Zscore = c(0.45, 1.67, -1.86, 0.58, 2.47, -0.56), + C102.1_Zscore = c(2.89, 0.79, -1.88, 5.46, -0.68, 1.65), + C103.1_Zscore = c(0.54, -0.85, 1.58, 3.84, 0.84, -1.11), + C104.1_Zscore = c(0.53, 1.84, 0.35, -0.54, 1.48, 0.43), + C105.1_Zscore = c(3.46, -1.31, 0.14, -0.15, 1.48, 0.36), + P2025M1_Zscore = c(0.31, 1.84, 2.34, 0.84, -0.46, 0.14), + P2025M2_Zscore = c(2.45, 0.48, 1.45, -0.15, -1.51, 3.56), + P2025M3_Zscore = c(2.14, 0.15, -1.44, -0.78, 1.68, 0.51), + P2025M4_Zscore = c(12.18, 2.48, -0.18, 0.84, 1.48, -2.45), + P2025M5_Zscore = c(3.22, 0.48, -3.18, 0.47, 1.18, 2.14) + ) + rownames(test_intensities_zscore_df) <- c("HMDB001", "HMDB002", "HMDB003", "HMDB004", "HMDB011", "HMDB012") + + write.table(test_intensities_zscore_df, file = "tests/testthat/fixtures/test_intensities_zscore_df.txt", sep = "\t") +} + +make_files_metabolite_groups <- function() { + # Create new directories + dir.create("tests/testthat/fixtures/test_metabolite_groups") + + test_acyl_carnitines <- data.frame( + HMDB_code = c("HMDB001", "HMDB003", "HMDBAA1"), + HMDB_name = c("metab1", "metab3", "ratio1"), + Helix = c("ja", "nee", "ja"), + Helix_naam = c("Metab_1", "Metab_3", "Ratio_1"), + high_zscore = c(2, 2, 2), + low_zscore = c(-1.5, -1.5, -1.5) + ) + + write.table(test_acyl_carnitines, sep = "\t", quote = FALSE, row.names = FALSE, + file = "tests/testthat/fixtures/test_metabolite_groups/test_acyl_carnitines.txt") + + test_crea_gua <- data.frame( + HMDB_code = c("HMDB004", "HMDB011"), + HMDB_name = c("metab4", "metab11"), + Helix = c("ja", "ja"), + Helix_naam = c("Metab_4", "Metab_11"), + high_zscore = c(2, 2), + low_zscore = c(-1.5, -1.5) + ) + + write.table(test_crea_gua, sep = "\t", quote = FALSE, row.names = FALSE, + file = "tests/testthat/fixtures/test_metabolite_groups/test_crea_gua.txt") +} + +make_test_metab_interest_sort <- function() { + test_acyl_carnitines_patients <- data.frame( + HMDB_name = c( + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 " + ), + Sample = c("P2025M1", "P2025M1", "P2025M2", "P2025M2", "P2025M3", + "P2025M3", "P2025M4", "P2025M4", "P2025M5", "P2025M5"), + Z_score = c(0.31, 2.34, 2.45, 1.45, 2.14, -1.44, 12.18, -0.18, 3.22, -3.18) + ) + + test_acyl_carnitines_controls <- data.frame( + HMDB_name = c( + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 " + ), + Sample = c("C101.1", "C101.1", "C102.1", "C102.1", "C103.1", "C103.1", "C104.1", "C104.1", "C105.1", "C105.1"), + Z_score = c(0.45, -1.86, 2.89, -1.88, 0.54, 1.58, 0.53, 0.35, 3.46, 0.14) + ) + + write.table(test_acyl_carnitines_patients, sep = "\t", quote = FALSE, row.names = FALSE, + file = "tests/testthat/fixtures/test_acyl_carnitines_patients.txt") + write.table(test_acyl_carnitines_controls, sep = "\t", quote = FALSE, row.names = FALSE, + file = "tests/testthat/fixtures/test_acyl_carnitines_controls.txt") + + test_crea_gua_patients <- data.frame( + HMDB_name = c( + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 " + ), + Sample = c("P2025M1", "P2025M1", "P2025M2", "P2025M2", "P2025M3", + "P2025M3", "P2025M4", "P2025M4", "P2025M5", "P2025M5"), + Z_score = c(0.84, -0.46, -0.15, -1.51, -0.78, 1.68, 0.84, 1.48, 0.47, 1.18) + ) + + test_crea_gua_controls <- data.frame( + HMDB_name = c( + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 " + ), + Sample = c("C101.1", "C101.1", "C102.1", "C102.1", "C103.1", "C103.1", "C104.1", "C104.1", "C105.1", "C105.1"), + Z_score = c(0.58, 2.47, 5.46, -0.68, 3.84, 0.84, -0.54, 1.48, -0.15, 1.48) + ) + + write.table(test_crea_gua_patients, sep = "\t", quote = FALSE, row.names = FALSE, + file = "tests/testthat/fixtures/test_crea_gua_patients.txt") + write.table(test_crea_gua_controls, sep = "\t", quote = FALSE, row.names = FALSE, + file = "tests/testthat/fixtures/test_crea_gua_controls.txt") +} + +make_test_df_metabs_helix <- function() { + test_df_metabs_helix <- data.frame( + HMDB_name = c("metab1", "metab1", "metab1", "metab1", "metab1", "metab4", "metab11", "metab4", + "metab11", "metab4", "metab11", "metab4", "metab11", "metab4", "metab11"), + Sample = c("P2025M1", "P2025M2", "P2025M3", "P2025M4", "P2025M5", "P2025M1", "P2025M1", "P2025M2", "P2025M2", "P2025M3", + "P2025M3", "P2025M4", "P2025M4", "P2025M5", "P2025M5"), + Z_score = c(0.31, 2.45, 2.14, 12.18, 3.22, 0.84, -0.46, -0.15, -1.51, -0.78, 1.68, 0.84, 1.48, 0.47, 1.18), + Helix_naam = c("Metab_1", "Metab_1", "Metab_1", "Metab_1", "Metab_1", "Metab_4", "Metab_11", + "Metab_4", "Metab_11", "Metab_4", "Metab_11", "Metab_4", "Metab_11", "Metab_4", "Metab_11"), + high_zscore = rep(2, 15), + low_zscore = rep(-1.5, 15) + ) + + write.table(test_df_metabs_helix, sep = "\t", quote = FALSE, row.names = FALSE, + file = "tests/testthat/fixtures/test_df_metabs_helix.txt") +} + +make_test_zscore_patient_df <- function() { + test_zscore_patient_df <- data.frame( + HMDB_code = c("HMDB001", "HMDB002", "HMDB003", "HMDB004", "HMDB005", "HMDB006", "HMDB007", "HMDB008", "HMDB009", "HMDB010", + "HMDB011", "HMDB012", "HMDB013", "HMDB014", "HMDB015", "HMDB016", "HMDB017", "HMDB018", "HMDB019", "HMDB020", + "HMDB021", "HMDB022", "HMDB023", "HMDB024", "HMDB025", "HMDB026", "HMDB027", "HMDB028", "HMDB029", + "HMDB030"), + HMDB_name = c("metab1", "metab2", "metab3", "metab4", "metab5", "metab6", "metab7", "metab8", "metab9", "metab10", + "metab11", "metab12", "metab13", "metab14", "metab15", "metab16", "metab17", "metab18", "metab19", "metab20", + "metab21", "metab22", "metab23", "metab24", "metab25", "metab26", "metab27", "metab28", "metab29", + "metab30"), + P2025M1 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30), + P2025M2 = c(-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, + -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30), + P2025M3 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30), + P2025M4 = c(-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30) + ) + + write.table(test_zscore_patient_df, sep = "\t", quote = FALSE, row.names = FALSE, + file = "tests/testthat/fixtures/test_zscore_patient_df.txt") +} + +make_test_metab_perpage <- function() { + test_acyl_carnitines_df <- data.frame( + HMDB_name = c( + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 " + ), + Sample = c("P2025M1", "P2025M1", "P2025M2", "P2025M2", "P2025M3", + "P2025M3", "P2025M4", "P2025M4", "P2025M5", "P2025M5", + "C101.1", "C101.1", "C102.1", "C102.1", "C103.1", "C103.1", + "C104.1", "C104.1", "C105.1", "C105.1"), + Z_score = c(0.31, 2.34, 2.45, 1.45, 2.14, -1.44, 12.18, -0.18, 3.22, -3.18, + 0.45, -1.86, 2.89, -1.88, 0.54, 1.58, 0.53, 0.35, 3.46, 0.14) + ) + + write.table(test_acyl_carnitines_df, sep = "\t", quote = FALSE, row.names = FALSE, + file = "tests/testthat/fixtures/test_acyl_carnitines_df.txt") + + test_crea_gua_df <- data.frame( + HMDB_name = c( + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 " + ), + Sample = c("P2025M1", "P2025M1", "P2025M2", "P2025M2", "P2025M3", + "P2025M3", "P2025M4", "P2025M4", "P2025M5", "P2025M5", + "C101.1", "C101.1", "C102.1", "C102.1", "C103.1", "C103.1", + "C104.1", "C104.1", "C105.1", "C105.1"), + Z_score = c(0.84, -0.46, -0.15, -1.51, -0.78, 1.68, 0.84, 1.48, 0.47, 1.18, + 0.58, 2.47, 5.46, -0.68, 3.84, 0.84, -0.54, 1.48, -0.15, 1.48) + ) + + write.table(test_crea_gua_df, sep = "\t", quote = FALSE, row.names = FALSE, + file = "tests/testthat/fixtures/test_crea_gua_df.txt") +} + +make_test_expected_biomark_df <- function() { + test_expected_biomarkers_df <- data.frame( + HMDB_code = c("HMDB002", "HMDB002", "HMDB005", "HMDB005", "HMDB005", "HMDB009", "HMDB012", "HMDB012", "HMDB020", "HMDB020", + "HMDB025", "HMDB025", "HMDB025", "HMDB028", "HMDB028"), + Disease = c("Disease A", "Disease B", "Disease B", "Disease B", "Disease C", "Disease D", "Disease A", "Disease E", + "Disease F", "Disease C", "Disease F", "Disease G", "Disease D", "Disease G", "Disease E"), + M.z = c("1.2", "1.2", "2.5", "2.5", "2.5", "3.0", "4.5", "4.5", "2.5", "2.5", "5.1", "5.1", "5.1", "5.8", "5.8"), + Change = c("Increase", "Decrease", "Increase", "Increase", "Decrease", "Decrease", "Increase", "Increase", "Decrease", + "Increase", "Increase", "Decrease", "Increase", "Decrease", "Increase"), + Total_Weight = c(10.0, -1.5, 2.0, 5.0, -2.5, -3.0, 14.5, 4.0, + -7.5, 6.0, 20.0, -5.0, 3.0, -1.5, 4.0), + Absolute_Weight = c(10.0, 1.5, 2.0, 5.0, 2.5, 3.0, 14.5, 4.0, + 7.5, 6.0, 20.0, 5.0, 3.0, 1.5, 4.0), + Dispensability = c("Dispensable", "Dispensable", "Indispensable", "Dispensable", "Dispensable", "Indispensable", + "Dispensable", "Dispensable", "Indispensable", "Indispensable", "Dispensable", "Dispensable", + "Dispensable", "Dispensable", "Indispensable") + ) + + write.table(test_expected_biomarkers_df, sep = "\t", quote = FALSE, row.names = FALSE, + file = "tests/testthat/fixtures/test_expected_biomarkers_df.txt") +} + +make_test_probability_score_df <- function() { + test_probability_score_df <- data.frame( + Disease = c("Disease A", "Disease B", "Disease C", "Disease D", "Disease E", "Disease F", "Disease G"), + P2025M1 = c(10.9, 0.953, 12.1, 0, 44.3, 0, -38.7), + P2025M2 = c(-10.9, 0, 0, -12.5, 0, -77.4, 38.7), + P2025M3 = c(49.9, 2.29, 0, 0, 0, -77.4, 38.7), + P2025M4 = c(-49.9, 0, 12.1, 18.2, 28.1, 0, -38.7) + ) + + write.table(test_probability_score_df, sep = "\t", quote = FALSE, row.names = FALSE, + file = "tests/testthat/fixtures/test_probability_score_df.txt") +} diff --git a/DIMS/tests/testthat/fixtures/test_acyl_carnitines_controls.txt b/DIMS/tests/testthat/fixtures/test_acyl_carnitines_controls.txt new file mode 100644 index 00000000..acf72775 --- /dev/null +++ b/DIMS/tests/testthat/fixtures/test_acyl_carnitines_controls.txt @@ -0,0 +1,11 @@ +HMDB_name Sample Z_score +metab1 C101.1 0.45 +metab3 C101.1 -1.86 +metab1 C102.1 2.89 +metab3 C102.1 -1.88 +metab1 C103.1 0.54 +metab3 C103.1 1.58 +metab1 C104.1 0.53 +metab3 C104.1 0.35 +metab1 C105.1 3.46 +metab3 C105.1 0.14 diff --git a/DIMS/tests/testthat/fixtures/test_acyl_carnitines_df.txt b/DIMS/tests/testthat/fixtures/test_acyl_carnitines_df.txt new file mode 100644 index 00000000..988ce267 --- /dev/null +++ b/DIMS/tests/testthat/fixtures/test_acyl_carnitines_df.txt @@ -0,0 +1,21 @@ +HMDB_name Sample Z_score +metab1 P2025M1 0.31 +metab3 P2025M1 2.34 +metab1 P2025M2 2.45 +metab3 P2025M2 1.45 +metab1 P2025M3 2.14 +metab3 P2025M3 -1.44 +metab1 P2025M4 12.18 +metab3 P2025M4 -0.18 +metab1 P2025M5 3.22 +metab3 P2025M5 -3.18 +metab1 C101.1 0.45 +metab3 C101.1 -1.86 +metab1 C102.1 2.89 +metab3 C102.1 -1.88 +metab1 C103.1 0.54 +metab3 C103.1 1.58 +metab1 C104.1 0.53 +metab3 C104.1 0.35 +metab1 C105.1 3.46 +metab3 C105.1 0.14 diff --git a/DIMS/tests/testthat/fixtures/test_acyl_carnitines_patients.txt b/DIMS/tests/testthat/fixtures/test_acyl_carnitines_patients.txt new file mode 100644 index 00000000..0d9686e1 --- /dev/null +++ b/DIMS/tests/testthat/fixtures/test_acyl_carnitines_patients.txt @@ -0,0 +1,11 @@ +HMDB_name Sample Z_score +metab1 P2025M1 0.31 +metab3 P2025M1 2.34 +metab1 P2025M2 2.45 +metab3 P2025M2 1.45 +metab1 P2025M3 2.14 +metab3 P2025M3 -1.44 +metab1 P2025M4 12.18 +metab3 P2025M4 -0.18 +metab1 P2025M5 3.22 +metab3 P2025M5 -3.18 diff --git a/DIMS/tests/testthat/fixtures/test_crea_gua_controls.txt b/DIMS/tests/testthat/fixtures/test_crea_gua_controls.txt new file mode 100644 index 00000000..184dc09f --- /dev/null +++ b/DIMS/tests/testthat/fixtures/test_crea_gua_controls.txt @@ -0,0 +1,11 @@ +HMDB_name Sample Z_score +metab4 C101.1 0.58 +metab11 C101.1 2.47 +metab4 C102.1 5.46 +metab11 C102.1 -0.68 +metab4 C103.1 3.84 +metab11 C103.1 0.84 +metab4 C104.1 -0.54 +metab11 C104.1 1.48 +metab4 C105.1 -0.15 +metab11 C105.1 1.48 diff --git a/DIMS/tests/testthat/fixtures/test_crea_gua_df.txt b/DIMS/tests/testthat/fixtures/test_crea_gua_df.txt new file mode 100644 index 00000000..1219723d --- /dev/null +++ b/DIMS/tests/testthat/fixtures/test_crea_gua_df.txt @@ -0,0 +1,21 @@ +HMDB_name Sample Z_score +metab4 P2025M1 0.84 +metab11 P2025M1 -0.46 +metab4 P2025M2 -0.15 +metab11 P2025M2 -1.51 +metab4 P2025M3 -0.78 +metab11 P2025M3 1.68 +metab4 P2025M4 0.84 +metab11 P2025M4 1.48 +metab4 P2025M5 0.47 +metab11 P2025M5 1.18 +metab4 C101.1 0.58 +metab11 C101.1 2.47 +metab4 C102.1 5.46 +metab11 C102.1 -0.68 +metab4 C103.1 3.84 +metab11 C103.1 0.84 +metab4 C104.1 -0.54 +metab11 C104.1 1.48 +metab4 C105.1 -0.15 +metab11 C105.1 1.48 diff --git a/DIMS/tests/testthat/fixtures/test_crea_gua_patients.txt b/DIMS/tests/testthat/fixtures/test_crea_gua_patients.txt new file mode 100644 index 00000000..66302789 --- /dev/null +++ b/DIMS/tests/testthat/fixtures/test_crea_gua_patients.txt @@ -0,0 +1,11 @@ +HMDB_name Sample Z_score +metab4 P2025M1 0.84 +metab11 P2025M1 -0.46 +metab4 P2025M2 -0.15 +metab11 P2025M2 -1.51 +metab4 P2025M3 -0.78 +metab11 P2025M3 1.68 +metab4 P2025M4 0.84 +metab11 P2025M4 1.48 +metab4 P2025M5 0.47 +metab11 P2025M5 1.18 diff --git a/DIMS/tests/testthat/fixtures/test_df_metabs_helix.txt b/DIMS/tests/testthat/fixtures/test_df_metabs_helix.txt new file mode 100644 index 00000000..4a557143 --- /dev/null +++ b/DIMS/tests/testthat/fixtures/test_df_metabs_helix.txt @@ -0,0 +1,16 @@ +HMDB_name Sample Z_score Helix_naam high_zscore low_zscore +metab1 P2025M1 0.31 Metab_1 2 -1.5 +metab1 P2025M2 2.45 Metab_1 2 -1.5 +metab1 P2025M3 2.14 Metab_1 2 -1.5 +metab1 P2025M4 12.18 Metab_1 2 -1.5 +metab1 P2025M5 3.22 Metab_1 2 -1.5 +metab4 P2025M1 0.84 Metab_4 2 -1.5 +metab11 P2025M1 -0.46 Metab_11 2 -1.5 +metab4 P2025M2 -0.15 Metab_4 2 -1.5 +metab11 P2025M2 -1.51 Metab_11 2 -1.5 +metab4 P2025M3 -0.78 Metab_4 2 -1.5 +metab11 P2025M3 1.68 Metab_11 2 -1.5 +metab4 P2025M4 0.84 Metab_4 2 -1.5 +metab11 P2025M4 1.48 Metab_11 2 -1.5 +metab4 P2025M5 0.47 Metab_4 2 -1.5 +metab11 P2025M5 1.18 Metab_11 2 -1.5 diff --git a/DIMS/tests/testthat/fixtures/test_expected_biomarkers_df.txt b/DIMS/tests/testthat/fixtures/test_expected_biomarkers_df.txt new file mode 100644 index 00000000..9fb0bb0b --- /dev/null +++ b/DIMS/tests/testthat/fixtures/test_expected_biomarkers_df.txt @@ -0,0 +1,16 @@ +HMDB_code Disease M.z Change Total_Weight Absolute_Weight Dispensability +HMDB002 Disease A 1.2 Increase 10 10 Dispensable +HMDB002 Disease B 1.2 Decrease -1.5 1.5 Dispensable +HMDB005 Disease B 2.5 Increase 2 2 Indispensable +HMDB005 Disease B 2.5 Increase 5 5 Dispensable +HMDB005 Disease C 2.5 Decrease -2.5 2.5 Dispensable +HMDB009 Disease D 3.0 Decrease -3 3 Indispensable +HMDB012 Disease A 4.5 Increase 14.5 14.5 Dispensable +HMDB012 Disease E 4.5 Increase 4 4 Dispensable +HMDB020 Disease F 2.5 Decrease -7.5 7.5 Indispensable +HMDB020 Disease C 2.5 Increase 6 6 Indispensable +HMDB025 Disease F 5.1 Increase 20 20 Dispensable +HMDB025 Disease G 5.1 Decrease -5 5 Dispensable +HMDB025 Disease D 5.1 Increase 3 3 Dispensable +HMDB028 Disease G 5.8 Decrease -1.5 1.5 Dispensable +HMDB028 Disease E 5.8 Increase 4 4 Indispensable diff --git a/DIMS/tests/testthat/fixtures/test_intensities_zscore_df.txt b/DIMS/tests/testthat/fixtures/test_intensities_zscore_df.txt new file mode 100644 index 00000000..90edb160 --- /dev/null +++ b/DIMS/tests/testthat/fixtures/test_intensities_zscore_df.txt @@ -0,0 +1,7 @@ +"HMDB_code" "HMDB_name" "C101.1" "C102.1" "C103.1" "C104.1" "C105.1" "P2025M1" "P2025M2" "P2025M3" "P2025M4" "P2025M5" "mean_control" "sd_control" "C101.1_Zscore" "C102.1_Zscore" "C103.1_Zscore" "C104.1_Zscore" "C105.1_Zscore" "P2025M1_Zscore" "P2025M2_Zscore" "P2025M3_Zscore" "P2025M4_Zscore" "P2025M5_Zscore" +"HMDB001" "HMDB001" "metab1" 1000 1100 1300 1650 180000 1000 1100 1300 1650 180000 37010 79934.23 0.45 2.89 0.54 0.53 3.46 0.31 2.45 2.14 12.18 3.22 +"HMDB002" "HMDB002" "metab2" 1200 1700 750 925 1950 1200 1700 750 925 1950 1305 508.8 1.67 0.79 -0.85 1.84 -1.31 1.84 0.48 0.15 2.48 0.48 +"HMDB003" "HMDB003" "metab3" 1300 925 1000 1600 750 1300 925 1000 1600 750 1115 336.15 -1.86 -1.88 1.58 0.35 0.14 2.34 1.45 -1.44 -0.18 -3.18 +"HMDB004" "HMDB004" "metab4" 1400 1125 1220 1650 15050 1400 1125 1220 1650 15050 4089 6130.65 0.58 5.46 3.84 -0.54 -0.15 0.84 -0.15 -0.78 0.84 0.47 +"HMDB011" "HMDB011" "metab5" 1500 1200 1100 1025 1100 1100 1050 975 1700 10000 1185 186.75 2.47 -0.68 0.84 1.48 1.48 -0.46 -1.51 1.68 1.48 1.18 +"HMDB012" "HMDB012" "metab6" 1600 1050 1200 1150 1300 975 1175 1100 1750 1500 1260 210.36 -0.56 1.65 -1.11 0.43 0.36 0.14 3.56 0.51 -2.45 2.14 diff --git a/DIMS/tests/testthat/fixtures/test_metabolite_groups/test_acyl_carnitines.txt b/DIMS/tests/testthat/fixtures/test_metabolite_groups/test_acyl_carnitines.txt new file mode 100644 index 00000000..03a6ad6b --- /dev/null +++ b/DIMS/tests/testthat/fixtures/test_metabolite_groups/test_acyl_carnitines.txt @@ -0,0 +1,4 @@ +HMDB_code HMDB_name Helix Helix_naam high_zscore low_zscore +HMDB001 metab1 ja Metab_1 2 -1.5 +HMDB003 metab3 nee Metab_3 2 -1.5 +HMDBAA1 ratio1 ja Ratio_1 2 -1.5 diff --git a/DIMS/tests/testthat/fixtures/test_metabolite_groups/test_crea_gua.txt b/DIMS/tests/testthat/fixtures/test_metabolite_groups/test_crea_gua.txt new file mode 100644 index 00000000..08dd589d --- /dev/null +++ b/DIMS/tests/testthat/fixtures/test_metabolite_groups/test_crea_gua.txt @@ -0,0 +1,3 @@ +HMDB_code HMDB_name Helix Helix_naam high_zscore low_zscore +HMDB004 metab4 ja Metab_4 2 -1.5 +HMDB011 metab11 ja Metab_11 2 -1.5 diff --git a/DIMS/tests/testthat/fixtures/test_probability_score_df.txt b/DIMS/tests/testthat/fixtures/test_probability_score_df.txt new file mode 100644 index 00000000..385fe1d3 --- /dev/null +++ b/DIMS/tests/testthat/fixtures/test_probability_score_df.txt @@ -0,0 +1,8 @@ +Disease P2025M1 P2025M2 P2025M3 P2025M4 +Disease A 10.9 -10.9 49.9 -49.9 +Disease B 0.953 0 2.29 0 +Disease C 12.1 0 0 12.1 +Disease D 0 -12.5 0 18.2 +Disease E 44.3 0 0 28.1 +Disease F 0 -77.4 -77.4 0 +Disease G -38.7 38.7 38.7 -38.7 diff --git a/DIMS/tests/testthat/fixtures/test_zscore_patient_df.txt b/DIMS/tests/testthat/fixtures/test_zscore_patient_df.txt new file mode 100644 index 00000000..0de283ad --- /dev/null +++ b/DIMS/tests/testthat/fixtures/test_zscore_patient_df.txt @@ -0,0 +1,31 @@ +HMDB_code HMDB_name P2025M1 P2025M2 P2025M3 P2025M4 +HMDB001 metab1 1 -1 1 -1 +HMDB002 metab2 2 -2 2 -2 +HMDB003 metab3 3 -3 3 -3 +HMDB004 metab4 4 -4 4 -4 +HMDB005 metab5 5 -5 5 -5 +HMDB006 metab6 6 -6 6 -6 +HMDB007 metab7 7 -7 7 -7 +HMDB008 metab8 8 -8 8 -8 +HMDB009 metab9 9 -9 9 -9 +HMDB010 metab10 10 -10 10 -10 +HMDB011 metab11 11 -11 11 -11 +HMDB012 metab12 12 -12 12 -12 +HMDB013 metab13 13 -13 13 -13 +HMDB014 metab14 14 -14 14 -14 +HMDB015 metab15 15 -15 15 -15 +HMDB016 metab16 16 -16 -16 16 +HMDB017 metab17 17 -17 -17 17 +HMDB018 metab18 18 -18 -18 18 +HMDB019 metab19 19 -19 -19 19 +HMDB020 metab20 20 -20 -20 20 +HMDB021 metab21 21 -21 -21 21 +HMDB022 metab22 22 -22 -22 22 +HMDB023 metab23 23 -23 -23 23 +HMDB024 metab24 24 -24 -24 24 +HMDB025 metab25 25 -25 -25 25 +HMDB026 metab26 26 -26 -26 26 +HMDB027 metab27 27 -27 -27 27 +HMDB028 metab28 28 -28 -28 28 +HMDB029 metab29 29 -29 -29 29 +HMDB030 metab30 30 -30 -30 30 diff --git a/DIMS/tests/testthat/test_generate_violin_plots.R b/DIMS/tests/testthat/test_generate_violin_plots.R new file mode 100644 index 00000000..55a2dac1 --- /dev/null +++ b/DIMS/tests/testthat/test_generate_violin_plots.R @@ -0,0 +1,401 @@ +# unit tests for GenerateViolinPlots +# functions: get_intentities_for_ratios, get_zscore_columns, +# get_list_metabolites, combine_metab_info_zscores, +# prepare_data_perpage, get_patient_data_to_helix +# is_diagnostic_patient, output_for_helix +# add_lab_id_and_onderzoeksnummer, prepare_alarmvalues +# prepare_toplist, create_pdf_violin_plots +# create_violin_plot, run_diem_algorithm +# rank_patient_zscores, save_prob_scores_to_Excel + +suppressPackageStartupMessages(library("dplyr")) +library(reshape2) +library(openxlsx) +library(ggplot2) +suppressPackageStartupMessages(library("gridExtra")) +library(stringr) + +source("../../export/generate_violin_plots_functions.R") + +testthat::test_that("Get intensities for calculating the ratios", { + test_intensities_zscore_df <- read.delim(test_path("fixtures", "test_intensities_zscore_df.txt")) + + test_ratios_metabs_df <- data.frame( + HMDB.code = c("HMDBAA1", "HMDBAA2", "HMDBAB1"), + Ratio_name = c("ratio1", "ratio2", "ratio3"), + HMDB_numerator = c("HMDB001", "HMDB002plusHMDB003", "HMDB004"), + HMDB_denominator = c("HMDB011", "HMDB012", "one") + ) + + test_intensity_cols <- c("C101.1", "C102.1", "C103.1", "C104.1", "C105.1", + "P2025M1", "P2025M2", "P2025M3", "P2025M4", "P2025M5") + + expect_equal(colnames(get_intentities_for_ratios(test_ratios_metabs_df, 1, test_intensities_zscore_df, + "HMDB_numerator", test_intensity_cols)), + c("C101.1", "C102.1", "C103.1", "C104.1", "C105.1", + "P2025M1", "P2025M2", "P2025M3", "P2025M4", "P2025M5")) + expect_equal(unname(get_intentities_for_ratios(test_ratios_metabs_df, 2, test_intensities_zscore_df, + "HMDB_numerator", test_intensity_cols)), + c(2500, 2625, 1750, 2525, 2700, 2500, 2625, 1750, 2525, 2700)) + expect_equal(get_intentities_for_ratios(test_ratios_metabs_df, 3, test_intensities_zscore_df, + "HMDB_denominator", test_intensity_cols), + 1) +}) + +testthat::test_that("Get samples that have both intensity and Z-score columns", { + test_intensities_zscore_df <- read.delim(test_path("fixtures", "test_intensities_zscore_df.txt")) + + test_intensity_cols <- c("C101.1", "C102.1", "C103.1", "C104.1", "C105.1", + "P2025M1", "P2025M2", "P2025M3", "P2025M4", "P2025M5") + + expect_equal(length(get_zscore_columns(colnames(test_intensities_zscore_df), test_intensity_cols)), + 10) + expect_equal(get_zscore_columns(colnames(test_intensities_zscore_df), test_intensity_cols), + c("C101.1_Zscore", "C102.1_Zscore", "C103.1_Zscore", "C104.1_Zscore", "C105.1_Zscore", + "P2025M1_Zscore", "P2025M2_Zscore", "P2025M3_Zscore", "P2025M4_Zscore", "P2025M5_Zscore")) + + test_intensity_cols <- c("C101.1", "C102.1", "C103.1", "C104.1", "C105.1", "P2025M1", "P2025M2", "P2025M3") + expect_equal(length(get_zscore_columns(colnames(test_intensities_zscore_df), test_intensity_cols)), + 8) + expect_equal(get_zscore_columns(colnames(test_intensities_zscore_df), test_intensity_cols), + c("C101.1_Zscore", "C102.1_Zscore", "C103.1_Zscore", "C104.1_Zscore", "C105.1_Zscore", + "P2025M1_Zscore", "P2025M2_Zscore", "P2025M3_Zscore")) +}) + +testthat::test_that("Get a list with dataframes from metabilite files in a directory", { + test_path_metabolite_groups <- test_path("fixtures/test_metabolite_groups") + + expect_type(get_list_metabolites(test_path_metabolite_groups), "list") + expect_identical(names(get_list_metabolites(test_path_metabolite_groups)), + c("test_acyl_carnitines", "test_crea_gua")) + + expect_identical(colnames(get_list_metabolites(test_path_metabolite_groups)$test_acyl_carnitines), + c("HMDB_code", "HMDB_name", "Helix", "Helix_naam", "high_zscore", "low_zscore")) + expect_identical(get_list_metabolites(test_path_metabolite_groups)$test_acyl_carnitines$HMDB_name, + c("metab1", "metab3", "ratio1")) + expect_identical(get_list_metabolites(test_path_metabolite_groups)$test_acyl_carnitines$Helix, + c("ja", "nee", "ja")) + + expect_identical(colnames(get_list_metabolites(test_path_metabolite_groups)$test_crea_gua), + c("HMDB_code", "HMDB_name", "Helix", "Helix_naam", "high_zscore", "low_zscore")) + expect_identical(get_list_metabolites(test_path_metabolite_groups)$test_crea_gua$HMDB_name, + c("metab4", "metab11")) + expect_identical(get_list_metabolites(test_path_metabolite_groups)$test_crea_gua$Helix, + c("ja", "ja")) +}) + +testthat::test_that("Combine metabolite info dataframe and Z-score dataframe", { + test_acyl_carnitines_df <- read.delim(test_path("fixtures/test_metabolite_groups/", "test_acyl_carnitines.txt")) + test_crea_gua_df <- read.delim(test_path("fixtures/test_metabolite_groups/", "test_crea_gua.txt")) + + test_metab_list_all <- list(test_acyl_carnitines_df, test_crea_gua_df) + names(test_metab_list_all) <- c("test_acyl_carnitines", "test_crea_gua") + + test_patient_cols <- c("P2025M1_Zscore", "P2025M2_Zscore", "P2025M3_Zscore", "P2025M4_Zscore", "P2025M5_Zscore") + test_intensities_zscore_df <- read.delim(test_path("fixtures", "test_intensities_zscore_df.txt")) + test_zscore_patients_df <- test_intensities_zscore_df %>% select(HMDB_code, HMDB_name, any_of(test_patient_cols)) + + expect_type(combine_metab_info_zscores(test_metab_list_all, test_zscore_patients_df), "list") + expect_identical(names(combine_metab_info_zscores(test_metab_list_all, test_zscore_patients_df)), + c("test_acyl_carnitines", "test_crea_gua")) + + expect_identical(colnames(combine_metab_info_zscores(test_metab_list_all, test_zscore_patients_df)$test_acyl_carnitines), + c("HMDB_name", "Sample", "Z_score")) + expect_identical((combine_metab_info_zscores(test_metab_list_all, + test_zscore_patients_df)$test_acyl_carnitines$Z_score), + c(0.31, 2.34, 2.45, 1.45, 2.14, -1.44, 12.18, -0.18, 3.22, -3.18)) + expect_identical(as.character(combine_metab_info_zscores(test_metab_list_all, + test_zscore_patients_df)$test_acyl_carnitines$Sample), + c("P2025M1", "P2025M1", "P2025M2", "P2025M2", "P2025M3", + "P2025M3", "P2025M4", "P2025M4", "P2025M5", "P2025M5")) + expect_equal(nchar(combine_metab_info_zscores(test_metab_list_all, + test_zscore_patients_df)$test_acyl_carnitines$HMDB_name[1]), + 45) + + expect_identical(colnames(combine_metab_info_zscores(test_metab_list_all, test_zscore_patients_df)$test_crea_gua), + c("HMDB_name", "Sample", "Z_score")) + expect_identical((combine_metab_info_zscores(test_metab_list_all, + test_zscore_patients_df)$test_crea_gua$Z_score), + c(0.84, -0.46, -0.15, -1.51, -0.78, 1.68, 0.84, 1.48, 0.47, 1.18)) + expect_identical(as.character(combine_metab_info_zscores(test_metab_list_all, + test_zscore_patients_df)$test_crea_gua$Sample), + c("P2025M1", "P2025M1", "P2025M2", "P2025M2", "P2025M3", + "P2025M3", "P2025M4", "P2025M4", "P2025M5", "P2025M5")) + expect_equal(nchar(combine_metab_info_zscores(test_metab_list_all, + test_zscore_patients_df)$test_crea_gua$HMDB_name[1]), + 45) +}) + +testthat::test_that("Combine patient and control data for each page of the violinplot pdf", { + test_acyl_carnitines_pat <- read.delim(test_path("fixtures/", "test_acyl_carnitines_patients.txt")) + test_acyl_carnitines_ctrl <- read.delim(test_path("fixtures/", "test_acyl_carnitines_controls.txt")) + + test_crea_gua_pat <- read.delim(test_path("fixtures/", "test_crea_gua_patients.txt")) + test_crea_gua_ctrl <- read.delim(test_path("fixtures/", "test_crea_gua_controls.txt")) + + test_metab_interest_sorted <- list(test_acyl_carnitines_pat, test_crea_gua_pat) + names(test_metab_interest_sorted) <- c("test_acyl_carnitines", "test_crea_gua") + + test_metab_interest_contr <- list(test_acyl_carnitines_ctrl, test_crea_gua_ctrl) + names(test_metab_interest_contr) <- c("test_acyl_carnitines", "test_crea_gua") + + test_nr_plots_perpage <- 1 + test_nr_pat <- 5 + test_nr_contr <- 5 + + expect_type(prepare_data_perpage(test_metab_interest_sorted, test_metab_interest_contr, + test_nr_plots_perpage, test_nr_pat, test_nr_contr), + "list") + expect_equal(length(prepare_data_perpage(test_metab_interest_sorted, test_metab_interest_contr, + test_nr_plots_perpage, test_nr_pat, test_nr_contr)), + 4) + expect_identical(names(prepare_data_perpage(test_metab_interest_sorted, test_metab_interest_contr, + test_nr_plots_perpage, test_nr_pat, test_nr_contr)), + c("test_acyl_carnitines_1", "test_acyl_carnitines_2", "test_crea_gua_1", "test_crea_gua_2")) + expect_identical(unique(prepare_data_perpage(test_metab_interest_sorted, test_metab_interest_contr, + test_nr_plots_perpage, test_nr_pat, + test_nr_contr)$test_acyl_carnitines_1$HMDB_name), + c("metab1 ")) + expect_identical(prepare_data_perpage(test_metab_interest_sorted, test_metab_interest_contr, + test_nr_plots_perpage, test_nr_pat, + test_nr_contr)$test_acyl_carnitines_1$Sample, + c("P2025M1", "P2025M2", "P2025M3", "P2025M4", "P2025M5", "C101.1", "C102.1", "C103.1", "C104.1", "C105.1")) + + test_nr_plots_perpage <- 2 + + expect_equal(length(prepare_data_perpage(test_metab_interest_sorted, test_metab_interest_contr, + test_nr_plots_perpage, test_nr_pat, test_nr_contr)), + 2) + expect_identical(names(prepare_data_perpage(test_metab_interest_sorted, test_metab_interest_contr, + test_nr_plots_perpage, test_nr_pat, test_nr_contr)), + c("test_acyl_carnitines_1", "test_crea_gua_1")) + expect_identical(unique(prepare_data_perpage(test_metab_interest_sorted, test_metab_interest_contr, + test_nr_plots_perpage, test_nr_pat, + test_nr_contr)$test_acyl_carnitines_1$HMDB_name), + c("metab1 ", "metab3 ")) + expect_identical(prepare_data_perpage(test_metab_interest_sorted, test_metab_interest_contr, + test_nr_plots_perpage, test_nr_pat, + test_nr_contr)$test_acyl_carnitines_1$Sample, + c("P2025M1", "P2025M1", "P2025M2", "P2025M2", "P2025M3", + "P2025M3", "P2025M4", "P2025M4", "P2025M5", "P2025M5", + "C101.1", "C101.1", "C102.1", "C102.1", "C103.1", "C103.1", "C104.1", "C104.1", "C105.1", "C105.1")) +}) + +testthat::test_that("Generate a dataframe with information for Helix", { + test_acyl_carnitines_pat <- read.delim(test_path("fixtures/", "test_acyl_carnitines_patients.txt")) + test_crea_gua_pat <- read.delim(test_path("fixtures/", "test_crea_gua_patients.txt")) + + test_metab_interest_sorted <- list(test_acyl_carnitines_pat, test_crea_gua_pat) + names(test_metab_interest_sorted) <- c("test_acyl_carnitines", "test_crea_gua") + + test_acyl_carnitines_df <- read.delim(test_path("fixtures/test_metabolite_groups/", "test_acyl_carnitines.txt")) + test_crea_gua_df <- read.delim(test_path("fixtures/test_metabolite_groups/", "test_crea_gua.txt")) + + test_metab_list_all <- list(test_acyl_carnitines_df, test_crea_gua_df) + names(test_metab_list_all) <- c("test_acyl_carnitines", "test_crea_gua") + + expect_identical(colnames(get_patient_data_to_helix(test_metab_interest_sorted, test_metab_list_all)), + c("HMDB_name", "Sample", "Z_score", "Helix_naam", "high_zscore", "low_zscore")) + expect_equal(dim(get_patient_data_to_helix(test_metab_interest_sorted, test_metab_list_all)), + c(15, 6)) + expect_identical(unique(get_patient_data_to_helix(test_metab_interest_sorted, test_metab_list_all)$HMDB_name), + c("metab1", "metab4", "metab11")) + expect_false("ratio1" %in% get_patient_data_to_helix(test_metab_interest_sorted, test_metab_list_all)$HMDB_name) + expect_equal(get_patient_data_to_helix(test_metab_interest_sorted, test_metab_list_all)$Z_score, + c(0.31, 2.45, 2.14, 12.18, 3.22, 0.84, -0.46, -0.15, -1.51, -0.78, 1.68, 0.84, 1.48, 0.47, 1.18)) +}) + +testthat::test_that("Check for diagnostic patients", { + test_patient_column <- c("P2025M1", "P2025M2", "P2025M3", "P2025M4", "C101.1", "C102.1", "P2025D1", "P225M1") + + expect_equal(is_diagnostic_patient(test_patient_column), c(TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE)) + expect_equal(length(is_diagnostic_patient(test_patient_column)), 8) +}) + +testthat::test_that("Adding labnummer and Onderzoeksnummer to the Helix dataframe", { + test_df_metabs_helix <- read.delim(test_path("fixtures/", "test_df_metabs_helix.txt")) + + test_df_metabs_helix <- test_df_metabs_helix %>% + group_by(Sample) %>% + mutate(Vial = cur_group_id()) %>% + ungroup() + + expect_true("labnummer" %in% colnames(add_lab_id_and_onderzoeksnr(test_df_metabs_helix))) + expect_true("Onderzoeksnummer" %in% colnames(add_lab_id_and_onderzoeksnr(test_df_metabs_helix))) + expect_identical(unique(add_lab_id_and_onderzoeksnr(test_df_metabs_helix)$labnummer), + c("2025M1", "2025M2", "2025M3", "2025M4", "2025M5")) + expect_identical(unique(add_lab_id_and_onderzoeksnr(test_df_metabs_helix)$Onderzoeksnummer), + c("MB2025/1", "MB2025/2", "MB2025/3", "MB2025/4", "MB2025/5")) +}) + +testthat::test_that("Make the output for Helix", { + test_protocol_name <- "test_protocol_name" + + test_df_metabs_helix <- read.delim(test_path("fixtures/", "test_df_metabs_helix.txt")) + + expect_equal(dim(output_for_helix(test_protocol_name, test_df_metabs_helix)), + c(15, 6)) + expect_identical(colnames(output_for_helix(test_protocol_name, test_df_metabs_helix)), + c("Vial", "labnummer", "Onderzoeksnummer", "Protocol", "Name", "Amount")) + expect_identical(unique(output_for_helix(test_protocol_name, test_df_metabs_helix)$Protocol), + "test_protocol_name") + expect_identical(unique(output_for_helix(test_protocol_name, test_df_metabs_helix)$labnummer), + c("2025M1", "2025M2", "2025M3", "2025M4", "2025M5")) + expect_identical(unique(output_for_helix(test_protocol_name, test_df_metabs_helix)$Onderzoeksnummer), + c("MB2025/1", "MB2025/2", "MB2025/3", "MB2025/4", "MB2025/5")) + expect_equal(output_for_helix(test_protocol_name, test_df_metabs_helix)$Amount, + c(0.31, 2.45, 2.14, 12.18, 3.22, 0.84, -0.46, -0.15, -1.51, -0.78, 1.68, 0.84, 1.48, 0.47, 1.18)) +}) + +testthat::test_that("Create a dataframe with all metabolites that exceed the min and max Z-score cutoff", { + test_df_metabs_helix <- read.delim(test_path("fixtures/", "test_df_metabs_helix.txt")) + test_patient_id <- "P2025M1" + + expect_equal(dim(prepare_alarmvalues(test_patient_id, test_df_metabs_helix)), + c(2, 2)) + expect_equal(colnames(prepare_alarmvalues(test_patient_id, test_df_metabs_helix)), + c("Metabolite", "Z-score")) + + test_patient_id <- "P2025M2" + expect_equal(dim(prepare_alarmvalues(test_patient_id, test_df_metabs_helix)), + c(4, 2)) + expect_equal(prepare_alarmvalues(test_patient_id, test_df_metabs_helix)$`Z-score`, + c("", "2.45", "", "-1.51")) + + test_patient_id <- "P2025M4" + expect_equal(dim(prepare_alarmvalues(test_patient_id, test_df_metabs_helix)), + c(3, 2)) + expect_equal(prepare_alarmvalues(test_patient_id, test_df_metabs_helix)$`Z-score`, + c("", "12.18", "")) +}) + +testthat::test_that("Create a dataframe with the top 20 highest and top 10 lowest metabolites", { + test_zscore_patient_df <- read.delim(test_path("fixtures/", "test_zscore_patient_df.txt")) + test_patient_id <- "P2025M1" + + expect_equal(dim(prepare_toplist(test_patient_id, test_zscore_patient_df)), + c(32, 3)) + expect_equal(colnames(prepare_toplist(test_patient_id, test_zscore_patient_df)), + c("HMDB_ID", "Metabolite", "Z-score")) + expect_equal(prepare_toplist(test_patient_id, test_zscore_patient_df)$HMDB_ID, + c("Increased", "HMDB030", "HMDB029", "HMDB028", "HMDB027", "HMDB026", "HMDB025", "HMDB024", "HMDB023", + "HMDB022", "HMDB021", "HMDB020", "HMDB019", "HMDB018", "HMDB017", "HMDB016", "HMDB015", "HMDB014", "HMDB013", + "HMDB012", "HMDB011", "Decreased", "HMDB001", "HMDB002", "HMDB003", "HMDB004", "HMDB005", "HMDB006", + "HMDB007", "HMDB008", "HMDB009", "HMDB010")) + expect_equal(prepare_toplist(test_patient_id, test_zscore_patient_df)$`Z-score`, + c("", "30", "29", "28", "27", "26", "25", "24", "23", "22", "21", "20", "19", "18", "17", "16", "15", + "14", "13", "12", "11", "", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10")) + + test_patient_id <- "P2025M2" + + expect_equal(prepare_toplist(test_patient_id, test_zscore_patient_df)$Metabolite, + c("", "metab1", "metab2", "metab3", "metab4", "metab5", "metab6", "metab7", "metab8", "metab9", "metab10", + "metab11", "metab12", "metab13", "metab14", "metab15", "metab16", "metab17", "metab18", "metab19", "metab20", + "", "metab30", "metab29", "metab28", "metab27", "metab26", "metab25", "metab24", "metab23", "metab22", + "metab21")) + expect_equal(prepare_toplist(test_patient_id, test_zscore_patient_df)$`Z-score`, + c("", "-1", "-2", "-3", "-4", "-5", "-6", "-7", "-8", "-9", "-10", "-11", "-12", "-13", "-14", "-15", + "-16", "-17", "-18", "-19", "-20", "", "-30", "-29", "-28", "-27", "-26", "-25", "-24", "-23", "-22", "-21")) +}) + +testthat::test_that("Create a pdf with a table of top metabolites and violin plots", { + local_edition(3) + temp_dir <- "./" + dir.create(paste0(temp_dir, "violin_plots/")) + + test_pdf_dir <- paste0(temp_dir, "violin_plots/") + test_patient_id <- "P2025M1" + test_explanation <- "Unit test Generate Violin Plots" + + test_acyl_carnitines_df <- read.delim(test_path("fixtures/", "test_acyl_carnitines_df.txt")) + attr(test_acyl_carnitines_df, "y_order") <- rev(unique(test_acyl_carnitines_df$HMDB_name)) + test_crea_gua_df <- read.delim(test_path("fixtures/", "test_crea_gua_df.txt")) + attr(test_crea_gua_df, "y_order") <- rev(unique(test_crea_gua_df$HMDB_name)) + + test_metab_perpage <- list(test_acyl_carnitines_df, test_crea_gua_df) + names(test_metab_perpage) <- c("test_acyl_carnitines", "test_crea_gua") + + test_top_metab_pt <- data.frame( + Metabolite = c("Increased", "metab1", "Decreased", "metab11"), + `Z-score` = c("", "2.45", "", "-1.51") + ) + + expect_silent(create_pdf_violin_plots(test_pdf_dir, test_patient_id, + test_metab_perpage, test_top_metab_pt, test_explanation)) + + out_pdf_violinplots <- file.path(test_pdf_dir, "R_P2025M1.pdf") + expect_true(file.exists(out_pdf_violinplots)) + content_pdf_violinplots <- pdftools::pdf_text(out_pdf_violinplots) + expect_snapshot(content_pdf_violinplots) + + unlink(test_pdf_dir, recursive = TRUE) +}) + +testthat::test_that("Create a violin plot", { + test_patient_id <- "P2025M1" + test_sub_perpage <- "test acyl carnitines" + + test_acyl_carnitines_df <- read.delim(test_path("fixtures/", "test_acyl_carnitines_df.txt")) + attr(test_acyl_carnitines_df, "y_order") <- rev(unique(test_acyl_carnitines_df$HMDB_name)) + + test_patient_zscore_df <- test_acyl_carnitines_df %>% filter(Sample == test_patient_id) + + test_metab_zscores_df <- test_acyl_carnitines_df %>% filter(Sample != test_patient_id) + + expect_silent(create_violin_plot(test_metab_zscores_df, test_patient_zscore_df, test_sub_perpage, test_patient_id)) + + expect_doppelganger("violin_plot_P2025M1", create_violin_plot(test_metab_zscores_df, test_patient_zscore_df, + test_sub_perpage, test_patient_id)) +}) + +testthat::test_that("Run dIEM algorithm", { + test_expected_biomarkers_df <- read.delim(test_path("fixtures/", "test_expected_biomarkers_df.txt")) + test_zscore_patient_df <- read.delim(test_path("fixtures/", "test_zscore_patient_df.txt")) + test_sample_cols <- c("P2025M1", "P2025M2", "P2025M3", "P2025M4") + + expect_equal(dim(run_diem_algorithm(test_expected_biomarkers_df, test_zscore_patient_df, test_sample_cols)), + c(7, 5)) + expect_identical(colnames(run_diem_algorithm(test_expected_biomarkers_df, test_zscore_patient_df, test_sample_cols)), + c("Disease", "P2025M1", "P2025M2", "P2025M3", "P2025M4")) + expect_identical(run_diem_algorithm(test_expected_biomarkers_df, test_zscore_patient_df, test_sample_cols)$Disease, + c("Disease A", "Disease B", "Disease C", "Disease D", "Disease E", "Disease F", "Disease G")) + expect_equal(run_diem_algorithm(test_expected_biomarkers_df, test_zscore_patient_df, test_sample_cols)$P2025M1, + c(10.94172, 0.95343, 12.12121, 0.00000, 44.28850, 0.00000, -38.70370), tolerance = 0.0001) +}) + +testthat::test_that("Ranking Z-scores for a patient", { + test_zscore_col <- c(1, 5, 6, 2, 7, -2, 3) + + expect_equal(length(rank_patient_zscores(test_zscore_col)), 7) + + expect_identical(rank_patient_zscores(test_zscore_col), + c(6, 3, 2, 5, 1, 1, 4)) + + test_zscore_col <- c(3, 2, 1, 3) + + expect_identical(rank_patient_zscores(test_zscore_col), + c(1, 2, 3, 1)) + + test_zscore_col <- c(-1, -2, -3, -4) + + expect_identical(rank_patient_zscores(test_zscore_col), + c(4, 3, 2, 1)) +}) + +testthat::test_that("Saving the probability score dataframe as an Excel file", { + local_edition(3) + test_probability_score_df <- read.delim(test_path("fixtures/", "test_probability_score_df.txt")) + test_output_dir <- "./test_excel" + dir.create(test_output_dir) + + test_run_name <- "test_run" + out_excel_file <- file.path(test_output_dir, paste0("/dIEM_algoritme_output_", test_run_name, ".xlsx")) + + expect_silent(save_prob_scores_to_excel(test_probability_score_df, test_output_dir, test_run_name)) + expect_true(file.exists(out_excel_file)) + + content_excel_file <- openxlsx::read.xlsx(out_excel_file, sheet = 1) + expect_snapshot_output(content_excel_file) + + unlink(test_output_dir, recursive = TRUE) +})