From 43398e8452b45322f8762cf2c3e19e826f68a492 Mon Sep 17 00:00:00 2001 From: ESCRI11 Date: Mon, 27 Oct 2025 18:57:32 +0100 Subject: [PATCH 1/2] =?UTF-8?q?feat=C3=91=20mdPatterns=20fct?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- NAMESPACE | 1 + R/mdPatternDS.R | 121 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 R/mdPatternDS.R diff --git a/NAMESPACE b/NAMESPACE index 897148d1..e52e5d10 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -82,6 +82,7 @@ export(matrixDimnamesDS) export(matrixInvertDS) export(matrixMultDS) export(matrixTransposeDS) +export(mdPatternDS) export(meanDS) export(meanSdGpDS) export(mergeDS) diff --git a/R/mdPatternDS.R b/R/mdPatternDS.R new file mode 100644 index 00000000..0f817034 --- /dev/null +++ b/R/mdPatternDS.R @@ -0,0 +1,121 @@ +#' +#' @title Missing data pattern with disclosure control +#' @description This function is a serverside aggregate function that computes the +#' missing data pattern using mice::md.pattern and applies disclosure control to +#' prevent revealing small cell counts. +#' @details This function calls the mice::md.pattern function to generate a matrix +#' showing the missing data patterns in the input data. To ensure disclosure control, +#' any pattern counts that are below the threshold (nfilter.tab, default=3) are +#' suppressed. +#' +#' \strong{Suppression Method:} +#' +#' When a pattern count is below threshold: +#' - Row name is changed to "suppressed()" where N is the threshold +#' - All pattern values in that row are set to NA +#' - Summary row is also set to NA (prevents back-calculation) +#' +#' \strong{Output Matrix Structure:} +#' +#' - Rows represent different missing data patterns (plus a summary row at the bottom) +#' - Row names contain pattern counts (or "suppressed()" for invalid patterns) +#' - Columns show 1 if variable is observed, 0 if missing +#' - Last column shows total number of missing values per pattern +#' - Last row shows total number of missing values per variable +#' +#' \strong{Note for Pooling:} +#' +#' When this function is called from ds.mdPattern with type='combine', suppressed +#' patterns are excluded from pooling to prevent disclosure through subtraction. +#' This means pooled counts may underestimate the true total when patterns are +#' suppressed in some studies. +#' +#' @param x a character string specifying the name of a data frame or matrix +#' containing the data to analyze for missing patterns. +#' @return A list containing: +#' \item{pattern}{The missing data pattern matrix with disclosure control applied} +#' \item{valid}{Logical indicating if all patterns meet disclosure requirements} +#' \item{message}{A message describing the validity status} +#' @author Xavier EscribĂ  montagut for DataSHIELD Development Team +#' @import mice +#' @export +#' +mdPatternDS <- function(x){ + + ############################################################# + # MODULE 1: CAPTURE THE nfilter SETTINGS + thr <- dsBase::listDisclosureSettingsDS() + nfilter.tab <- as.numeric(thr$nfilter.tab) + ############################################################# + + # Parse the input data name with error handling + x.val <- tryCatch( + { + eval(parse(text=x), envir = parent.frame()) + }, + error = function(e) { + stop(paste0("Object '", x, "' does not exist on the server"), call. = FALSE) + } + ) + + # Check object class + typ <- class(x.val) + + # Check that input is a data frame or matrix + if(!("data.frame" %in% typ || "matrix" %in% typ)){ + stop(paste0("The input object must be of type 'data.frame' or 'matrix'. Current type: ", + paste(typ, collapse = ", ")), call. = FALSE) + } + + # Use x.val for further processing + x <- x.val + + # Call mice::md.pattern with plot=FALSE + pattern <- mice::md.pattern(x, plot = FALSE) + + # Apply disclosure control + # Pattern counts are stored in row names (except last row which is empty/summary) + # The last row contains variable-level missing counts + + validity <- "valid" + n_patterns <- nrow(pattern) - 1 # exclude the summary row + + if(n_patterns > 0){ + # Check pattern counts (stored in row names, excluding last row) + pattern_counts <- as.numeric(rownames(pattern)[1:n_patterns]) + + # Find patterns with counts below threshold + invalid_idx <- which(pattern_counts > 0 & pattern_counts < nfilter.tab) + + if(length(invalid_idx) > 0){ + validity <- "invalid" + + # For invalid patterns, suppress by: + # - Setting row name to "suppressed" + # - Setting all pattern values to NA + rnames <- rownames(pattern) + for(idx in invalid_idx){ + rnames[idx] <- paste0("suppressed(<", nfilter.tab, ")") + pattern[idx, ] <- NA + } + rownames(pattern) <- rnames + + # Also need to recalculate the last row (summary) if patterns were suppressed + # Set to NA to avoid disclosures + pattern[nrow(pattern), seq_len(ncol(pattern))] <- NA + } + } + + # Return the pattern with validity information + return(list( + pattern = pattern, + valid = (validity == "valid"), + message = ifelse(validity == "valid", + "Valid: all pattern counts meet disclosure requirements", + paste0("Invalid: some pattern counts below threshold (", + nfilter.tab, ") have been suppressed")) + )) +} + +#AGGREGATE FUNCTION +# mdPatternDS From 987fbcc2f1129a491ce8e9793268a263451a2d8b Mon Sep 17 00:00:00 2001 From: ESCRI11 Date: Mon, 27 Oct 2025 19:00:24 +0100 Subject: [PATCH 2/2] add mdPatterns to DATASHIELD --- inst/DATASHIELD | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/DATASHIELD b/inst/DATASHIELD index c9dd9390..0c59f0c2 100644 --- a/inst/DATASHIELD +++ b/inst/DATASHIELD @@ -37,6 +37,7 @@ AggregateMethods: lmerSLMADS2, lsDS, matrixDetDS1, + mdPatternDS, meanDS, meanSdGpDS, messageDS,