From 43d844f28fc2175e971b129b5e065073916fffa1 Mon Sep 17 00:00:00 2001
From: "Chavez, Mauro" <mchavez1@illumina.com>
Date: Wed, 6 Apr 2022 17:55:32 -0700
Subject: [PATCH 1/4] custom manifest support

---
 methylprep/files/manifests.py     | 136 ++++++++++++++++++------------
 methylprep/models/arrays.py       |   3 +-
 methylprep/processing/pipeline.py |   9 +-
 3 files changed, 92 insertions(+), 56 deletions(-)

diff --git a/methylprep/files/manifests.py b/methylprep/files/manifests.py
index d51a416..b6285bf 100644
--- a/methylprep/files/manifests.py
+++ b/methylprep/files/manifests.py
@@ -1,6 +1,6 @@
 # Lib
 import logging
-from pathlib import Path
+from pathlib import Path, PurePath
 from urllib.parse import urljoin
 import numpy as np
 import pandas as pd
@@ -20,8 +20,9 @@
 
 LOGGER = logging.getLogger(__name__)
 
-MANIFEST_DIR_NAME = '.methylprep_manifest_files'
-MANIFEST_DIR_PATH = f'~/{MANIFEST_DIR_NAME}'
+MANIFEST_DIR_NAME = 'methylprep_manifest_files/'
+# Download manifests into library so multiple users don't need their own copy
+MANIFEST_DIR_PATH = PurePath(Path(__file__).parent.resolve(), MANIFEST_DIR_NAME)
 MANIFEST_DIR_PATH_LAMBDA = f'/tmp/{MANIFEST_DIR_NAME}'
 MANIFEST_BUCKET_NAME = 'array-manifest-files'
 MANIFEST_REMOTE_PATH = f'https://s3.amazonaws.com/{MANIFEST_BUCKET_NAME}/'
@@ -107,25 +108,77 @@ class Manifest():
     __genome_df = None
     __probe_type_subsets = None # apparently not used anywhere in methylprep
 
-    def __init__(self, array_type, filepath_or_buffer=None, on_lambda=False, verbose=True):
+    def __init__(self, array_name: str, filepath_or_buffer=None, on_lambda=False, verbose=True):
+        self.array_name = array_name
         array_str_to_class = dict(zip(list(ARRAY_FILENAME.keys()), list(ARRAY_TYPE_MANIFEST_FILENAMES.keys())))
-        if array_type in array_str_to_class:
-            array_type = array_str_to_class[array_type]
-        self.array_type = array_type
+        # Map array_name as provided as a string to self.array_type which will be of type enum 'ArrayType'
+        if array_name in list(array_str_to_class.keys()):
+            self.array_type = array_str_to_class[array_name]
+        else:
+            self.array_type = ArrayType.CUSTOM
         self.on_lambda = on_lambda # changes filepath to /tmp for the read-only file system
         self.verbose = verbose
-
+        # If you don't pass in a filepath, Methylprep uses one of its own manifests which may need downloading
         if filepath_or_buffer is None:
-            filepath_or_buffer = self.download_default(array_type, self.on_lambda)
-
-        with get_file_object(filepath_or_buffer) as manifest_file:
-            self.__data_frame = self.read_probes(manifest_file)
-            self.__control_data_frame = self.read_control_probes(manifest_file)
-            self.__snp_data_frame = self.read_snp_probes(manifest_file)
-            if self.array_type == ArrayType.ILLUMINA_MOUSE:
-                self.__mouse_data_frame = self.read_mouse_probes(manifest_file)
-            else:
-                self.__mouse_data_frame = pd.DataFrame()
+            filepath_or_buffer = self.download_default(self.array_type, self.on_lambda)
+            LOGGER.info(f"Using Methylprep manifest {self.array_name} @ {filepath_or_buffer}")
+            # This approach to loading a manifest only makes sense if array type is configured in arrays.py
+            # which is only true for Methylprep manifests
+            with get_file_object(filepath_or_buffer) as manifest_file:
+                self.__data_frame = self.read_probes(manifest_file) # DF uses IlmnID as index
+                self.__control_data_frame = self.read_control_probes(manifest_file) # DF uses Address_ID as index
+                self.__snp_data_frame = self.read_snp_probes(manifest_file) # DF uses neither IlmnID nor Address as index?
+                if self.array_type == ArrayType.ILLUMINA_MOUSE:
+                    self.__mouse_data_frame = self.read_mouse_probes(manifest_file)
+                else:
+                    self.__mouse_data_frame = pd.DataFrame()
+        else:
+            LOGGER.info(f"Using CUSTOM manifest {self.array_name} @ {filepath_or_buffer}")
+            self._load_custom_manifest(filepath_or_buffer)
+
+        LOGGER.info(f"""Manifest has {str(len(self.__data_frame))} probes (where {str(len(self.__snp_data_frame))} are snps) and {str(len(self.__control_data_frame))} control probes""")
+
+    def _load_custom_manifest(self, custom_manifest_path):
+        optional = ['OLD_CHR', 'OLD_Strand', 'OLD_Genome_Build', 'OLD_MAPINFO']
+        minimum_cols = [col for col in self.columns if col not in optional]
+
+        with get_file_object(custom_manifest_path) as manifest_file:
+            file_df = pd.read_csv(manifest_file, comment='[', low_memory=False)
+            missing_cols = [col for col in minimum_cols if col not in (file_df.columns.to_list()+[file_df.index.name])]
+            if missing_cols != []:
+                raise ValueError(f"Custom manifest {custom_manifest_path} missing {','.join(missing_cols)} columns")
+            # Controls in manifests follow column order of 'Address_ID', 'Control_Type', 'Color', 'Extended_Type',
+            # After row starting with [Controls],,,, 
+            # Non control probes follow column order of 'IlmnID', 'Name', 'AddressA_ID', 'AddressB_ID'...
+            # This means that after loading the file, any row where AddressB_ID is a string is a control probe
+            control_probes = file_df[file_df["AddressB_ID"].str.contains('[^0-9.]', regex=True, na=False)]
+            control_probes = control_probes[["IlmnID", "Name", "AddressA_ID", "AddressB_ID"]]
+            self.__control_data_frame = control_probes.rename(
+                columns={
+                    "IlmnID": "Address_ID",
+                    "Name": "Control_Type",
+                    "AddressA_ID": "Color",
+                    "AddressB_ID": "Extended_Type"
+                }
+            )
+            # Snps have IlmnID with leading rs
+            self.__snp_data_frame = file_df[file_df["IlmnID"].str.match('rs', na=False)].astype(
+                {'AddressA_ID':'float64', 'AddressB_ID':'float64'})
+            # At this point self.__data_frame should be any probe that is not a control
+            self.__data_frame = file_df.drop(self.__control_data_frame.index, axis=0).astype(
+                {'AddressA_ID':'float64', 'AddressB_ID':'float64'})
+            self.__data_frame = self.__data_frame.set_index('IlmnID')
+            # Methylprep wants index of control df to be Address_ID
+            self.__control_data_frame = self.__control_data_frame.astype({'Address_ID': 'int64'})
+            self.__control_data_frame = self.__control_data_frame.set_index('Address_ID')
+            # Do transformation on Infinim_Design_Type column 
+            vectorized_get_type = np.vectorize(self.get_probe_type)
+            self.__data_frame['probe_type'] = vectorized_get_type(
+                self.__data_frame.index.values,
+                self.__data_frame['Infinium_Design_Type'].values,
+            )
+            # IDK it seems like methylsuite wants this variable avaialable?
+            self.__mouse_data_frame = pd.DataFrame()
 
     @property
     def columns(self):
@@ -150,6 +203,15 @@ def snp_data_frame(self):
     def mouse_data_frame(self):
         return self.__mouse_data_frame
 
+    @staticmethod
+    def get_probe_type(name, infinium_type):
+        """returns one of (I, II, SnpI, SnpII, Control)
+
+        .from_manifest_values() returns probe type using either the Infinium_Design_Type (I or II) or the name
+        (starts with 'rs' == SnpI) and 'Control' is none of the above."""
+        probe_type = ProbeType.from_manifest_values(name, infinium_type)
+        return probe_type.value
+
     @staticmethod
     def download_default(array_type, on_lambda=False):
         """Downloads the appropriate manifest file if one does not already exist.
@@ -224,16 +286,7 @@ def read_probes(self, manifest_file):
         # AddressB_ID in manifest includes NaNs and INTs and becomes floats, which breaks. forcing back here.
         #data_frame['AddressB_ID'] = data_frame['AddressB_ID'].astype('Int64') # converts floats to ints; leaves NaNs inplace
         # TURNS out, int or float both work for manifests. NOT the source of the error with mouse.
-
-        def get_probe_type(name, infinium_type):
-            """returns one of (I, II, SnpI, SnpII, Control)
-
-            .from_manifest_values() returns probe type using either the Infinium_Design_Type (I or II) or the name
-            (starts with 'rs' == SnpI) and 'Control' is none of the above."""
-            probe_type = ProbeType.from_manifest_values(name, infinium_type)
-            return probe_type.value
-
-        vectorized_get_type = np.vectorize(get_probe_type)
+        vectorized_get_type = np.vectorize(self.get_probe_type)
         data_frame['probe_type'] = vectorized_get_type(
             data_frame.index.values,
             data_frame['Infinium_Design_Type'].values,
@@ -254,7 +307,7 @@ def read_control_probes(self, manifest_file):
             index_col=0, # illumina_id, not IlmnID here
             names=CONTROL_COLUMNS, # this gives these columns new names, because they have none. loading stuff at end of CSV after probes end.
             nrows=self.array_type.num_controls,
-            skiprows=self.array_type.num_probes +1, #without the +1, it includes the last cpg probe in controls and breaks stuff.
+            skiprows=self.array_type.num_probes+1, #without the +1, it includes the last cpg probe in controls and breaks stuff.
             usecols=range(len(CONTROL_COLUMNS)),
         )
 
@@ -283,29 +336,6 @@ def read_mouse_probes(self, manifest_file):
         mouse_df = mouse_df[(mouse_df['design'] == 'Multi') | (mouse_df['design'] == 'Random')]
         return mouse_df
 
-    """ NEVER CALLED ANYWHERE - belongs in methylize
-    def map_to_genome(self, data_frame):
-        genome_df = self.get_genome_data()
-        merged_df = inner_join_data(data_frame, genome_df)
-        return merged_df
-
-    def get_genome_data(self, build=None):
-        if self.__genome_df is not None:
-            return self.__genome_df
-
-        LOGGER.info('Building genome data frame')
-        # new in version 1.5.6: support for both new and old genomes
-        GENOME_COLUMNS = (
-            'Genome_Build',
-            'CHR',
-            'MAPINFO',
-            'Strand',
-        ) # PLUS four more optional columns with OLD_ prefix (for prev genome build)
-        genome_columns = GENOME_COLUMNS + ['OLD_'+col for col in GENOME_COLUMNS]
-        self.__genome_df = self.data_frame[genome_columns]
-        return self.__genome_df
-    """
-
     def get_data_types(self):
         data_types = {
             key: str for key in self.columns
@@ -330,4 +360,4 @@ def get_probe_details(self, probe_type, channel=None):
             return data_frame[probe_type_mask]
 
         channel_mask = data_frame['Color_Channel'].values == channel.value
-        return data_frame[probe_type_mask & channel_mask]
+        return data_frame[probe_type_mask & channel_mask]
\ No newline at end of file
diff --git a/methylprep/models/arrays.py b/methylprep/models/arrays.py
index e54738d..4d77417 100644
--- a/methylprep/models/arrays.py
+++ b/methylprep/models/arrays.py
@@ -43,7 +43,8 @@ def from_probe_count(cls, probe_count):
             LOGGER.warning(f'Probe count ({probe_count}) falls outside of normal range. Setting to newest array type: EPIC')
             return cls.ILLUMINA_EPIC
 
-        raise ValueError(f'Unknown array type: ({probe_count} probes detected)')
+        LOGGER.warning(f'Unknown array type: ({probe_count} probes detected) - using type CUSTOM')
+        return cls.CUSTOM
 
     @property
     def num_probes(self):
diff --git a/methylprep/processing/pipeline.py b/methylprep/processing/pipeline.py
index 33efd2b..4a2929c 100644
--- a/methylprep/processing/pipeline.py
+++ b/methylprep/processing/pipeline.py
@@ -307,13 +307,18 @@ def run_pipeline(data_dir, array_type=None, export=False, manifest_filepath=None
     # 200 samples still uses 4.8GB of memory/disk space (float64)
     missing_probe_errors = {'noob': [], 'raw':[]}
 
+    manifest = None
     for batch_num, batch in enumerate(batches, 1):
         idat_datasets = parse_sample_sheet_into_idat_datasets(sample_sheet, sample_name=batch, from_s3=None, meta_only=False) # replaces get_raw_datasets
         # idat_datasets are a list; each item is a dict of {'green_idat': ..., 'red_idat':..., 'array_type', 'sample'} to feed into SigSet
         #--- pre v1.5 --- raw_datasets = get_raw_datasets(sample_sheet, sample_name=batch)
         if array_type is None: # use must provide either the array_type or manifest_filepath.
-            array_type = get_array_type(idat_datasets)
-        manifest = Manifest(array_type, manifest_filepath) # this allows each batch to be a different array type; but not implemented yet. common with older GEO sets.
+            array_type = get_array_type(idat_datasets).value
+        # Only load a manifest if one hasn't been loaded or we are processing a sample with manifest different from previous ones
+        if manifest is None:
+            manifest = Manifest(array_type, manifest_filepath)
+        elif manifest.array_type != array_type:
+            manifest = Manifest(array_type, manifest_filepath)
 
         batch_data_containers = []
         export_paths = set() # inform CLI user where to look

From a6ae5333396d5018cac64e225584ba925c909f1a Mon Sep 17 00:00:00 2001
From: "Chavez, Mauro" <mchavez1@illumina.com>
Date: Tue, 10 May 2022 12:54:12 -0700
Subject: [PATCH 2/4] Update pOOBAH pval calculation

---
 .../processing/p_value_probe_detection.py     | 31 ++++++++++++++++---
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/methylprep/processing/p_value_probe_detection.py b/methylprep/processing/p_value_probe_detection.py
index 449cc88..26e138e 100644
--- a/methylprep/processing/p_value_probe_detection.py
+++ b/methylprep/processing/p_value_probe_detection.py
@@ -9,7 +9,7 @@
 import numpy as np
 
 
-def _pval_sesame_preprocess(data_container):
+def _pval_sesame_preprocess(data_container, combine_neg=True):
     """Performs p-value detection of low signal/noise probes. This ONE SAMPLE version uses meth/unmeth before it is contructed into a _SampleDataContainer__data_frame.
     - returns a dataframe of probes and their detected p-value levels.
     - this will be saved to the csv output, so it can be used to drop probes at later step.
@@ -17,9 +17,32 @@ def _pval_sesame_preprocess(data_container):
     - called by pipeline CLI --poobah option.
     - confirmed that this version produces identical results to the pre-v1.5.0 version on 2021-06-16
     """
-    # 2021-03-22 assumed 'mean_value' for red and green MEANT meth and unmeth (OOBS), respectively.
-    funcG = ECDF(data_container.oobG['Unmeth'].values)
-    funcR = ECDF(data_container.oobR['Meth'].values)
+    # oob[G/R]['Unmeth'] is the out of band signal from the probe capturing the unmethylated state
+    # oob[G/R]['Meth'] is the out of band signal from the probe capturing the methylated state
+    bgG = (
+        list(data_container.oobG['Unmeth'].values)
+        + list(data_container.oobG['Meth'].values)
+    )
+    bgR = (
+        list(data_container.oobR['Unmeth'].values)
+        + list(data_container.oobR['Meth'].values)
+    )
+    # SeSAMe by default includes negative controls as a part of the background green and red intensities
+    if combine_neg:
+        # Add green signal from negative controls to background green intensities
+        dfG = data_container.ctrl_green
+        dfG = dfG[dfG['Control_Type']=='NEGATIVE']
+        dfG = dfG[['Extended_Type','mean_value']]
+        bgG += list(dfG['mean_value'].values)
+        # Add reg signal from negative controls to background red intensitites
+        dfR = data_container.ctrl_red
+        dfR = dfR[dfR['Control_Type']=='NEGATIVE']
+        dfR = dfR[['Extended_Type','mean_value']]
+        bgR += list(dfR['mean_value'].values)
+    # Build empirical cumulative distribution functions
+    funcG = ECDF(bgG)
+    funcR = ECDF(bgR)
+    # Apply function of background red intensity to red probes and background green intensity to green probes
     pIR = pd.DataFrame(
         index=data_container.IR.index,
         data=1-np.maximum(funcR(data_container.IR['Meth']), funcR(data_container.IR['Unmeth'])),

From 2e6d016413d735fbbaa612d681224591b882dec5 Mon Sep 17 00:00:00 2001
From: Mauro Chavez <mauro.antoine.chavez@gmail.com>
Date: Thu, 12 May 2022 10:31:20 -0700
Subject: [PATCH 3/4] added feature to output negative control based pvalue -
 pNegECDF_pval

---
 methylprep/cli.py                             | 13 +++++++--
 .../processing/p_value_probe_detection.py     | 27 +++++++++++++++++++
 methylprep/processing/pipeline.py             | 27 ++++++++++++++++---
 3 files changed, 62 insertions(+), 5 deletions(-)

diff --git a/methylprep/cli.py b/methylprep/cli.py
index 5c40a2c..5d3b47c 100644
--- a/methylprep/cli.py
+++ b/methylprep/cli.py
@@ -204,6 +204,14 @@ def cli_process(cmd_args):
         help='By default, any beta-values or m-values output will contain all probes. If True, those probes that fail the p-value signal:noise detection are replaced with NaNs in dataframes in beta_values and m_value output.'
     )
 
+    parser.add_argument(
+        '--pneg_ecdf',
+        required=False,
+        action='store_true',
+        default=False,
+        help='Calculates a pvalue for each probe against ECDF of negative control intensity and outputs as pNegECDF_values.pkl if export_poobah option specified. Scores also included in csv output.'
+    )
+
     parser.add_argument(
         '--export_poobah',
         required=False,
@@ -276,8 +284,9 @@ def cli_process(cmd_args):
         poobah=args.poobah,
         export_poobah=args.export_poobah,
         quality_mask=(not args.no_quality_mask),
-        sesame=(not args.minfi) # default 'sesame' method can be turned off using --minfi
-        )
+        sesame=(not args.minfi), # default 'sesame' method can be turned off using --minfi,
+        pneg_ecdf=args.pneg_ecdf
+    )
 
 
 def cli_beta_bakery(cmd_args):
diff --git a/methylprep/processing/p_value_probe_detection.py b/methylprep/processing/p_value_probe_detection.py
index 26e138e..a83a675 100644
--- a/methylprep/processing/p_value_probe_detection.py
+++ b/methylprep/processing/p_value_probe_detection.py
@@ -59,6 +59,33 @@ def _pval_sesame_preprocess(data_container, combine_neg=True):
     pval = pd.concat([pIR,pIG,pII])
     return pval
 
+def _pval_neg_ecdf(data_container):
+    dfR = data_container.ctrl_red
+    dfR = dfR[dfR['Control_Type']=='NEGATIVE']
+    dfR = dfR[['Extended_Type','mean_value']]
+    funcR = ECDF(dfR['mean_value'].values)
+    dfG = data_container.ctrl_green
+    dfG = dfG[dfG['Control_Type']=='NEGATIVE']
+    dfG = dfG[['Extended_Type','mean_value']]
+    funcG = ECDF(dfG['mean_value'].values)
+    pIR = pd.DataFrame(
+        index=data_container.IR.index,
+        data=1-np.maximum(funcR(data_container.IR['Meth']), funcR(data_container.IR['Unmeth'])),
+        columns=['pNegECDF_pval'])
+    pIG = pd.DataFrame(
+        index=data_container.IG.index,
+        data=1-np.maximum(funcG(data_container.IG['Meth']), funcG(data_container.IG['Unmeth'])),
+        columns=['pNegECDF_pval'])
+    pII = pd.DataFrame(
+        index=data_container.II.index,
+        data=1-np.maximum(funcG(data_container.II['Meth']), funcR(data_container.II['Unmeth'])),
+        columns=['pNegECDF_pval'])
+    # concat and sort
+    pval = pd.concat([pIR, pIG, pII])
+    return pval
+
+
+
 """ DEPRECATED FUNCTIONS (<v1.5.0)
 def detect_probes(data_containers, method='sesame', save=False, silent=True):
     '''
diff --git a/methylprep/processing/pipeline.py b/methylprep/processing/pipeline.py
index 33efd2b..23b339d 100644
--- a/methylprep/processing/pipeline.py
+++ b/methylprep/processing/pipeline.py
@@ -29,7 +29,7 @@
 )
 from ..utils import ensure_directory_exists, is_file_like
 from .preprocess import preprocess_noob, _apply_sesame_quality_mask
-from .p_value_probe_detection import _pval_sesame_preprocess
+from .p_value_probe_detection import _pval_sesame_preprocess, _pval_neg_ecdf
 from .infer_channel_switch import infer_type_I_probes
 from .dye_bias import nonlinear_dye_bias_correction
 from .multi_array_idat_batches import check_array_folders
@@ -46,7 +46,7 @@ def run_pipeline(data_dir, array_type=None, export=False, manifest_filepath=None
                  save_uncorrected=False, save_control=True, meta_data_frame=True,
                  bit='float32', poobah=False, export_poobah=False,
                  poobah_decimals=3, poobah_sig=0.05, low_memory=True,
-                 sesame=True, quality_mask=None, **kwargs):
+                 sesame=True, quality_mask=None, pneg_ecdf=False, **kwargs):
     """The main CLI processing pipeline. This does every processing step and returns a data set.
 
     Required Arguments:
@@ -332,6 +332,7 @@ def run_pipeline(data_dir, array_type=None, export=False, manifest_filepath=None
                 do_nonlinear_dye_bias=do_nonlinear_dye_bias, # start of run_pipeline sets this to True, False, or None
                 debug=kwargs.get('debug',False),
                 sesame=sesame,
+                pneg_ecdf=pneg_ecdf
             )
             data_container.process_all()
 
@@ -476,6 +477,19 @@ def run_pipeline(data_dir, array_type=None, export=False, manifest_filepath=None
                 df = df.sort_index().reindex(sorted(df.columns), axis=1)
                 pd.to_pickle(df, Path(data_dir,pkl_name))
                 LOGGER.info(f"saved {pkl_name}")
+            if all(['pNegECDF_pval' in e._SampleDataContainer__data_frame.columns for e in batch_data_containers]):
+                # this option will save a pickled dataframe of the negative control based pvalues for all samples, with
+                # sample_ids in the column headings and probe names in index.
+                df = consolidate_values_for_sheet(batch_data_containers, postprocess_func_colname='pNegECDF_pval', bit=bit, poobah=False, poobah_sig=poobah_sig, exclude_rs=True)
+                if not batch_size:
+                    pkl_name = 'pNegECDF_values.pkl'
+                else:
+                    pkl_name = f'pNegECDF_values_{batch_num}.pkl'
+                if df.shape[1] > df.shape[0]:
+                    df = df.transpose() # put probes as columns for faster loading.
+                df = df.sort_index().reindex(sorted(df.columns), axis=1)
+                pd.to_pickle(df, Path(data_dir,pkl_name))
+                LOGGER.info(f"saved {pkl_name}")
 
         # v1.3.0 fixing mem probs: pickling each batch_data_containers object then reloading it later.
         # consolidating data_containers this will break with really large sample sets, so skip here.
@@ -577,7 +591,7 @@ class SampleDataContainer(SigSet):
 
     def __init__(self, idat_dataset_pair, manifest=None, retain_uncorrected_probe_intensities=False,
                  bit='float32', pval=False, poobah_decimals=3, poobah_sig=0.05, do_noob=True,
-                 quality_mask=True, switch_probes=True, do_nonlinear_dye_bias=True, debug=False, sesame=True):
+                 quality_mask=True, switch_probes=True, do_nonlinear_dye_bias=True, debug=False, sesame=True, pneg_ecdf=False):
         self.debug = debug
         self.do_noob = do_noob
         self.pval = pval
@@ -591,6 +605,8 @@ def __init__(self, idat_dataset_pair, manifest=None, retain_uncorrected_probe_in
         self.sample = idat_dataset_pair['sample']
         self.retain_uncorrected_probe_intensities=retain_uncorrected_probe_intensities
         self.sesame = sesame # defines offsets in functions
+        # pneg_ecdf defines if negative control based pvalue is calculated - will use poobah_decimals for rounding
+        self.pneg_ecdf = pneg_ecdf
         self.data_type = 'float32' if bit == None else bit # options: (float64, float32, or float16)
         if debug:
             print(f'DEBUG SDC: sesame {self.sesame} switch {self.switch_probes} noob {self.do_noob} poobah {self.pval} mask {self.quality_mask}, dye {self.do_nonlinear_dye_bias}')
@@ -649,6 +665,7 @@ def process_all(self):
             return self.__data_frame
 
         pval_probes_df = _pval_sesame_preprocess(self) if self.pval == True else None
+        pneg_ecdf_probes_df = _pval_neg_ecdf(self) if self.pneg_ecdf == True else None
         # output: df with one column named 'poobah_pval'
         quality_mask_df = _apply_sesame_quality_mask(self) if self.quality_mask == True else None
         # output: df with one column named 'quality_mask' | if not supported array / custom array: returns nothing.
@@ -701,6 +718,10 @@ def process_all(self):
         if self.pval == True and isinstance(pval_probes_df, pd.DataFrame):
             pval_probes_df = pval_probes_df.loc[ ~pval_probes_df.index.duplicated() ]
             self.__data_frame = self.__data_frame.join(pval_probes_df, how='inner')
+        
+        if self.pneg_ecdf == True and isinstance(pneg_ecdf_probes_df, pd.DataFrame):
+            pneg_ecdf_probes_df = pneg_ecdf_probes_df.loc[ ~pneg_ecdf_probes_df.index.duplicated() ]
+            self.__data_frame = self.__data_frame.join(pneg_ecdf_probes_df, how='inner')
 
         self.check_for_probe_loss(f"preprocess_noob sesame={self.sesame} --> {self.methylated.shape} {self.unmethylated.shape}")
 

From ce40194fb954ff20ebf95a0df16e19e77a06e278 Mon Sep 17 00:00:00 2001
From: Mauro Chavez <mauro.antoine.chavez@gmail.com>
Date: Tue, 30 Aug 2022 16:07:55 -0700
Subject: [PATCH 4/4] Update to allow Mask column in custom manifest

Added the functionality to custom manifest support to pick up a "Mask" column and use that in probe masking the same way it does other arrays
---
 methylprep/processing/preprocess.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/methylprep/processing/preprocess.py b/methylprep/processing/preprocess.py
index f0ba878..3dec21f 100644
--- a/methylprep/processing/preprocess.py
+++ b/methylprep/processing/preprocess.py
@@ -269,14 +269,19 @@ def _apply_sesame_quality_mask(data_container):
         to use TCGA masking, only applies to HM450
 
     """
+    # If array type is custom
     if data_container.array_type not in (
         # ArrayType.ILLUMINA_27K,
         ArrayType.ILLUMINA_450K,
         ArrayType.ILLUMINA_EPIC,
         ArrayType.ILLUMINA_EPIC_PLUS,
         ArrayType.ILLUMINA_MOUSE):
-        LOGGER.info(f"Quality masking is not supported for {data_container.array_type}.")
-        return
+        # If custom array doesn't have a "Mask" column
+        if "Mask" not in data_container.man.columns:
+            LOGGER.info(
+                f"Quality masking is not supported for {data_container.array_type}. (not implemented/\"Mask\" column not present in custom manifest)"
+            )
+            return
     # load set of probes to remove from local file
     if data_container.array_type == ArrayType.ILLUMINA_450K:
         probes = qualityMask450
@@ -287,7 +292,13 @@ def _apply_sesame_quality_mask(data_container):
         probes = qualityMaskEPICPLUS
     elif data_container.array_type == ArrayType.ILLUMINA_MOUSE:
         probes = qualityMaskmouse
-
+    else:
+        data_container.man["Mask"].fillna("NA")
+        # Filter "mask" column to rows with True
+        mask_rows = data_container.man[data_container.man["Mask"].astype(str).str.lower() == "true"]
+        # Get indicies (probe names of masked probes)
+        probes = mask_rows.index.to_list()
+        LOGGER.info(f"Masking {str(len(probes))} probes")
     # v1.6+: the 1.0s are good probes and the 0.0 are probes to be excluded.
     cgs = pd.DataFrame( np.zeros((len(data_container.man.index), 1)), index=data_container.man.index, columns=['quality_mask'])
     cgs['quality_mask'] = 1.0