FoxoTech · notmaurox · Apr 7, 2022 · May 10, 2022 · May 12, 2022 · May 12, 2022
diff --git a/methylprep/cli.py b/methylprep/cli.py
@@ -204,6 +204,14 @@ def cli_process(cmd_args):
         help='By default, any beta-values or m-values output will contain all probes. If True, those probes that fail the p-value signal:noise detection are replaced with NaNs in dataframes in beta_values and m_value output.'
     )
 
+    parser.add_argument(
+        '--pneg_ecdf',
+        required=False,
+        action='store_true',
+        default=False,
+        help='Calculates a pvalue for each probe against ECDF of negative control intensity and outputs as pNegECDF_values.pkl if export_poobah option specified. Scores also included in csv output.'
+    )
+
     parser.add_argument(
         '--export_poobah',
         required=False,
@@ -276,8 +284,9 @@ def cli_process(cmd_args):
         poobah=args.poobah,
         export_poobah=args.export_poobah,
         quality_mask=(not args.no_quality_mask),
-        sesame=(not args.minfi) # default 'sesame' method can be turned off using --minfi
-        )
+        sesame=(not args.minfi), # default 'sesame' method can be turned off using --minfi,
+        pneg_ecdf=args.pneg_ecdf
+    )
 
 
 def cli_beta_bakery(cmd_args):

diff --git a/methylprep/files/manifests.py b/methylprep/files/manifests.py
@@ -1,6 +1,6 @@
 # Lib
 import logging
-from pathlib import Path
+from pathlib import Path, PurePath
 from urllib.parse import urljoin
 import numpy as np
 import pandas as pd
@@ -20,8 +20,9 @@
 
 LOGGER = logging.getLogger(__name__)
 
-MANIFEST_DIR_NAME = '.methylprep_manifest_files'
-MANIFEST_DIR_PATH = f'~/{MANIFEST_DIR_NAME}'
+MANIFEST_DIR_NAME = 'methylprep_manifest_files/'
+# Download manifests into library so multiple users don't need their own copy
+MANIFEST_DIR_PATH = PurePath(Path(__file__).parent.resolve(), MANIFEST_DIR_NAME)
 MANIFEST_DIR_PATH_LAMBDA = f'/tmp/{MANIFEST_DIR_NAME}'
 MANIFEST_BUCKET_NAME = 'array-manifest-files'
 MANIFEST_REMOTE_PATH = f'https://s3.amazonaws.com/{MANIFEST_BUCKET_NAME}/'
@@ -107,25 +108,77 @@ class Manifest():
     __genome_df = None
     __probe_type_subsets = None # apparently not used anywhere in methylprep
 
-    def __init__(self, array_type, filepath_or_buffer=None, on_lambda=False, verbose=True):
+    def __init__(self, array_name: str, filepath_or_buffer=None, on_lambda=False, verbose=True):
+        self.array_name = array_name
         array_str_to_class = dict(zip(list(ARRAY_FILENAME.keys()), list(ARRAY_TYPE_MANIFEST_FILENAMES.keys())))
-        if array_type in array_str_to_class:
-            array_type = array_str_to_class[array_type]
-        self.array_type = array_type
+        # Map array_name as provided as a string to self.array_type which will be of type enum 'ArrayType'
+        if array_name in list(array_str_to_class.keys()):
+            self.array_type = array_str_to_class[array_name]
+        else:
+            self.array_type = ArrayType.CUSTOM
         self.on_lambda = on_lambda # changes filepath to /tmp for the read-only file system
         self.verbose = verbose
-
+        # If you don't pass in a filepath, Methylprep uses one of its own manifests which may need downloading
         if filepath_or_buffer is None:
-            filepath_or_buffer = self.download_default(array_type, self.on_lambda)
-
-        with get_file_object(filepath_or_buffer) as manifest_file:
-            self.__data_frame = self.read_probes(manifest_file)
-            self.__control_data_frame = self.read_control_probes(manifest_file)
-            self.__snp_data_frame = self.read_snp_probes(manifest_file)
-            if self.array_type == ArrayType.ILLUMINA_MOUSE:
-                self.__mouse_data_frame = self.read_mouse_probes(manifest_file)
-            else:
-                self.__mouse_data_frame = pd.DataFrame()
+            filepath_or_buffer = self.download_default(self.array_type, self.on_lambda)
+            LOGGER.info(f"Using Methylprep manifest {self.array_name} @ {filepath_or_buffer}")
+            # This approach to loading a manifest only makes sense if array type is configured in arrays.py
+            # which is only true for Methylprep manifests
+            with get_file_object(filepath_or_buffer) as manifest_file:
+                self.__data_frame = self.read_probes(manifest_file) # DF uses IlmnID as index
+                self.__control_data_frame = self.read_control_probes(manifest_file) # DF uses Address_ID as index
+                self.__snp_data_frame = self.read_snp_probes(manifest_file) # DF uses neither IlmnID nor Address as index?
+                if self.array_type == ArrayType.ILLUMINA_MOUSE:
+                    self.__mouse_data_frame = self.read_mouse_probes(manifest_file)
+                else:
+                    self.__mouse_data_frame = pd.DataFrame()
+        else:
+            LOGGER.info(f"Using CUSTOM manifest {self.array_name} @ {filepath_or_buffer}")
+            self._load_custom_manifest(filepath_or_buffer)
+
+        LOGGER.info(f"""Manifest has {str(len(self.__data_frame))} probes (where {str(len(self.__snp_data_frame))} are snps) and {str(len(self.__control_data_frame))} control probes""")
+
+    def _load_custom_manifest(self, custom_manifest_path):
+        optional = ['OLD_CHR', 'OLD_Strand', 'OLD_Genome_Build', 'OLD_MAPINFO']
+        minimum_cols = [col for col in self.columns if col not in optional]
+
+        with get_file_object(custom_manifest_path) as manifest_file:
+            file_df = pd.read_csv(manifest_file, comment='[', low_memory=False)
+            missing_cols = [col for col in minimum_cols if col not in (file_df.columns.to_list()+[file_df.index.name])]
+            if missing_cols != []:
+                raise ValueError(f"Custom manifest {custom_manifest_path} missing {','.join(missing_cols)} columns")
+            # Controls in manifests follow column order of 'Address_ID', 'Control_Type', 'Color', 'Extended_Type',
+            # After row starting with [Controls],,,, 
+            # Non control probes follow column order of 'IlmnID', 'Name', 'AddressA_ID', 'AddressB_ID'...
+            # This means that after loading the file, any row where AddressB_ID is a string is a control probe
+            control_probes = file_df[file_df["AddressB_ID"].str.contains('[^0-9.]', regex=True, na=False)]
+            control_probes = control_probes[["IlmnID", "Name", "AddressA_ID", "AddressB_ID"]]
+            self.__control_data_frame = control_probes.rename(
+                columns={
+                    "IlmnID": "Address_ID",
+                    "Name": "Control_Type",
+                    "AddressA_ID": "Color",
+                    "AddressB_ID": "Extended_Type"
+                }
+            )
+            # Snps have IlmnID with leading rs
+            self.__snp_data_frame = file_df[file_df["IlmnID"].str.match('rs', na=False)].astype(
+                {'AddressA_ID':'float64', 'AddressB_ID':'float64'})
+            # At this point self.__data_frame should be any probe that is not a control
+            self.__data_frame = file_df.drop(self.__control_data_frame.index, axis=0).astype(
+                {'AddressA_ID':'float64', 'AddressB_ID':'float64'})
+            self.__data_frame = self.__data_frame.set_index('IlmnID')
+            # Methylprep wants index of control df to be Address_ID
+            self.__control_data_frame = self.__control_data_frame.astype({'Address_ID': 'int64'})
+            self.__control_data_frame = self.__control_data_frame.set_index('Address_ID')
+            # Do transformation on Infinim_Design_Type column 
+            vectorized_get_type = np.vectorize(self.get_probe_type)
+            self.__data_frame['probe_type'] = vectorized_get_type(
+                self.__data_frame.index.values,
+                self.__data_frame['Infinium_Design_Type'].values,
+            )
+            # IDK it seems like methylsuite wants this variable avaialable?
+            self.__mouse_data_frame = pd.DataFrame()
 
     @property
     def columns(self):
@@ -150,6 +203,15 @@ def snp_data_frame(self):
     def mouse_data_frame(self):
         return self.__mouse_data_frame
 
+    @staticmethod
+    def get_probe_type(name, infinium_type):
+        """returns one of (I, II, SnpI, SnpII, Control)
+
+        .from_manifest_values() returns probe type using either the Infinium_Design_Type (I or II) or the name
+        (starts with 'rs' == SnpI) and 'Control' is none of the above."""
+        probe_type = ProbeType.from_manifest_values(name, infinium_type)
+        return probe_type.value
+
     @staticmethod
     def download_default(array_type, on_lambda=False):
         """Downloads the appropriate manifest file if one does not already exist.
@@ -224,16 +286,7 @@ def read_probes(self, manifest_file):
         # AddressB_ID in manifest includes NaNs and INTs and becomes floats, which breaks. forcing back here.
         #data_frame['AddressB_ID'] = data_frame['AddressB_ID'].astype('Int64') # converts floats to ints; leaves NaNs inplace
         # TURNS out, int or float both work for manifests. NOT the source of the error with mouse.
-
-        def get_probe_type(name, infinium_type):
-            """returns one of (I, II, SnpI, SnpII, Control)
-
-            .from_manifest_values() returns probe type using either the Infinium_Design_Type (I or II) or the name
-            (starts with 'rs' == SnpI) and 'Control' is none of the above."""
-            probe_type = ProbeType.from_manifest_values(name, infinium_type)
-            return probe_type.value
-
-        vectorized_get_type = np.vectorize(get_probe_type)
+        vectorized_get_type = np.vectorize(self.get_probe_type)
         data_frame['probe_type'] = vectorized_get_type(
             data_frame.index.values,
             data_frame['Infinium_Design_Type'].values,
@@ -254,7 +307,7 @@ def read_control_probes(self, manifest_file):
             index_col=0, # illumina_id, not IlmnID here
             names=CONTROL_COLUMNS, # this gives these columns new names, because they have none. loading stuff at end of CSV after probes end.
             nrows=self.array_type.num_controls,
-            skiprows=self.array_type.num_probes +1, #without the +1, it includes the last cpg probe in controls and breaks stuff.
+            skiprows=self.array_type.num_probes+1, #without the +1, it includes the last cpg probe in controls and breaks stuff.
             usecols=range(len(CONTROL_COLUMNS)),
         )
 
@@ -283,29 +336,6 @@ def read_mouse_probes(self, manifest_file):
         mouse_df = mouse_df[(mouse_df['design'] == 'Multi') | (mouse_df['design'] == 'Random')]
         return mouse_df
 
-    """ NEVER CALLED ANYWHERE - belongs in methylize
-    def map_to_genome(self, data_frame):
-        genome_df = self.get_genome_data()
-        merged_df = inner_join_data(data_frame, genome_df)
-        return merged_df
-
-    def get_genome_data(self, build=None):
-        if self.__genome_df is not None:
-            return self.__genome_df
-
-        LOGGER.info('Building genome data frame')
-        # new in version 1.5.6: support for both new and old genomes
-        GENOME_COLUMNS = (
-            'Genome_Build',
-            'CHR',
-            'MAPINFO',
-            'Strand',
-        ) # PLUS four more optional columns with OLD_ prefix (for prev genome build)
-        genome_columns = GENOME_COLUMNS + ['OLD_'+col for col in GENOME_COLUMNS]
-        self.__genome_df = self.data_frame[genome_columns]
-        return self.__genome_df
-    """
-
     def get_data_types(self):
         data_types = {
             key: str for key in self.columns
@@ -330,4 +360,4 @@ def get_probe_details(self, probe_type, channel=None):
             return data_frame[probe_type_mask]
 
         channel_mask = data_frame['Color_Channel'].values == channel.value
-        return data_frame[probe_type_mask & channel_mask]
+        return data_frame[probe_type_mask & channel_mask]
diff --git a/methylprep/models/arrays.py b/methylprep/models/arrays.py
@@ -43,7 +43,8 @@ def from_probe_count(cls, probe_count):
             LOGGER.warning(f'Probe count ({probe_count}) falls outside of normal range. Setting to newest array type: EPIC')
             return cls.ILLUMINA_EPIC
 
-        raise ValueError(f'Unknown array type: ({probe_count} probes detected)')
+        LOGGER.warning(f'Unknown array type: ({probe_count} probes detected) - using type CUSTOM')
+        return cls.CUSTOM
 
     @property
     def num_probes(self):

diff --git a/methylprep/processing/p_value_probe_detection.py b/methylprep/processing/p_value_probe_detection.py
@@ -66,6 +66,33 @@ def _pval_sesame_preprocess(data_container, combine_neg=True):
     pval = pd.concat([pIR,pIG,pII])
     return pval
 
+def _pval_neg_ecdf(data_container):
+    dfR = data_container.ctrl_red
+    dfR = dfR[dfR['Control_Type']=='NEGATIVE']
+    dfR = dfR[['Extended_Type','mean_value']]
+    funcR = ECDF(dfR['mean_value'].values)
+    dfG = data_container.ctrl_green
+    dfG = dfG[dfG['Control_Type']=='NEGATIVE']
+    dfG = dfG[['Extended_Type','mean_value']]
+    funcG = ECDF(dfG['mean_value'].values)
+    pIR = pd.DataFrame(
+        index=data_container.IR.index,
+        data=1-np.maximum(funcR(data_container.IR['Meth']), funcR(data_container.IR['Unmeth'])),
+        columns=['pNegECDF_pval'])
+    pIG = pd.DataFrame(
+        index=data_container.IG.index,
+        data=1-np.maximum(funcG(data_container.IG['Meth']), funcG(data_container.IG['Unmeth'])),
+        columns=['pNegECDF_pval'])
+    pII = pd.DataFrame(
+        index=data_container.II.index,
+        data=1-np.maximum(funcG(data_container.II['Meth']), funcR(data_container.II['Unmeth'])),
+        columns=['pNegECDF_pval'])
+    # concat and sort
+    pval = pd.concat([pIR, pIG, pII])
+    return pval
+
+
+
 """ DEPRECATED FUNCTIONS (<v1.5.0)
 def detect_probes(data_containers, method='sesame', save=False, silent=True):
     '''