From 43d844f28fc2175e971b129b5e065073916fffa1 Mon Sep 17 00:00:00 2001 From: "Chavez, Mauro" Date: Wed, 6 Apr 2022 17:55:32 -0700 Subject: [PATCH 1/4] custom manifest support --- methylprep/files/manifests.py | 136 ++++++++++++++++++------------ methylprep/models/arrays.py | 3 +- methylprep/processing/pipeline.py | 9 +- 3 files changed, 92 insertions(+), 56 deletions(-) diff --git a/methylprep/files/manifests.py b/methylprep/files/manifests.py index d51a416..b6285bf 100644 --- a/methylprep/files/manifests.py +++ b/methylprep/files/manifests.py @@ -1,6 +1,6 @@ # Lib import logging -from pathlib import Path +from pathlib import Path, PurePath from urllib.parse import urljoin import numpy as np import pandas as pd @@ -20,8 +20,9 @@ LOGGER = logging.getLogger(__name__) -MANIFEST_DIR_NAME = '.methylprep_manifest_files' -MANIFEST_DIR_PATH = f'~/{MANIFEST_DIR_NAME}' +MANIFEST_DIR_NAME = 'methylprep_manifest_files/' +# Download manifests into library so multiple users don't need their own copy +MANIFEST_DIR_PATH = PurePath(Path(__file__).parent.resolve(), MANIFEST_DIR_NAME) MANIFEST_DIR_PATH_LAMBDA = f'/tmp/{MANIFEST_DIR_NAME}' MANIFEST_BUCKET_NAME = 'array-manifest-files' MANIFEST_REMOTE_PATH = f'https://s3.amazonaws.com/{MANIFEST_BUCKET_NAME}/' @@ -107,25 +108,77 @@ class Manifest(): __genome_df = None __probe_type_subsets = None # apparently not used anywhere in methylprep - def __init__(self, array_type, filepath_or_buffer=None, on_lambda=False, verbose=True): + def __init__(self, array_name: str, filepath_or_buffer=None, on_lambda=False, verbose=True): + self.array_name = array_name array_str_to_class = dict(zip(list(ARRAY_FILENAME.keys()), list(ARRAY_TYPE_MANIFEST_FILENAMES.keys()))) - if array_type in array_str_to_class: - array_type = array_str_to_class[array_type] - self.array_type = array_type + # Map array_name as provided as a string to self.array_type which will be of type enum 'ArrayType' + if array_name in list(array_str_to_class.keys()): + self.array_type = array_str_to_class[array_name] + else: + self.array_type = ArrayType.CUSTOM self.on_lambda = on_lambda # changes filepath to /tmp for the read-only file system self.verbose = verbose - + # If you don't pass in a filepath, Methylprep uses one of its own manifests which may need downloading if filepath_or_buffer is None: - filepath_or_buffer = self.download_default(array_type, self.on_lambda) - - with get_file_object(filepath_or_buffer) as manifest_file: - self.__data_frame = self.read_probes(manifest_file) - self.__control_data_frame = self.read_control_probes(manifest_file) - self.__snp_data_frame = self.read_snp_probes(manifest_file) - if self.array_type == ArrayType.ILLUMINA_MOUSE: - self.__mouse_data_frame = self.read_mouse_probes(manifest_file) - else: - self.__mouse_data_frame = pd.DataFrame() + filepath_or_buffer = self.download_default(self.array_type, self.on_lambda) + LOGGER.info(f"Using Methylprep manifest {self.array_name} @ {filepath_or_buffer}") + # This approach to loading a manifest only makes sense if array type is configured in arrays.py + # which is only true for Methylprep manifests + with get_file_object(filepath_or_buffer) as manifest_file: + self.__data_frame = self.read_probes(manifest_file) # DF uses IlmnID as index + self.__control_data_frame = self.read_control_probes(manifest_file) # DF uses Address_ID as index + self.__snp_data_frame = self.read_snp_probes(manifest_file) # DF uses neither IlmnID nor Address as index? + if self.array_type == ArrayType.ILLUMINA_MOUSE: + self.__mouse_data_frame = self.read_mouse_probes(manifest_file) + else: + self.__mouse_data_frame = pd.DataFrame() + else: + LOGGER.info(f"Using CUSTOM manifest {self.array_name} @ {filepath_or_buffer}") + self._load_custom_manifest(filepath_or_buffer) + + LOGGER.info(f"""Manifest has {str(len(self.__data_frame))} probes (where {str(len(self.__snp_data_frame))} are snps) and {str(len(self.__control_data_frame))} control probes""") + + def _load_custom_manifest(self, custom_manifest_path): + optional = ['OLD_CHR', 'OLD_Strand', 'OLD_Genome_Build', 'OLD_MAPINFO'] + minimum_cols = [col for col in self.columns if col not in optional] + + with get_file_object(custom_manifest_path) as manifest_file: + file_df = pd.read_csv(manifest_file, comment='[', low_memory=False) + missing_cols = [col for col in minimum_cols if col not in (file_df.columns.to_list()+[file_df.index.name])] + if missing_cols != []: + raise ValueError(f"Custom manifest {custom_manifest_path} missing {','.join(missing_cols)} columns") + # Controls in manifests follow column order of 'Address_ID', 'Control_Type', 'Color', 'Extended_Type', + # After row starting with [Controls],,,, + # Non control probes follow column order of 'IlmnID', 'Name', 'AddressA_ID', 'AddressB_ID'... + # This means that after loading the file, any row where AddressB_ID is a string is a control probe + control_probes = file_df[file_df["AddressB_ID"].str.contains('[^0-9.]', regex=True, na=False)] + control_probes = control_probes[["IlmnID", "Name", "AddressA_ID", "AddressB_ID"]] + self.__control_data_frame = control_probes.rename( + columns={ + "IlmnID": "Address_ID", + "Name": "Control_Type", + "AddressA_ID": "Color", + "AddressB_ID": "Extended_Type" + } + ) + # Snps have IlmnID with leading rs + self.__snp_data_frame = file_df[file_df["IlmnID"].str.match('rs', na=False)].astype( + {'AddressA_ID':'float64', 'AddressB_ID':'float64'}) + # At this point self.__data_frame should be any probe that is not a control + self.__data_frame = file_df.drop(self.__control_data_frame.index, axis=0).astype( + {'AddressA_ID':'float64', 'AddressB_ID':'float64'}) + self.__data_frame = self.__data_frame.set_index('IlmnID') + # Methylprep wants index of control df to be Address_ID + self.__control_data_frame = self.__control_data_frame.astype({'Address_ID': 'int64'}) + self.__control_data_frame = self.__control_data_frame.set_index('Address_ID') + # Do transformation on Infinim_Design_Type column + vectorized_get_type = np.vectorize(self.get_probe_type) + self.__data_frame['probe_type'] = vectorized_get_type( + self.__data_frame.index.values, + self.__data_frame['Infinium_Design_Type'].values, + ) + # IDK it seems like methylsuite wants this variable avaialable? + self.__mouse_data_frame = pd.DataFrame() @property def columns(self): @@ -150,6 +203,15 @@ def snp_data_frame(self): def mouse_data_frame(self): return self.__mouse_data_frame + @staticmethod + def get_probe_type(name, infinium_type): + """returns one of (I, II, SnpI, SnpII, Control) + + .from_manifest_values() returns probe type using either the Infinium_Design_Type (I or II) or the name + (starts with 'rs' == SnpI) and 'Control' is none of the above.""" + probe_type = ProbeType.from_manifest_values(name, infinium_type) + return probe_type.value + @staticmethod def download_default(array_type, on_lambda=False): """Downloads the appropriate manifest file if one does not already exist. @@ -224,16 +286,7 @@ def read_probes(self, manifest_file): # AddressB_ID in manifest includes NaNs and INTs and becomes floats, which breaks. forcing back here. #data_frame['AddressB_ID'] = data_frame['AddressB_ID'].astype('Int64') # converts floats to ints; leaves NaNs inplace # TURNS out, int or float both work for manifests. NOT the source of the error with mouse. - - def get_probe_type(name, infinium_type): - """returns one of (I, II, SnpI, SnpII, Control) - - .from_manifest_values() returns probe type using either the Infinium_Design_Type (I or II) or the name - (starts with 'rs' == SnpI) and 'Control' is none of the above.""" - probe_type = ProbeType.from_manifest_values(name, infinium_type) - return probe_type.value - - vectorized_get_type = np.vectorize(get_probe_type) + vectorized_get_type = np.vectorize(self.get_probe_type) data_frame['probe_type'] = vectorized_get_type( data_frame.index.values, data_frame['Infinium_Design_Type'].values, @@ -254,7 +307,7 @@ def read_control_probes(self, manifest_file): index_col=0, # illumina_id, not IlmnID here names=CONTROL_COLUMNS, # this gives these columns new names, because they have none. loading stuff at end of CSV after probes end. nrows=self.array_type.num_controls, - skiprows=self.array_type.num_probes +1, #without the +1, it includes the last cpg probe in controls and breaks stuff. + skiprows=self.array_type.num_probes+1, #without the +1, it includes the last cpg probe in controls and breaks stuff. usecols=range(len(CONTROL_COLUMNS)), ) @@ -283,29 +336,6 @@ def read_mouse_probes(self, manifest_file): mouse_df = mouse_df[(mouse_df['design'] == 'Multi') | (mouse_df['design'] == 'Random')] return mouse_df - """ NEVER CALLED ANYWHERE - belongs in methylize - def map_to_genome(self, data_frame): - genome_df = self.get_genome_data() - merged_df = inner_join_data(data_frame, genome_df) - return merged_df - - def get_genome_data(self, build=None): - if self.__genome_df is not None: - return self.__genome_df - - LOGGER.info('Building genome data frame') - # new in version 1.5.6: support for both new and old genomes - GENOME_COLUMNS = ( - 'Genome_Build', - 'CHR', - 'MAPINFO', - 'Strand', - ) # PLUS four more optional columns with OLD_ prefix (for prev genome build) - genome_columns = GENOME_COLUMNS + ['OLD_'+col for col in GENOME_COLUMNS] - self.__genome_df = self.data_frame[genome_columns] - return self.__genome_df - """ - def get_data_types(self): data_types = { key: str for key in self.columns @@ -330,4 +360,4 @@ def get_probe_details(self, probe_type, channel=None): return data_frame[probe_type_mask] channel_mask = data_frame['Color_Channel'].values == channel.value - return data_frame[probe_type_mask & channel_mask] + return data_frame[probe_type_mask & channel_mask] \ No newline at end of file diff --git a/methylprep/models/arrays.py b/methylprep/models/arrays.py index e54738d..4d77417 100644 --- a/methylprep/models/arrays.py +++ b/methylprep/models/arrays.py @@ -43,7 +43,8 @@ def from_probe_count(cls, probe_count): LOGGER.warning(f'Probe count ({probe_count}) falls outside of normal range. Setting to newest array type: EPIC') return cls.ILLUMINA_EPIC - raise ValueError(f'Unknown array type: ({probe_count} probes detected)') + LOGGER.warning(f'Unknown array type: ({probe_count} probes detected) - using type CUSTOM') + return cls.CUSTOM @property def num_probes(self): diff --git a/methylprep/processing/pipeline.py b/methylprep/processing/pipeline.py index 33efd2b..4a2929c 100644 --- a/methylprep/processing/pipeline.py +++ b/methylprep/processing/pipeline.py @@ -307,13 +307,18 @@ def run_pipeline(data_dir, array_type=None, export=False, manifest_filepath=None # 200 samples still uses 4.8GB of memory/disk space (float64) missing_probe_errors = {'noob': [], 'raw':[]} + manifest = None for batch_num, batch in enumerate(batches, 1): idat_datasets = parse_sample_sheet_into_idat_datasets(sample_sheet, sample_name=batch, from_s3=None, meta_only=False) # replaces get_raw_datasets # idat_datasets are a list; each item is a dict of {'green_idat': ..., 'red_idat':..., 'array_type', 'sample'} to feed into SigSet #--- pre v1.5 --- raw_datasets = get_raw_datasets(sample_sheet, sample_name=batch) if array_type is None: # use must provide either the array_type or manifest_filepath. - array_type = get_array_type(idat_datasets) - manifest = Manifest(array_type, manifest_filepath) # this allows each batch to be a different array type; but not implemented yet. common with older GEO sets. + array_type = get_array_type(idat_datasets).value + # Only load a manifest if one hasn't been loaded or we are processing a sample with manifest different from previous ones + if manifest is None: + manifest = Manifest(array_type, manifest_filepath) + elif manifest.array_type != array_type: + manifest = Manifest(array_type, manifest_filepath) batch_data_containers = [] export_paths = set() # inform CLI user where to look From a6ae5333396d5018cac64e225584ba925c909f1a Mon Sep 17 00:00:00 2001 From: "Chavez, Mauro" Date: Tue, 10 May 2022 12:54:12 -0700 Subject: [PATCH 2/4] Update pOOBAH pval calculation --- .../processing/p_value_probe_detection.py | 31 ++++++++++++++++--- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/methylprep/processing/p_value_probe_detection.py b/methylprep/processing/p_value_probe_detection.py index 449cc88..26e138e 100644 --- a/methylprep/processing/p_value_probe_detection.py +++ b/methylprep/processing/p_value_probe_detection.py @@ -9,7 +9,7 @@ import numpy as np -def _pval_sesame_preprocess(data_container): +def _pval_sesame_preprocess(data_container, combine_neg=True): """Performs p-value detection of low signal/noise probes. This ONE SAMPLE version uses meth/unmeth before it is contructed into a _SampleDataContainer__data_frame. - returns a dataframe of probes and their detected p-value levels. - this will be saved to the csv output, so it can be used to drop probes at later step. @@ -17,9 +17,32 @@ def _pval_sesame_preprocess(data_container): - called by pipeline CLI --poobah option. - confirmed that this version produces identical results to the pre-v1.5.0 version on 2021-06-16 """ - # 2021-03-22 assumed 'mean_value' for red and green MEANT meth and unmeth (OOBS), respectively. - funcG = ECDF(data_container.oobG['Unmeth'].values) - funcR = ECDF(data_container.oobR['Meth'].values) + # oob[G/R]['Unmeth'] is the out of band signal from the probe capturing the unmethylated state + # oob[G/R]['Meth'] is the out of band signal from the probe capturing the methylated state + bgG = ( + list(data_container.oobG['Unmeth'].values) + + list(data_container.oobG['Meth'].values) + ) + bgR = ( + list(data_container.oobR['Unmeth'].values) + + list(data_container.oobR['Meth'].values) + ) + # SeSAMe by default includes negative controls as a part of the background green and red intensities + if combine_neg: + # Add green signal from negative controls to background green intensities + dfG = data_container.ctrl_green + dfG = dfG[dfG['Control_Type']=='NEGATIVE'] + dfG = dfG[['Extended_Type','mean_value']] + bgG += list(dfG['mean_value'].values) + # Add reg signal from negative controls to background red intensitites + dfR = data_container.ctrl_red + dfR = dfR[dfR['Control_Type']=='NEGATIVE'] + dfR = dfR[['Extended_Type','mean_value']] + bgR += list(dfR['mean_value'].values) + # Build empirical cumulative distribution functions + funcG = ECDF(bgG) + funcR = ECDF(bgR) + # Apply function of background red intensity to red probes and background green intensity to green probes pIR = pd.DataFrame( index=data_container.IR.index, data=1-np.maximum(funcR(data_container.IR['Meth']), funcR(data_container.IR['Unmeth'])), From 2e6d016413d735fbbaa612d681224591b882dec5 Mon Sep 17 00:00:00 2001 From: Mauro Chavez Date: Thu, 12 May 2022 10:31:20 -0700 Subject: [PATCH 3/4] added feature to output negative control based pvalue - pNegECDF_pval --- methylprep/cli.py | 13 +++++++-- .../processing/p_value_probe_detection.py | 27 +++++++++++++++++++ methylprep/processing/pipeline.py | 27 ++++++++++++++++--- 3 files changed, 62 insertions(+), 5 deletions(-) diff --git a/methylprep/cli.py b/methylprep/cli.py index 5c40a2c..5d3b47c 100644 --- a/methylprep/cli.py +++ b/methylprep/cli.py @@ -204,6 +204,14 @@ def cli_process(cmd_args): help='By default, any beta-values or m-values output will contain all probes. If True, those probes that fail the p-value signal:noise detection are replaced with NaNs in dataframes in beta_values and m_value output.' ) + parser.add_argument( + '--pneg_ecdf', + required=False, + action='store_true', + default=False, + help='Calculates a pvalue for each probe against ECDF of negative control intensity and outputs as pNegECDF_values.pkl if export_poobah option specified. Scores also included in csv output.' + ) + parser.add_argument( '--export_poobah', required=False, @@ -276,8 +284,9 @@ def cli_process(cmd_args): poobah=args.poobah, export_poobah=args.export_poobah, quality_mask=(not args.no_quality_mask), - sesame=(not args.minfi) # default 'sesame' method can be turned off using --minfi - ) + sesame=(not args.minfi), # default 'sesame' method can be turned off using --minfi, + pneg_ecdf=args.pneg_ecdf + ) def cli_beta_bakery(cmd_args): diff --git a/methylprep/processing/p_value_probe_detection.py b/methylprep/processing/p_value_probe_detection.py index 26e138e..a83a675 100644 --- a/methylprep/processing/p_value_probe_detection.py +++ b/methylprep/processing/p_value_probe_detection.py @@ -59,6 +59,33 @@ def _pval_sesame_preprocess(data_container, combine_neg=True): pval = pd.concat([pIR,pIG,pII]) return pval +def _pval_neg_ecdf(data_container): + dfR = data_container.ctrl_red + dfR = dfR[dfR['Control_Type']=='NEGATIVE'] + dfR = dfR[['Extended_Type','mean_value']] + funcR = ECDF(dfR['mean_value'].values) + dfG = data_container.ctrl_green + dfG = dfG[dfG['Control_Type']=='NEGATIVE'] + dfG = dfG[['Extended_Type','mean_value']] + funcG = ECDF(dfG['mean_value'].values) + pIR = pd.DataFrame( + index=data_container.IR.index, + data=1-np.maximum(funcR(data_container.IR['Meth']), funcR(data_container.IR['Unmeth'])), + columns=['pNegECDF_pval']) + pIG = pd.DataFrame( + index=data_container.IG.index, + data=1-np.maximum(funcG(data_container.IG['Meth']), funcG(data_container.IG['Unmeth'])), + columns=['pNegECDF_pval']) + pII = pd.DataFrame( + index=data_container.II.index, + data=1-np.maximum(funcG(data_container.II['Meth']), funcR(data_container.II['Unmeth'])), + columns=['pNegECDF_pval']) + # concat and sort + pval = pd.concat([pIR, pIG, pII]) + return pval + + + """ DEPRECATED FUNCTIONS ( df.shape[0]: + df = df.transpose() # put probes as columns for faster loading. + df = df.sort_index().reindex(sorted(df.columns), axis=1) + pd.to_pickle(df, Path(data_dir,pkl_name)) + LOGGER.info(f"saved {pkl_name}") # v1.3.0 fixing mem probs: pickling each batch_data_containers object then reloading it later. # consolidating data_containers this will break with really large sample sets, so skip here. @@ -577,7 +591,7 @@ class SampleDataContainer(SigSet): def __init__(self, idat_dataset_pair, manifest=None, retain_uncorrected_probe_intensities=False, bit='float32', pval=False, poobah_decimals=3, poobah_sig=0.05, do_noob=True, - quality_mask=True, switch_probes=True, do_nonlinear_dye_bias=True, debug=False, sesame=True): + quality_mask=True, switch_probes=True, do_nonlinear_dye_bias=True, debug=False, sesame=True, pneg_ecdf=False): self.debug = debug self.do_noob = do_noob self.pval = pval @@ -591,6 +605,8 @@ def __init__(self, idat_dataset_pair, manifest=None, retain_uncorrected_probe_in self.sample = idat_dataset_pair['sample'] self.retain_uncorrected_probe_intensities=retain_uncorrected_probe_intensities self.sesame = sesame # defines offsets in functions + # pneg_ecdf defines if negative control based pvalue is calculated - will use poobah_decimals for rounding + self.pneg_ecdf = pneg_ecdf self.data_type = 'float32' if bit == None else bit # options: (float64, float32, or float16) if debug: print(f'DEBUG SDC: sesame {self.sesame} switch {self.switch_probes} noob {self.do_noob} poobah {self.pval} mask {self.quality_mask}, dye {self.do_nonlinear_dye_bias}') @@ -649,6 +665,7 @@ def process_all(self): return self.__data_frame pval_probes_df = _pval_sesame_preprocess(self) if self.pval == True else None + pneg_ecdf_probes_df = _pval_neg_ecdf(self) if self.pneg_ecdf == True else None # output: df with one column named 'poobah_pval' quality_mask_df = _apply_sesame_quality_mask(self) if self.quality_mask == True else None # output: df with one column named 'quality_mask' | if not supported array / custom array: returns nothing. @@ -701,6 +718,10 @@ def process_all(self): if self.pval == True and isinstance(pval_probes_df, pd.DataFrame): pval_probes_df = pval_probes_df.loc[ ~pval_probes_df.index.duplicated() ] self.__data_frame = self.__data_frame.join(pval_probes_df, how='inner') + + if self.pneg_ecdf == True and isinstance(pneg_ecdf_probes_df, pd.DataFrame): + pneg_ecdf_probes_df = pneg_ecdf_probes_df.loc[ ~pneg_ecdf_probes_df.index.duplicated() ] + self.__data_frame = self.__data_frame.join(pneg_ecdf_probes_df, how='inner') self.check_for_probe_loss(f"preprocess_noob sesame={self.sesame} --> {self.methylated.shape} {self.unmethylated.shape}") From ce40194fb954ff20ebf95a0df16e19e77a06e278 Mon Sep 17 00:00:00 2001 From: Mauro Chavez Date: Tue, 30 Aug 2022 16:07:55 -0700 Subject: [PATCH 4/4] Update to allow Mask column in custom manifest Added the functionality to custom manifest support to pick up a "Mask" column and use that in probe masking the same way it does other arrays --- methylprep/processing/preprocess.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/methylprep/processing/preprocess.py b/methylprep/processing/preprocess.py index f0ba878..3dec21f 100644 --- a/methylprep/processing/preprocess.py +++ b/methylprep/processing/preprocess.py @@ -269,14 +269,19 @@ def _apply_sesame_quality_mask(data_container): to use TCGA masking, only applies to HM450 """ + # If array type is custom if data_container.array_type not in ( # ArrayType.ILLUMINA_27K, ArrayType.ILLUMINA_450K, ArrayType.ILLUMINA_EPIC, ArrayType.ILLUMINA_EPIC_PLUS, ArrayType.ILLUMINA_MOUSE): - LOGGER.info(f"Quality masking is not supported for {data_container.array_type}.") - return + # If custom array doesn't have a "Mask" column + if "Mask" not in data_container.man.columns: + LOGGER.info( + f"Quality masking is not supported for {data_container.array_type}. (not implemented/\"Mask\" column not present in custom manifest)" + ) + return # load set of probes to remove from local file if data_container.array_type == ArrayType.ILLUMINA_450K: probes = qualityMask450 @@ -287,7 +292,13 @@ def _apply_sesame_quality_mask(data_container): probes = qualityMaskEPICPLUS elif data_container.array_type == ArrayType.ILLUMINA_MOUSE: probes = qualityMaskmouse - + else: + data_container.man["Mask"].fillna("NA") + # Filter "mask" column to rows with True + mask_rows = data_container.man[data_container.man["Mask"].astype(str).str.lower() == "true"] + # Get indicies (probe names of masked probes) + probes = mask_rows.index.to_list() + LOGGER.info(f"Masking {str(len(probes))} probes") # v1.6+: the 1.0s are good probes and the 0.0 are probes to be excluded. cgs = pd.DataFrame( np.zeros((len(data_container.man.index), 1)), index=data_container.man.index, columns=['quality_mask']) cgs['quality_mask'] = 1.0