Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions methylprep/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,14 @@ def cli_process(cmd_args):
help='By default, any beta-values or m-values output will contain all probes. If True, those probes that fail the p-value signal:noise detection are replaced with NaNs in dataframes in beta_values and m_value output.'
)

parser.add_argument(
'--pneg_ecdf',
required=False,
action='store_true',
default=False,
help='Calculates a pvalue for each probe against ECDF of negative control intensity and outputs as pNegECDF_values.pkl if export_poobah option specified. Scores also included in csv output.'
)

parser.add_argument(
'--export_poobah',
required=False,
Expand Down Expand Up @@ -276,8 +284,9 @@ def cli_process(cmd_args):
poobah=args.poobah,
export_poobah=args.export_poobah,
quality_mask=(not args.no_quality_mask),
sesame=(not args.minfi) # default 'sesame' method can be turned off using --minfi
)
sesame=(not args.minfi), # default 'sesame' method can be turned off using --minfi,
pneg_ecdf=args.pneg_ecdf
)


def cli_beta_bakery(cmd_args):
Expand Down
136 changes: 83 additions & 53 deletions methylprep/files/manifests.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Lib
import logging
from pathlib import Path
from pathlib import Path, PurePath
from urllib.parse import urljoin
import numpy as np
import pandas as pd
Expand All @@ -20,8 +20,9 @@

LOGGER = logging.getLogger(__name__)

MANIFEST_DIR_NAME = '.methylprep_manifest_files'
MANIFEST_DIR_PATH = f'~/{MANIFEST_DIR_NAME}'
MANIFEST_DIR_NAME = 'methylprep_manifest_files/'
# Download manifests into library so multiple users don't need their own copy
MANIFEST_DIR_PATH = PurePath(Path(__file__).parent.resolve(), MANIFEST_DIR_NAME)
MANIFEST_DIR_PATH_LAMBDA = f'/tmp/{MANIFEST_DIR_NAME}'
MANIFEST_BUCKET_NAME = 'array-manifest-files'
MANIFEST_REMOTE_PATH = f'https://s3.amazonaws.com/{MANIFEST_BUCKET_NAME}/'
Expand Down Expand Up @@ -107,25 +108,77 @@ class Manifest():
__genome_df = None
__probe_type_subsets = None # apparently not used anywhere in methylprep

def __init__(self, array_type, filepath_or_buffer=None, on_lambda=False, verbose=True):
def __init__(self, array_name: str, filepath_or_buffer=None, on_lambda=False, verbose=True):
self.array_name = array_name
array_str_to_class = dict(zip(list(ARRAY_FILENAME.keys()), list(ARRAY_TYPE_MANIFEST_FILENAMES.keys())))
if array_type in array_str_to_class:
array_type = array_str_to_class[array_type]
self.array_type = array_type
# Map array_name as provided as a string to self.array_type which will be of type enum 'ArrayType'
if array_name in list(array_str_to_class.keys()):
self.array_type = array_str_to_class[array_name]
else:
self.array_type = ArrayType.CUSTOM
self.on_lambda = on_lambda # changes filepath to /tmp for the read-only file system
self.verbose = verbose

# If you don't pass in a filepath, Methylprep uses one of its own manifests which may need downloading
if filepath_or_buffer is None:
filepath_or_buffer = self.download_default(array_type, self.on_lambda)

with get_file_object(filepath_or_buffer) as manifest_file:
self.__data_frame = self.read_probes(manifest_file)
self.__control_data_frame = self.read_control_probes(manifest_file)
self.__snp_data_frame = self.read_snp_probes(manifest_file)
if self.array_type == ArrayType.ILLUMINA_MOUSE:
self.__mouse_data_frame = self.read_mouse_probes(manifest_file)
else:
self.__mouse_data_frame = pd.DataFrame()
filepath_or_buffer = self.download_default(self.array_type, self.on_lambda)
LOGGER.info(f"Using Methylprep manifest {self.array_name} @ {filepath_or_buffer}")
# This approach to loading a manifest only makes sense if array type is configured in arrays.py
# which is only true for Methylprep manifests
with get_file_object(filepath_or_buffer) as manifest_file:
self.__data_frame = self.read_probes(manifest_file) # DF uses IlmnID as index
self.__control_data_frame = self.read_control_probes(manifest_file) # DF uses Address_ID as index
self.__snp_data_frame = self.read_snp_probes(manifest_file) # DF uses neither IlmnID nor Address as index?
if self.array_type == ArrayType.ILLUMINA_MOUSE:
self.__mouse_data_frame = self.read_mouse_probes(manifest_file)
else:
self.__mouse_data_frame = pd.DataFrame()
else:
LOGGER.info(f"Using CUSTOM manifest {self.array_name} @ {filepath_or_buffer}")
self._load_custom_manifest(filepath_or_buffer)

LOGGER.info(f"""Manifest has {str(len(self.__data_frame))} probes (where {str(len(self.__snp_data_frame))} are snps) and {str(len(self.__control_data_frame))} control probes""")

def _load_custom_manifest(self, custom_manifest_path):
optional = ['OLD_CHR', 'OLD_Strand', 'OLD_Genome_Build', 'OLD_MAPINFO']
minimum_cols = [col for col in self.columns if col not in optional]

with get_file_object(custom_manifest_path) as manifest_file:
file_df = pd.read_csv(manifest_file, comment='[', low_memory=False)
missing_cols = [col for col in minimum_cols if col not in (file_df.columns.to_list()+[file_df.index.name])]
if missing_cols != []:
raise ValueError(f"Custom manifest {custom_manifest_path} missing {','.join(missing_cols)} columns")
# Controls in manifests follow column order of 'Address_ID', 'Control_Type', 'Color', 'Extended_Type',
# After row starting with [Controls],,,,
# Non control probes follow column order of 'IlmnID', 'Name', 'AddressA_ID', 'AddressB_ID'...
# This means that after loading the file, any row where AddressB_ID is a string is a control probe
control_probes = file_df[file_df["AddressB_ID"].str.contains('[^0-9.]', regex=True, na=False)]
control_probes = control_probes[["IlmnID", "Name", "AddressA_ID", "AddressB_ID"]]
self.__control_data_frame = control_probes.rename(
columns={
"IlmnID": "Address_ID",
"Name": "Control_Type",
"AddressA_ID": "Color",
"AddressB_ID": "Extended_Type"
}
)
# Snps have IlmnID with leading rs
self.__snp_data_frame = file_df[file_df["IlmnID"].str.match('rs', na=False)].astype(
{'AddressA_ID':'float64', 'AddressB_ID':'float64'})
# At this point self.__data_frame should be any probe that is not a control
self.__data_frame = file_df.drop(self.__control_data_frame.index, axis=0).astype(
{'AddressA_ID':'float64', 'AddressB_ID':'float64'})
self.__data_frame = self.__data_frame.set_index('IlmnID')
# Methylprep wants index of control df to be Address_ID
self.__control_data_frame = self.__control_data_frame.astype({'Address_ID': 'int64'})
self.__control_data_frame = self.__control_data_frame.set_index('Address_ID')
# Do transformation on Infinim_Design_Type column
vectorized_get_type = np.vectorize(self.get_probe_type)
self.__data_frame['probe_type'] = vectorized_get_type(
self.__data_frame.index.values,
self.__data_frame['Infinium_Design_Type'].values,
)
# IDK it seems like methylsuite wants this variable avaialable?
self.__mouse_data_frame = pd.DataFrame()

@property
def columns(self):
Expand All @@ -150,6 +203,15 @@ def snp_data_frame(self):
def mouse_data_frame(self):
return self.__mouse_data_frame

@staticmethod
def get_probe_type(name, infinium_type):
"""returns one of (I, II, SnpI, SnpII, Control)

.from_manifest_values() returns probe type using either the Infinium_Design_Type (I or II) or the name
(starts with 'rs' == SnpI) and 'Control' is none of the above."""
probe_type = ProbeType.from_manifest_values(name, infinium_type)
return probe_type.value

@staticmethod
def download_default(array_type, on_lambda=False):
"""Downloads the appropriate manifest file if one does not already exist.
Expand Down Expand Up @@ -224,16 +286,7 @@ def read_probes(self, manifest_file):
# AddressB_ID in manifest includes NaNs and INTs and becomes floats, which breaks. forcing back here.
#data_frame['AddressB_ID'] = data_frame['AddressB_ID'].astype('Int64') # converts floats to ints; leaves NaNs inplace
# TURNS out, int or float both work for manifests. NOT the source of the error with mouse.

def get_probe_type(name, infinium_type):
"""returns one of (I, II, SnpI, SnpII, Control)

.from_manifest_values() returns probe type using either the Infinium_Design_Type (I or II) or the name
(starts with 'rs' == SnpI) and 'Control' is none of the above."""
probe_type = ProbeType.from_manifest_values(name, infinium_type)
return probe_type.value

vectorized_get_type = np.vectorize(get_probe_type)
vectorized_get_type = np.vectorize(self.get_probe_type)
data_frame['probe_type'] = vectorized_get_type(
data_frame.index.values,
data_frame['Infinium_Design_Type'].values,
Expand All @@ -254,7 +307,7 @@ def read_control_probes(self, manifest_file):
index_col=0, # illumina_id, not IlmnID here
names=CONTROL_COLUMNS, # this gives these columns new names, because they have none. loading stuff at end of CSV after probes end.
nrows=self.array_type.num_controls,
skiprows=self.array_type.num_probes +1, #without the +1, it includes the last cpg probe in controls and breaks stuff.
skiprows=self.array_type.num_probes+1, #without the +1, it includes the last cpg probe in controls and breaks stuff.
usecols=range(len(CONTROL_COLUMNS)),
)

Expand Down Expand Up @@ -283,29 +336,6 @@ def read_mouse_probes(self, manifest_file):
mouse_df = mouse_df[(mouse_df['design'] == 'Multi') | (mouse_df['design'] == 'Random')]
return mouse_df

""" NEVER CALLED ANYWHERE - belongs in methylize
def map_to_genome(self, data_frame):
genome_df = self.get_genome_data()
merged_df = inner_join_data(data_frame, genome_df)
return merged_df

def get_genome_data(self, build=None):
if self.__genome_df is not None:
return self.__genome_df

LOGGER.info('Building genome data frame')
# new in version 1.5.6: support for both new and old genomes
GENOME_COLUMNS = (
'Genome_Build',
'CHR',
'MAPINFO',
'Strand',
) # PLUS four more optional columns with OLD_ prefix (for prev genome build)
genome_columns = GENOME_COLUMNS + ['OLD_'+col for col in GENOME_COLUMNS]
self.__genome_df = self.data_frame[genome_columns]
return self.__genome_df
"""

def get_data_types(self):
data_types = {
key: str for key in self.columns
Expand All @@ -330,4 +360,4 @@ def get_probe_details(self, probe_type, channel=None):
return data_frame[probe_type_mask]

channel_mask = data_frame['Color_Channel'].values == channel.value
return data_frame[probe_type_mask & channel_mask]
return data_frame[probe_type_mask & channel_mask]
3 changes: 2 additions & 1 deletion methylprep/models/arrays.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ def from_probe_count(cls, probe_count):
LOGGER.warning(f'Probe count ({probe_count}) falls outside of normal range. Setting to newest array type: EPIC')
return cls.ILLUMINA_EPIC

raise ValueError(f'Unknown array type: ({probe_count} probes detected)')
LOGGER.warning(f'Unknown array type: ({probe_count} probes detected) - using type CUSTOM')
return cls.CUSTOM

@property
def num_probes(self):
Expand Down
27 changes: 27 additions & 0 deletions methylprep/processing/p_value_probe_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,33 @@ def _pval_sesame_preprocess(data_container, combine_neg=True):
pval = pd.concat([pIR,pIG,pII])
return pval

def _pval_neg_ecdf(data_container):
dfR = data_container.ctrl_red
dfR = dfR[dfR['Control_Type']=='NEGATIVE']
dfR = dfR[['Extended_Type','mean_value']]
funcR = ECDF(dfR['mean_value'].values)
dfG = data_container.ctrl_green
dfG = dfG[dfG['Control_Type']=='NEGATIVE']
dfG = dfG[['Extended_Type','mean_value']]
funcG = ECDF(dfG['mean_value'].values)
pIR = pd.DataFrame(
index=data_container.IR.index,
data=1-np.maximum(funcR(data_container.IR['Meth']), funcR(data_container.IR['Unmeth'])),
columns=['pNegECDF_pval'])
pIG = pd.DataFrame(
index=data_container.IG.index,
data=1-np.maximum(funcG(data_container.IG['Meth']), funcG(data_container.IG['Unmeth'])),
columns=['pNegECDF_pval'])
pII = pd.DataFrame(
index=data_container.II.index,
data=1-np.maximum(funcG(data_container.II['Meth']), funcR(data_container.II['Unmeth'])),
columns=['pNegECDF_pval'])
# concat and sort
pval = pd.concat([pIR, pIG, pII])
return pval



""" DEPRECATED FUNCTIONS (<v1.5.0)
def detect_probes(data_containers, method='sesame', save=False, silent=True):
'''
Expand Down
Loading