Skip to content

Commit b35fe19

Browse files
authored
Merge pull request #68 from MannLabs/fix_intensity_issues
Fix protein intensity calculation inconsistency due to ambiguous ion sorting
2 parents 6f33bdb + b62b739 commit b35fe19

File tree

3 files changed

+54
-45
lines changed

3 files changed

+54
-45
lines changed

directlfq/lfq_manager.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
LOGGER = logging.getLogger(__name__)
2121

2222

23-
def run_lfq(input_file, columns_to_add = [], selected_proteins_file :str = None, mq_protein_groups_txt = None, min_nonan = 1, input_type_to_use = None, maximum_number_of_quadratic_ions_to_use_per_protein = 10,
23+
def run_lfq(input_file, columns_to_add = [], selected_proteins_file :str = None, mq_protein_groups_txt = None, min_nonan = 1, input_type_to_use = None, maximum_number_of_quadratic_ions_to_use_per_protein = 10,
2424
number_of_quadratic_samples = 50, num_cores = None, filename_suffix = "", deactivate_normalization = False, filter_dict = None, log_processed_proteins = True, protein_id = 'protein', quant_id = 'ion'
2525
,compile_normalized_ion_table = True):
2626
"""Run the directLFQ pipeline on a given input file. The input file is expected to contain ion intensities. The output is a table containing protein intensities.
@@ -47,30 +47,30 @@ def run_lfq(input_file, columns_to_add = [], selected_proteins_file :str = None
4747
input_file = lfqutils.add_mq_protein_group_ids_if_applicable_and_obtain_annotated_file(input_file, input_type_to_use,mq_protein_groups_txt, columns_to_add)
4848
input_df = lfqutils.import_data(input_file=input_file, input_type_to_use=input_type_to_use, filter_dict=filter_dict)
4949

50-
input_df = lfqutils.sort_input_df_by_protein_id(input_df)
50+
input_df = lfqutils.sort_input_df_by_protein_and_quant_id(input_df)
5151
input_df = lfqutils.remove_potential_quant_id_duplicates(input_df)
5252
input_df = lfqutils.index_and_log_transform_input_df(input_df)
5353
input_df = lfqutils.remove_allnan_rows_input_df(input_df)
54-
54+
5555
if not deactivate_normalization:
5656
LOGGER.info("Performing sample normalization.")
5757
input_df = lfqnorm.NormalizationManagerSamplesOnSelectedProteins(input_df, num_samples_quadratic=number_of_quadratic_samples, selected_proteins_file=selected_proteins_file).complete_dataframe
58-
58+
5959
LOGGER.info("Estimating lfq intensities.")
6060
protein_df, ion_df = lfqprot_estimation.estimate_protein_intensities(input_df,min_nonan=min_nonan,num_samples_quadratic=maximum_number_of_quadratic_ions_to_use_per_protein, num_cores = num_cores)
6161
try:
6262
protein_df = lfqutils.add_columns_to_lfq_results_table(protein_df, input_file, columns_to_add)
6363
except:
6464
LOGGER.info("Could not add additional columns to protein table, printing without additional columns.")
65-
65+
6666
LOGGER.info("Writing results files.")
6767
outfile_basename = get_outfile_basename(input_file, input_type_to_use, selected_proteins_file, deactivate_normalization,filename_suffix)
6868
save_run_config(outfile_basename, locals())
6969
save_protein_df(protein_df,outfile_basename)
70-
70+
7171
if config.COMPILE_NORMALIZED_ION_TABLE:
7272
save_ion_df(ion_df,outfile_basename)
73-
73+
7474
LOGGER.info("Analysis finished!")
7575

7676
def load_filter_dict_if_given_as_yaml(filter_dict):

directlfq/protein_intensity_estimation.py

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,10 @@ def estimate_protein_intensities(normed_df, min_nonan, num_samples_quadratic, nu
3737
Returns:
3838
tuple[protein_intensity_df, ion_intensity_df]: protein intensity dataframe and an ion intensity dataframe. The ion intensity dataframe is only compiled if the config.COMPILE_NORMALIZED_ION_TABLE is set to True.
3939
"""
40-
40+
4141
allprots = list(normed_df.index.get_level_values(0).unique())
4242
LOGGER.info(f"{len(allprots)} lfq-groups total")
43-
43+
4444
list_of_tuple_w_protein_profiles_and_shifted_peptides = get_list_of_tuple_w_protein_profiles_and_shifted_peptides(normed_df, num_samples_quadratic, min_nonan, num_cores)
4545
protein_df = get_protein_dataframe_from_list_of_protein_profiles(list_of_tuple_w_protein_profiles_and_shifted_peptides=list_of_tuple_w_protein_profiles_and_shifted_peptides, normed_df= normed_df)
4646
if config.COMPILE_NORMALIZED_ION_TABLE:
@@ -72,16 +72,16 @@ def get_normed_dfs(normed_df):
7272
normed_array = normed_df.to_numpy()
7373
indices_of_proteinname_switch = find_nameswitch_indices(protein_names)
7474
results_list = [get_subdf(normed_array, indices_of_proteinname_switch, idx, protein_names, ion_names) for idx in range(len(indices_of_proteinname_switch)-1)]
75-
75+
7676
return results_list
77-
77+
7878

7979
def find_nameswitch_indices(arr):
8080
change_indices = np.where(arr[:-1] != arr[1:])[0] + 1
8181

8282
# Add the index 0 for the start of the first element
8383
start_indices = np.insert(change_indices, 0, 0)
84-
84+
8585
#Append the index of the last element
8686
start_indices = np.append(start_indices, len(arr))
8787

@@ -101,7 +101,7 @@ def get_subdf(normed_array, indices_of_proteinname_switch, idx, protein_names, i
101101
def get_list_with_sequential_processing(input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan):
102102
list_of_tuple_w_protein_profiles_and_shifted_peptides = list(map(lambda x : calculate_peptide_and_protein_intensities(*x), input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan))
103103
return list_of_tuple_w_protein_profiles_and_shifted_peptides
104-
104+
105105
def get_list_with_multiprocessing(input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan, num_cores):
106106
pool = get_configured_multiprocessing_pool(num_cores)
107107
list_of_tuple_w_protein_profiles_and_shifted_peptides = pool.starmap(calculate_peptide_and_protein_intensities, input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan)
@@ -125,18 +125,18 @@ def calculate_peptide_and_protein_intensities_from_list_of_peptide_intensity_dfs
125125
def calculate_peptide_and_protein_intensities(idx, peptide_intensity_df, num_samples_quadratic, min_nonan):
126126
if len(peptide_intensity_df.index) > 1:
127127
peptide_intensity_df = ProtvalCutter(peptide_intensity_df, maximum_df_length=100).get_dataframe()
128-
128+
129129
if(idx%100 ==0) and config.LOG_PROCESSED_PROTEINS:
130130
LOGGER.info(f"lfq-object {idx}")
131131
summed_pepint = np.nansum(2**peptide_intensity_df)
132-
132+
133133
if(peptide_intensity_df.shape[1]<2):
134134
shifted_peptides = peptide_intensity_df
135135
else:
136136
shifted_peptides = lfqnorm.NormalizationManagerProtein(peptide_intensity_df, num_samples_quadratic = num_samples_quadratic).complete_dataframe
137-
137+
138138
protein_profile = get_protein_profile_from_shifted_peptides(shifted_peptides, summed_pepint, min_nonan)
139-
139+
140140
return protein_profile, shifted_peptides
141141

142142

@@ -181,9 +181,18 @@ def _check_if_df_too_long_and_sort_index_if_so(self):
181181
self._determine_nansorted_df_index()
182182

183183
def _determine_nansorted_df_index(self):
184+
"""Sorts the dataframe index primarily by number of NaN values (ascending) and secondarily by summed intensity (descending). Sorting by intensties in case multiple ions have identical missing value counts. We expect initial sorting by ion name (which is done in the run_lfq module) to be deterministic.
185+
186+
The sorting prioritizes:
187+
1. Rows with fewer NaN values come first
188+
2. For rows with equal number of NaNs, higher intensity sums come first
189+
"""
184190
idxs = self._protvals_df.index
185-
self._sorted_idx = sorted(idxs, key= lambda idx : self._get_num_nas_in_row(self._protvals_df.loc[idx].to_numpy()))
186-
191+
self._sorted_idx = sorted(idxs, key=lambda idx: (
192+
sum(np.isnan(self._protvals_df.loc[idx].to_numpy())), # First by number of NaNs (ascending)
193+
-np.nansum(self._protvals_df.loc[idx].to_numpy()) # Then by sum of intensities (descending)
194+
))
195+
187196
@staticmethod
188197
@njit
189198
def _get_num_nas_in_row(row):
@@ -225,8 +234,8 @@ def get_ion_intensity_dataframe_from_list_of_shifted_peptides(list_of_tuple_w_pr
225234
ion_df["protein"] = protein_names
226235
ion_df = ion_df.set_index(["protein", "ion"])
227236
return ion_df
228-
229-
237+
238+
230239

231240
def add_protein_names_to_ion_ints(ion_ints, allprots):
232241
ion_ints = [add_protein_name_to_ion_df(ion_ints[idx], allprots[idx]) for idx in range(len(ion_ints))]
@@ -244,13 +253,13 @@ def get_protein_dataframe_from_list_of_protein_profiles(list_of_tuple_w_protein_
244253

245254
list_of_protein_profiles = [x[0] for x in list_of_tuple_w_protein_profiles_and_shifted_peptides]
246255
allprots = [x[1].index.get_level_values(0)[0] for x in list_of_tuple_w_protein_profiles_and_shifted_peptides]
247-
256+
248257
for idx in range(len(allprots)):
249258
if list_of_protein_profiles[idx] is None:
250259
continue
251260
index_list.append(allprots[idx])
252261
profile_list.append(list_of_protein_profiles[idx])
253-
262+
254263
index_for_protein_df = pd.Index(data=index_list, name=config.PROTEIN_ID)
255264
protein_df = 2**pd.DataFrame(profile_list, index = index_for_protein_df, columns = normed_df.columns)
256265
protein_df = protein_df.replace(np.nan, 0)

directlfq/utils.py

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ def _get_input_type(mq_file ,input_type_to_use):
159159
return input_type_to_use
160160
else:
161161
return get_input_type_and_config_dict(mq_file)[0]
162-
162+
163163

164164
def load_input_file_and_de_duplicate_if_evidence(input_file, input_type, columns_to_add):
165165
input_df = pd.read_csv(input_file, sep = "\t")
@@ -175,7 +175,7 @@ def load_input_file_and_de_duplicate_if_evidence(input_file, input_type, columns
175175

176176
return input_df
177177

178-
def create_id_to_protein_df(mq_protein_group_file, id_column):
178+
def create_id_to_protein_df(mq_protein_group_file, id_column):
179179
id_mapping_df = pd.read_csv(mq_protein_group_file, sep = "\t", usecols=["Protein IDs", id_column])
180180
#apply lambda function to id column to split it into a list of ids
181181
id_mapping_df[id_column] = id_mapping_df[id_column].apply(lambda x: x.split(";"))
@@ -225,15 +225,15 @@ def add_columns_to_lfq_results_table(lfq_results_df, input_file, columns_to_add)
225225

226226
lfq_results_df = lfq_results_df[[x is not None for x in lfq_results_df[config.PROTEIN_ID]]]
227227
if (len(columns_to_add) == 0) and (len(standard_columns_for_input_type)==0) :
228-
return lfq_results_df
228+
return lfq_results_df
229229
input_df = pd.read_csv(input_file, sep="\t", usecols=all_columns).drop_duplicates(subset=protein_column_input_table)
230230

231231
length_before = len(lfq_results_df.index)
232232
lfq_results_df_appended = pd.merge(lfq_results_df, input_df, left_on=config.PROTEIN_ID, right_on=protein_column_input_table, how='left')
233233
length_after = len(lfq_results_df_appended.index)
234234

235235
#lfq_results_df_appended = lfq_results_df_appended.set_index(config.PROTEIN_ID)
236-
236+
237237

238238
assert length_before == length_after
239239
return lfq_results_df_appended
@@ -247,7 +247,7 @@ def get_protein_column_input_table(config_dict):
247247
return config_dict["protein_cols"][0]
248248

249249
def get_standard_columns_for_input_type(input_type):
250-
250+
251251
if 'maxquant' in input_type:
252252
return ["Gene names"]
253253
elif 'diann' in input_type:
@@ -303,11 +303,11 @@ def remove_potential_quant_id_duplicates(data_df : pd.DataFrame):
303303
return data_df
304304

305305

306-
def sort_input_df_by_protein_id(data_df):
307-
return data_df.sort_values(by = config.PROTEIN_ID,ignore_index=True)
306+
def sort_input_df_by_protein_and_quant_id(data_df):
307+
return data_df.sort_values(by=[config.PROTEIN_ID, config.QUANT_ID], ignore_index=True)
308+
308309

309310

310-
311311

312312
# %% ../nbdev_nbs/04_utils.ipynb 29
313313
import yaml
@@ -427,7 +427,7 @@ def merge_protein_and_ion_cols(input_df, config_dict):
427427
import copy
428428
def merge_protein_cols_and_ion_dict(input_df, config_dict):
429429
"""[summary]
430-
430+
431431
Args:
432432
input_df ([pandas dataframe]): longtable containing peptide intensity data
433433
config_dict ([dict[String[]]]): nested dict containing the parse information, derived from yaml file
@@ -581,7 +581,7 @@ def reformat_and_write_longtable_according_to_config(input_file, outfile_name, c
581581
os.remove(tmpfile_large)
582582
if os.path.exists(outfile_name):
583583
os.remove(outfile_name)
584-
584+
585585
relevant_cols = get_relevant_columns_config_dict(config_dict_for_type)
586586
input_df_it = utils_fileread.read_file_with_pandas(input_file=input_file, sep=sep, decimal=decimal, usecols=relevant_cols, chunksize=chunksize)
587587
input_df_list = []
@@ -593,14 +593,14 @@ def reformat_and_write_longtable_according_to_config(input_file, outfile_name, c
593593
else:
594594
input_df_list.append(input_df_subset)
595595
header = False
596-
596+
597597
if file_is_large and HAS_DASK:
598598
process_with_dask(tmpfile_columnfilt=tmpfile_large , outfile_name = outfile_name, config_dict_for_type=config_dict_for_type)
599599
else:
600600
input_df = pd.concat(input_df_list)
601601
input_reshaped = reshape_input_df(input_df, config_dict_for_type)
602602
input_reshaped.to_csv(outfile_name, sep = "\t", index = None)
603-
603+
604604

605605
def adapt_subtable(input_df_subset, config_dict):
606606
input_df_subset = filter_input(config_dict.get("filters", {}), input_df_subset)
@@ -613,7 +613,7 @@ def adapt_subtable(input_df_subset, config_dict):
613613
import pandas as pd
614614
import glob
615615
import os
616-
import shutil
616+
import shutil
617617

618618
def process_with_dask(*, tmpfile_columnfilt, outfile_name, config_dict_for_type):
619619
df = dd.read_csv(tmpfile_columnfilt, sep = "\t")
@@ -718,7 +718,7 @@ def merge_sample_id_and_channels(input_df, channels, config_dict_for_type):
718718
sample_ids = list(input_df[sample_id])
719719
input_df[sample_id] = [merge_channel_and_sample_string(sample_ids[idx], channels[idx]) for idx in range(len(sample_ids))]
720720
return input_df
721-
721+
722722
def merge_channel_and_sample_string(sample, channel):
723723
return f"{sample}_{channel}"
724724

@@ -738,7 +738,7 @@ def reformat_and_write_wideformat_table(peptides_tsv, outfile_name, config_dict)
738738
input_df = input_df.rename(columns = lambda x : x.replace(quant_pre_or_suffix, ""))
739739

740740
#input_df = input_df.reset_index()
741-
741+
742742
input_df.to_csv(outfile_name, sep = '\t', index = None)
743743

744744

@@ -776,7 +776,7 @@ def import_data(input_file, input_type_to_use = None, samples_subset = None, fil
776776
file_to_read = input_file
777777
else:
778778
file_to_read = reformat_and_save_input_file(input_file=input_file, input_type_to_use=input_type_to_use, filter_dict=filter_dict)
779-
779+
780780
input_reshaped = pd.read_csv(file_to_read, sep = "\t", encoding = 'latin1', usecols=samples_subset)
781781
input_reshaped = adapt_table_for_alphabaseformat_backward_compatibility(file_is_already_formatted, input_reshaped)
782782
input_reshaped = input_reshaped.drop_duplicates(subset=config.QUANT_ID)
@@ -791,7 +791,7 @@ def add_ion_protein_headers_if_applicable(samples_subset):
791791

792792

793793
def reformat_and_save_input_file(input_file, input_type_to_use = None, filter_dict = None):
794-
794+
795795
input_type, config_dict_for_type, sep = get_input_type_and_config_dict(input_file, input_type_to_use)
796796

797797
if filter_dict is not None:
@@ -911,28 +911,28 @@ def __init__(self, input_file):
911911
def reformat_and_load_acquisition_data_frame(self):
912912

913913
input_df_it = self._iterator_function()
914-
914+
915915
input_df_list = []
916916
for input_df_subset in input_df_it:
917917
input_df_subset = self._reformatting_function(input_df_subset)
918918
input_df_list.append(input_df_subset)
919919
input_df = pd.concat(input_df_list)
920-
920+
921921
return input_df
922922

923923
def reformat_and_save_acquisition_data_frame(self, output_file):
924-
924+
925925
input_df_it = self._iterator_function()
926926
write_header = True
927-
927+
928928
for input_df_subset in input_df_it:
929929
input_df_subset = self._reformatting_function(input_df_subset)
930930
self.__write_reformatted_df_to_file__(input_df_subset, output_file, write_header)
931931
write_header = False
932932

933933
def __initialize_df_iterator__(self):
934934
return pd.read_csv(self._input_file, sep = "\t", encoding ='latin1', chunksize=1000000)
935-
935+
936936
@staticmethod
937937
def __write_reformatted_df_to_file__(reformatted_df, filepath ,write_header):
938938
reformatted_df.to_csv(filepath, header=write_header, mode='a', sep = "\t", index = None)

0 commit comments

Comments
 (0)