Skip to content

Commit b35fe19

Browse files
authored
Merge pull request #68 from MannLabs/fix_intensity_issues
Fix protein intensity calculation inconsistency due to ambiguous ion sorting
2 parents 6f33bdb + b62b739 commit b35fe19

File tree

3 files changed

+54
-45
lines changed

3 files changed

+54
-45
lines changed

directlfq/lfq_manager.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
LOGGER = logging.getLogger(__name__)
2121

2222

23-
def run_lfq(input_file, columns_to_add = [], selected_proteins_file :str = None, mq_protein_groups_txt = None, min_nonan = 1, input_type_to_use = None, maximum_number_of_quadratic_ions_to_use_per_protein = 10,
23+
def run_lfq(input_file, columns_to_add = [], selected_proteins_file :str = None, mq_protein_groups_txt = None, min_nonan = 1, input_type_to_use = None, maximum_number_of_quadratic_ions_to_use_per_protein = 10,
2424
number_of_quadratic_samples = 50, num_cores = None, filename_suffix = "", deactivate_normalization = False, filter_dict = None, log_processed_proteins = True, protein_id = 'protein', quant_id = 'ion'
2525
,compile_normalized_ion_table = True):
2626
"""Run the directLFQ pipeline on a given input file. The input file is expected to contain ion intensities. The output is a table containing protein intensities.
@@ -47,30 +47,30 @@ def run_lfq(input_file, columns_to_add = [], selected_proteins_file :str = None
4747
input_file = lfqutils.add_mq_protein_group_ids_if_applicable_and_obtain_annotated_file(input_file, input_type_to_use,mq_protein_groups_txt, columns_to_add)
4848
input_df = lfqutils.import_data(input_file=input_file, input_type_to_use=input_type_to_use, filter_dict=filter_dict)
4949

50-
input_df = lfqutils.sort_input_df_by_protein_id(input_df)
50+
input_df = lfqutils.sort_input_df_by_protein_and_quant_id(input_df)
5151
input_df = lfqutils.remove_potential_quant_id_duplicates(input_df)
5252
input_df = lfqutils.index_and_log_transform_input_df(input_df)
5353
input_df = lfqutils.remove_allnan_rows_input_df(input_df)
54-
54+
5555
if not deactivate_normalization:
5656
LOGGER.info("Performing sample normalization.")
5757
input_df = lfqnorm.NormalizationManagerSamplesOnSelectedProteins(input_df, num_samples_quadratic=number_of_quadratic_samples, selected_proteins_file=selected_proteins_file).complete_dataframe
58-
58+
5959
LOGGER.info("Estimating lfq intensities.")
6060
protein_df, ion_df = lfqprot_estimation.estimate_protein_intensities(input_df,min_nonan=min_nonan,num_samples_quadratic=maximum_number_of_quadratic_ions_to_use_per_protein, num_cores = num_cores)
6161
try:
6262
protein_df = lfqutils.add_columns_to_lfq_results_table(protein_df, input_file, columns_to_add)
6363
except:
6464
LOGGER.info("Could not add additional columns to protein table, printing without additional columns.")
65-
65+
6666
LOGGER.info("Writing results files.")
6767
outfile_basename = get_outfile_basename(input_file, input_type_to_use, selected_proteins_file, deactivate_normalization,filename_suffix)
6868
save_run_config(outfile_basename, locals())
6969
save_protein_df(protein_df,outfile_basename)
70-
70+
7171
if config.COMPILE_NORMALIZED_ION_TABLE:
7272
save_ion_df(ion_df,outfile_basename)
73-
73+
7474
LOGGER.info("Analysis finished!")
7575

7676
def load_filter_dict_if_given_as_yaml(filter_dict):

directlfq/protein_intensity_estimation.py

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,10 @@ def estimate_protein_intensities(normed_df, min_nonan, num_samples_quadratic, nu
3737
Returns:
3838
tuple[protein_intensity_df, ion_intensity_df]: protein intensity dataframe and an ion intensity dataframe. The ion intensity dataframe is only compiled if the config.COMPILE_NORMALIZED_ION_TABLE is set to True.
3939
"""
40-
40+
4141
allprots = list(normed_df.index.get_level_values(0).unique())
4242
LOGGER.info(f"{len(allprots)} lfq-groups total")
43-
43+
4444
list_of_tuple_w_protein_profiles_and_shifted_peptides = get_list_of_tuple_w_protein_profiles_and_shifted_peptides(normed_df, num_samples_quadratic, min_nonan, num_cores)
4545
protein_df = get_protein_dataframe_from_list_of_protein_profiles(list_of_tuple_w_protein_profiles_and_shifted_peptides=list_of_tuple_w_protein_profiles_and_shifted_peptides, normed_df= normed_df)
4646
if config.COMPILE_NORMALIZED_ION_TABLE:
@@ -72,16 +72,16 @@ def get_normed_dfs(normed_df):
7272
normed_array = normed_df.to_numpy()
7373
indices_of_proteinname_switch = find_nameswitch_indices(protein_names)
7474
results_list = [get_subdf(normed_array, indices_of_proteinname_switch, idx, protein_names, ion_names) for idx in range(len(indices_of_proteinname_switch)-1)]
75-
75+
7676
return results_list
77-
77+
7878

7979
def find_nameswitch_indices(arr):
8080
change_indices = np.where(arr[:-1] != arr[1:])[0] + 1
8181

8282
# Add the index 0 for the start of the first element
8383
start_indices = np.insert(change_indices, 0, 0)
84-
84+
8585
#Append the index of the last element
8686
start_indices = np.append(start_indices, len(arr))
8787

@@ -101,7 +101,7 @@ def get_subdf(normed_array, indices_of_proteinname_switch, idx, protein_names, i
101101
def get_list_with_sequential_processing(input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan):
102102
list_of_tuple_w_protein_profiles_and_shifted_peptides = list(map(lambda x : calculate_peptide_and_protein_intensities(*x), input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan))
103103
return list_of_tuple_w_protein_profiles_and_shifted_peptides
104-
104+
105105
def get_list_with_multiprocessing(input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan, num_cores):
106106
pool = get_configured_multiprocessing_pool(num_cores)
107107
list_of_tuple_w_protein_profiles_and_shifted_peptides = pool.starmap(calculate_peptide_and_protein_intensities, input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan)
@@ -125,18 +125,18 @@ def calculate_peptide_and_protein_intensities_from_list_of_peptide_intensity_dfs
125125
def calculate_peptide_and_protein_intensities(idx, peptide_intensity_df, num_samples_quadratic, min_nonan):
126126
if len(peptide_intensity_df.index) > 1:
127127
peptide_intensity_df = ProtvalCutter(peptide_intensity_df, maximum_df_length=100).get_dataframe()
128-
128+
129129
if(idx%100 ==0) and config.LOG_PROCESSED_PROTEINS:
130130
LOGGER.info(f"lfq-object {idx}")
131131
summed_pepint = np.nansum(2**peptide_intensity_df)
132-
132+
133133
if(peptide_intensity_df.shape[1]<2):
134134
shifted_peptides = peptide_intensity_df
135135
else:
136136
shifted_peptides = lfqnorm.NormalizationManagerProtein(peptide_intensity_df, num_samples_quadratic = num_samples_quadratic).complete_dataframe
137-
137+
138138
protein_profile = get_protein_profile_from_shifted_peptides(shifted_peptides, summed_pepint, min_nonan)
139-
139+
140140
return protein_profile, shifted_peptides
141141

142142

@@ -181,9 +181,18 @@ def _check_if_df_too_long_and_sort_index_if_so(self):
181181
self._determine_nansorted_df_index()
182182

183183
def _determine_nansorted_df_index(self):
184+
"""Sorts the dataframe index primarily by number of NaN values (ascending) and secondarily by summed intensity (descending). Sorting by intensties in case multiple ions have identical missing value counts. We expect initial sorting by ion name (which is done in the run_lfq module) to be deterministic.
185+
186+
The sorting prioritizes:
187+
1. Rows with fewer NaN values come first
188+
2. For rows with equal number of NaNs, higher intensity sums come first
189+
"""
184190
idxs = self._protvals_df.index
185-
self._sorted_idx = sorted(idxs, key= lambda idx : self._get_num_nas_in_row(self._protvals_df.loc[idx].to_numpy()))
186-
191+
self._sorted_idx = sorted(idxs, key=lambda idx: (
192+
sum(np.isnan(self._protvals_df.loc[idx].to_numpy())), # First by number of NaNs (ascending)
193+
-np.nansum(self._protvals_df.loc[idx].to_numpy()) # Then by sum of intensities (descending)
194+
))
195+
187196
@staticmethod
188197
@njit
189198
def _get_num_nas_in_row(row):
@@ -225,8 +234,8 @@ def get_ion_intensity_dataframe_from_list_of_shifted_peptides(list_of_tuple_w_pr
225234
ion_df["protein"] = protein_names
226235
ion_df = ion_df.set_index(["protein", "ion"])
227236
return ion_df
228-
229-
237+
238+
230239

231240
def add_protein_names_to_ion_ints(ion_ints, allprots):
232241
ion_ints = [add_protein_name_to_ion_df(ion_ints[idx], allprots[idx]) for idx in range(len(ion_ints))]
@@ -244,13 +253,13 @@ def get_protein_dataframe_from_list_of_protein_profiles(list_of_tuple_w_protein_
244253

245254
list_of_protein_profiles = [x[0] for x in list_of_tuple_w_protein_profiles_and_shifted_peptides]
246255
allprots = [x[1].index.get_level_values(0)[0] for x in list_of_tuple_w_protein_profiles_and_shifted_peptides]
247-
256+
248257
for idx in range(len(allprots)):
249258
if list_of_protein_profiles[idx] is None:
250259
continue
251260
index_list.append(allprots[idx])
252261
profile_list.append(list_of_protein_profiles[idx])
253-
262+
254263
index_for_protein_df = pd.Index(data=index_list, name=config.PROTEIN_ID)
255264
protein_df = 2**pd.DataFrame(profile_list, index = index_for_protein_df, columns = normed_df.columns)
256265
protein_df = protein_df.replace(np.nan, 0)

directlfq/utils.py

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ def _get_input_type(mq_file ,input_type_to_use):
159159
return input_type_to_use
160160
else:
161161
return get_input_type_and_config_dict(mq_file)[0]
162-
162+
163163

164164
def load_input_file_and_de_duplicate_if_evidence(input_file, input_type, columns_to_add):
165165
input_df = pd.read_csv(input_file, sep = "\t")
@@ -175,7 +175,7 @@ def load_input_file_and_de_duplicate_if_evidence(input_file, input_type, columns
175175

176176
return input_df
177177

178-
def create_id_to_protein_df(mq_protein_group_file, id_column):
178+
def create_id_to_protein_df(mq_protein_group_file, id_column):
179179
id_mapping_df = pd.read_csv(mq_protein_group_file, sep = "\t", usecols=["Protein IDs", id_column])
180180
#apply lambda function to id column to split it into a list of ids
181181
id_mapping_df[id_column] = id_mapping_df[id_column].apply(lambda x: x.split(";"))
@@ -225,15 +225,15 @@ def add_columns_to_lfq_results_table(lfq_results_df, input_file, columns_to_add)
225225

226226
lfq_results_df = lfq_results_df[[x is not None for x in lfq_results_df[config.PROTEIN_ID]]]
227227
if (len(columns_to_add) == 0) and (len(standard_columns_for_input_type)==0) :
228-
return lfq_results_df
228+
return lfq_results_df
229229
input_df = pd.read_csv(input_file, sep="\t", usecols=all_columns).drop_duplicates(subset=protein_column_input_table)
230230

231231
length_before = len(lfq_results_df.index)
232232
lfq_results_df_appended = pd.merge(lfq_results_df, input_df, left_on=config.PROTEIN_ID, right_on=protein_column_input_table, how='left')
233233
length_after = len(lfq_results_df_appended.index)
234234

235235
#lfq_results_df_appended = lfq_results_df_appended.set_index(config.PROTEIN_ID)
236-
236+
237237

238238
assert length_before == length_after
239239
return lfq_results_df_appended
@@ -247,7 +247,7 @@ def get_protein_column_input_table(config_dict):
247247
return config_dict["protein_cols"][0]
248248

249249
def get_standard_columns_for_input_type(input_type):
250-
250+
251251
if 'maxquant' in input_type:
252252
return ["Gene names"]
253253
elif 'diann' in input_type:
@@ -303,11 +303,11 @@ def remove_potential_quant_id_duplicates(data_df : pd.DataFrame):
303303
return data_df
304304

305305

306-
def sort_input_df_by_protein_id(data_df):
307-
return data_df.sort_values(by = config.PROTEIN_ID,ignore_index=True)
306+
def sort_input_df_by_protein_and_quant_id(data_df):
307+
return data_df.sort_values(by=[config.PROTEIN_ID, config.QUANT_ID], ignore_index=True)
308+
308309

309310

310-
311311

312312
# %% ../nbdev_nbs/04_utils.ipynb 29
313313
import yaml
@@ -427,7 +427,7 @@ def merge_protein_and_ion_cols(input_df, config_dict):
427427
import copy
428428
def merge_protein_cols_and_ion_dict(input_df, config_dict):
429429
"""[summary]
430-
430+
431431
Args:
432432
input_df ([pandas dataframe]): longtable containing peptide intensity data
433433
config_dict ([dict[String[]]]): nested dict containing the parse information, derived from yaml file
@@ -581,7 +581,7 @@ def reformat_and_write_longtable_according_to_config(input_file, outfile_name, c
581581
os.remove(tmpfile_large)
582582
if os.path.exists(outfile_name):
583583
os.remove(outfile_name)
584-
584+
585585
relevant_cols = get_relevant_columns_config_dict(config_dict_for_type)
586586
input_df_it = utils_fileread.read_file_with_pandas(input_file=input_file, sep=sep, decimal=decimal, usecols=relevant_cols, chunksize=chunksize)
587587
input_df_list = []
@@ -593,14 +593,14 @@ def reformat_and_write_longtable_according_to_config(input_file, outfile_name, c
593593
else:
594594
input_df_list.append(input_df_subset)
595595
header = False
596-
596+
597597
if file_is_large and HAS_DASK:
598598
process_with_dask(tmpfile_columnfilt=tmpfile_large , outfile_name = outfile_name, config_dict_for_type=config_dict_for_type)
599599
else:
600600
input_df = pd.concat(input_df_list)
601601
input_reshaped = reshape_input_df(input_df, config_dict_for_type)
602602
input_reshaped.to_csv(outfile_name, sep = "\t", index = None)
603-
603+
604604

605605
def adapt_subtable(input_df_subset, config_dict):
606606
input_df_subset = filter_input(config_dict.get("filters", {}), input_df_subset)
@@ -613,7 +613,7 @@ def adapt_subtable(input_df_subset, config_dict):
613613
import pandas as pd
614614
import glob
615615
import os
616-
import shutil
616+
import shutil
617617

618618
def process_with_dask(*, tmpfile_columnfilt, outfile_name, config_dict_for_type):
619619
df = dd.read_csv(tmpfile_columnfilt, sep = "\t")
@@ -718,7 +718,7 @@ def merge_sample_id_and_channels(input_df, channels, config_dict_for_type):
718718
sample_ids = list(input_df[sample_id])
719719
input_df[sample_id] = [merge_channel_and_sample_string(sample_ids[idx], channels[idx]) for idx in range(len(sample_ids))]
720720
return input_df
721-
721+
722722
def merge_channel_and_sample_string(sample, channel):
723723
return f"{sample}_{channel}"
724724

@@ -738,7 +738,7 @@ def reformat_and_write_wideformat_table(peptides_tsv, outfile_name, config_dict)
738738
input_df = input_df.rename(columns = lambda x : x.replace(quant_pre_or_suffix, ""))
739739

740740
#input_df = input_df.reset_index()
741-
741+
742742
input_df.to_csv(outfile_name, sep = '\t', index = None)
743743

744744

@@ -776,7 +776,7 @@ def import_data(input_file, input_type_to_use = None, samples_subset = None, fil
776776
file_to_read = input_file
777777
else:
778778
file_to_read = reformat_and_save_input_file(input_file=input_file, input_type_to_use=input_type_to_use, filter_dict=filter_dict)
779-
779+
780780
input_reshaped = pd.read_csv(file_to_read, sep = "\t", encoding = 'latin1', usecols=samples_subset)
781781
input_reshaped = adapt_table_for_alphabaseformat_backward_compatibility(file_is_already_formatted, input_reshaped)
782782
input_reshaped = input_reshaped.drop_duplicates(subset=config.QUANT_ID)
@@ -791,7 +791,7 @@ def add_ion_protein_headers_if_applicable(samples_subset):
791791

792792

793793
def reformat_and_save_input_file(input_file, input_type_to_use = None, filter_dict = None):
794-
794+
795795
input_type, config_dict_for_type, sep = get_input_type_and_config_dict(input_file, input_type_to_use)
796796

797797
if filter_dict is not None:
@@ -911,28 +911,28 @@ def __init__(self, input_file):
911911
def reformat_and_load_acquisition_data_frame(self):
912912

913913
input_df_it = self._iterator_function()
914-
914+
915915
input_df_list = []
916916
for input_df_subset in input_df_it:
917917
input_df_subset = self._reformatting_function(input_df_subset)
918918
input_df_list.append(input_df_subset)
919919
input_df = pd.concat(input_df_list)
920-
920+
921921
return input_df
922922

923923
def reformat_and_save_acquisition_data_frame(self, output_file):
924-
924+
925925
input_df_it = self._iterator_function()
926926
write_header = True
927-
927+
928928
for input_df_subset in input_df_it:
929929
input_df_subset = self._reformatting_function(input_df_subset)
930930
self.__write_reformatted_df_to_file__(input_df_subset, output_file, write_header)
931931
write_header = False
932932

933933
def __initialize_df_iterator__(self):
934934
return pd.read_csv(self._input_file, sep = "\t", encoding ='latin1', chunksize=1000000)
935-
935+
936936
@staticmethod
937937
def __write_reformatted_df_to_file__(reformatted_df, filepath ,write_header):
938938
reformatted_df.to_csv(filepath, header=write_header, mode='a', sep = "\t", index = None)

0 commit comments

Comments
 (0)