@@ -159,7 +159,7 @@ def _get_input_type(mq_file, input_type_to_use):
         return input_type_to_use
     else:
         return get_input_type_and_config_dict(mq_file)[0]
-    
+

 def load_input_file_and_de_duplicate_if_evidence(input_file, input_type, columns_to_add):
     input_df = pd.read_csv(input_file, sep="\t")
@@ -175,7 +175,7 @@ def load_input_file_and_de_duplicate_if_evidence(input_file, input_type, columns

     return input_df

-def create_id_to_protein_df(mq_protein_group_file, id_column):
+def create_id_to_protein_df(mq_protein_group_file, id_column):
     id_mapping_df = pd.read_csv(mq_protein_group_file, sep="\t", usecols=["Protein IDs", id_column])
     # apply lambda function to id column to split it into a list of ids
     id_mapping_df[id_column] = id_mapping_df[id_column].apply(lambda x: x.split(";"))
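The hunk above splits the semicolon-separated id column of MaxQuant's proteinGroups table into Python lists. A minimal sketch (not the repository's code, with a hypothetical `id_column` value) of the split-then-explode idea this enables:

```python
import pandas as pd

# Toy stand-in for proteinGroups.txt; "Peptide IDs" is a hypothetical id_column.
id_mapping_df = pd.DataFrame({
    "Protein IDs": ["P1;P2", "P3"],
    "Peptide IDs": ["0;1", "2"],
})
id_column = "Peptide IDs"

# Split the semicolon-separated ids into lists, as in the diff above...
id_mapping_df[id_column] = id_mapping_df[id_column].apply(lambda x: x.split(";"))
# ...then explode to one row per individual id, giving an id -> protein-group map.
exploded = id_mapping_df.explode(id_column)
print(exploded)  # rows: (P1;P2, 0), (P1;P2, 1), (P3, 2)
```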
@@ -225,15 +225,15 @@ def add_columns_to_lfq_results_table(lfq_results_df, input_file, columns_to_add)

     lfq_results_df = lfq_results_df[[x is not None for x in lfq_results_df[config.PROTEIN_ID]]]
     if (len(columns_to_add) == 0) and (len(standard_columns_for_input_type) == 0):
-        return lfq_results_df
+        return lfq_results_df
     input_df = pd.read_csv(input_file, sep="\t", usecols=all_columns).drop_duplicates(subset=protein_column_input_table)

     length_before = len(lfq_results_df.index)
     lfq_results_df_appended = pd.merge(lfq_results_df, input_df, left_on=config.PROTEIN_ID, right_on=protein_column_input_table, how='left')
     length_after = len(lfq_results_df_appended.index)

     #lfq_results_df_appended = lfq_results_df_appended.set_index(config.PROTEIN_ID)
-    
+

     assert length_before == length_after
     return lfq_results_df_appended
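The `assert length_before == length_after` in this hunk only holds because `input_df` is deduplicated on the merge key first: a left merge preserves the left table's row count exactly when the right-hand keys are unique. A self-contained illustration:

```python
import pandas as pd

left = pd.DataFrame({"protein": ["P1", "P2"], "intensity": [1.0, 2.0]})
right = pd.DataFrame({"protein": ["P1", "P1", "P2"], "gene": ["A", "A", "B"]})

# Duplicate keys on the right inflate the result: 3 rows instead of 2.
inflated = pd.merge(left, right, on="protein", how="left")
assert len(inflated) == 3

# Deduplicating the right table on the key keeps the row count stable.
merged = pd.merge(left, right.drop_duplicates(subset="protein"), on="protein", how="left")
assert len(merged) == len(left)
```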
@@ -247,7 +247,7 @@ def get_protein_column_input_table(config_dict):
     return config_dict["protein_cols"][0]

 def get_standard_columns_for_input_type(input_type):
-    
+
     if 'maxquant' in input_type:
         return ["Gene names"]
     elif 'diann' in input_type:
@@ -303,11 +303,11 @@ def remove_potential_quant_id_duplicates(data_df: pd.DataFrame):
     return data_df


-def sort_input_df_by_protein_id(data_df):
-    return data_df.sort_values(by=config.PROTEIN_ID, ignore_index=True)
+def sort_input_df_by_protein_and_quant_id(data_df):
+    return data_df.sort_values(by=[config.PROTEIN_ID, config.QUANT_ID], ignore_index=True)
+


-

 # %% ../nbdev_nbs/04_utils.ipynb 29
 import yaml
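This rename is the one behavioral change in the commit: sorting by protein id alone leaves rows within a protein group in no guaranteed order, while adding the quant id as a secondary key makes the output deterministic. A sketch with hypothetical column names standing in for `config.PROTEIN_ID` and `config.QUANT_ID`:

```python
import pandas as pd

df = pd.DataFrame({
    "protein": ["P2", "P1", "P1"],         # stand-in for config.PROTEIN_ID
    "quant_id": ["ion3", "ion2", "ion1"],  # stand-in for config.QUANT_ID
})

# Old behavior: ties within "P1" are in no guaranteed order
# (pandas' default quicksort is not stable).
old = df.sort_values(by="protein", ignore_index=True)

# New behavior: ties are broken by quant_id, so the result is reproducible
# regardless of how the input happened to be ordered.
new = df.sort_values(by=["protein", "quant_id"], ignore_index=True)
print(new)  # P1/ion1, P1/ion2, P2/ion3
```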
@@ -427,7 +427,7 @@ def merge_protein_and_ion_cols(input_df, config_dict):
 import copy
 def merge_protein_cols_and_ion_dict(input_df, config_dict):
     """[summary]
-    
+
     Args:
         input_df ([pandas dataframe]): longtable containing peptide intensity data
         config_dict ([dict[String[]]]): nested dict containing the parse information, derived from the yaml file
@@ -581,7 +581,7 @@ def reformat_and_write_longtable_according_to_config(input_file, outfile_name, c
         os.remove(tmpfile_large)
     if os.path.exists(outfile_name):
         os.remove(outfile_name)
-    
+
     relevant_cols = get_relevant_columns_config_dict(config_dict_for_type)
     input_df_it = utils_fileread.read_file_with_pandas(input_file=input_file, sep=sep, decimal=decimal, usecols=relevant_cols, chunksize=chunksize)
     input_df_list = []
@@ -593,14 +593,14 @@ def reformat_and_write_longtable_according_to_config(input_file, outfile_name, c
         else:
             input_df_list.append(input_df_subset)
             header = False
-    
+
     if file_is_large and HAS_DASK:
         process_with_dask(tmpfile_columnfilt=tmpfile_large, outfile_name=outfile_name, config_dict_for_type=config_dict_for_type)
     else:
         input_df = pd.concat(input_df_list)
         input_reshaped = reshape_input_df(input_df, config_dict_for_type)
         input_reshaped.to_csv(outfile_name, sep="\t", index=None)
-    
+

 def adapt_subtable(input_df_subset, config_dict):
     input_df_subset = filter_input(config_dict.get("filters", {}), input_df_subset)
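For context, `reformat_and_write_longtable_according_to_config` streams the input in chunks and either concatenates them in memory or, for large files, routes a column-filtered temp file through dask. A stripped-down sketch of that control flow, with hypothetical file names and without the reshaping step:

```python
import pandas as pd

try:
    import dask.dataframe as dd
    HAS_DASK = True
except ImportError:
    HAS_DASK = False

def reformat_table(input_file, outfile_name, *, file_is_large, chunksize=100_000):
    """Hypothetical condensation of the chunked-read / dask-fallback pattern."""
    chunks = pd.read_csv(input_file, sep="\t", chunksize=chunksize)
    if file_is_large and HAS_DASK:
        tmpfile = outfile_name + ".tmp"  # hypothetical temp-file name
        header = True
        for chunk in chunks:
            # Stream chunks to disk; only the first write carries the header.
            chunk.to_csv(tmpfile, sep="\t", mode="a", header=header, index=False)
            header = False
        dd.read_csv(tmpfile, sep="\t").to_csv(outfile_name, sep="\t", index=False, single_file=True)
    else:
        pd.concat(chunks).to_csv(outfile_name, sep="\t", index=False)
```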
@@ -613,7 +613,7 @@ def adapt_subtable(input_df_subset, config_dict):
 import pandas as pd
 import glob
 import os
-import shutil
+import shutil

 def process_with_dask(*, tmpfile_columnfilt, outfile_name, config_dict_for_type):
     df = dd.read_csv(tmpfile_columnfilt, sep="\t")
@@ -718,7 +718,7 @@ def merge_sample_id_and_channels(input_df, channels, config_dict_for_type):
     sample_ids = list(input_df[sample_id])
     input_df[sample_id] = [merge_channel_and_sample_string(sample_ids[idx], channels[idx]) for idx in range(len(sample_ids))]
     return input_df
-    
+
 def merge_channel_and_sample_string(sample, channel):
     return f"{sample}_{channel}"

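`merge_sample_id_and_channels` pairs each row's sample id with a channel label by position and joins them with `merge_channel_and_sample_string`. A minimal illustration (column name hypothetical):

```python
import pandas as pd

input_df = pd.DataFrame({"sample": ["runA", "runA", "runB"]})
channels = ["126", "127N", "126"]

# Positional pairing, equivalent to the list comprehension in the diff above.
input_df["sample"] = [f"{s}_{c}" for s, c in zip(input_df["sample"], channels)]
print(list(input_df["sample"]))  # ['runA_126', 'runA_127N', 'runB_126']
```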
@@ -738,7 +738,7 @@ def reformat_and_write_wideformat_table(peptides_tsv, outfile_name, config_dict)
     input_df = input_df.rename(columns=lambda x: x.replace(quant_pre_or_suffix, ""))

     #input_df = input_df.reset_index()
-    
+
     input_df.to_csv(outfile_name, sep='\t', index=None)


@@ -776,7 +776,7 @@ def import_data(input_file, input_type_to_use=None, samples_subset=None, fil
         file_to_read = input_file
     else:
         file_to_read = reformat_and_save_input_file(input_file=input_file, input_type_to_use=input_type_to_use, filter_dict=filter_dict)
-    
+
     input_reshaped = pd.read_csv(file_to_read, sep="\t", encoding='latin1', usecols=samples_subset)
     input_reshaped = adapt_table_for_alphabaseformat_backward_compatibility(file_is_already_formatted, input_reshaped)
     input_reshaped = input_reshaped.drop_duplicates(subset=config.QUANT_ID)
@@ -791,7 +791,7 @@ def add_ion_protein_headers_if_applicable(samples_subset):


 def reformat_and_save_input_file(input_file, input_type_to_use=None, filter_dict=None):
-    
+
     input_type, config_dict_for_type, sep = get_input_type_and_config_dict(input_file, input_type_to_use)

     if filter_dict is not None:
@@ -911,28 +911,28 @@ def __init__(self, input_file):
     def reformat_and_load_acquisition_data_frame(self):

         input_df_it = self._iterator_function()
-        
+
         input_df_list = []
         for input_df_subset in input_df_it:
             input_df_subset = self._reformatting_function(input_df_subset)
             input_df_list.append(input_df_subset)
         input_df = pd.concat(input_df_list)
-        
+
         return input_df

     def reformat_and_save_acquisition_data_frame(self, output_file):
-        
+
         input_df_it = self._iterator_function()
         write_header = True
-        
+
         for input_df_subset in input_df_it:
             input_df_subset = self._reformatting_function(input_df_subset)
             self.__write_reformatted_df_to_file__(input_df_subset, output_file, write_header)
             write_header = False

     def __initialize_df_iterator__(self):
         return pd.read_csv(self._input_file, sep="\t", encoding='latin1', chunksize=1000000)
-    
+
     @staticmethod
     def __write_reformatted_df_to_file__(reformatted_df, filepath, write_header):
         reformatted_df.to_csv(filepath, header=write_header, mode='a', sep="\t", index=None)
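`reformat_and_save_acquisition_data_frame` relies on pandas' append mode: the first chunk is written with the header, every later chunk without, yielding one well-formed table. A self-contained sketch of the idiom:

```python
import pandas as pd

def write_in_chunks(chunks, filepath):
    # First chunk writes the header; subsequent chunks append rows only.
    write_header = True
    for chunk in chunks:
        chunk.to_csv(filepath, header=write_header, mode="a", sep="\t", index=False)
        write_header = False

chunks = (pd.DataFrame({"a": [i], "b": [i * 2]}) for i in range(3))
write_in_chunks(chunks, "example.tsv")
```

Note that `mode="a"` appends to any pre-existing file, which is why the module deletes stale output files before writing (see the `os.remove` calls in the hunk at line 581).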