From 04f51762a97e9864c0219cdea8e13e57704cf744 Mon Sep 17 00:00:00 2001 From: Michael Strobel Date: Fri, 20 Feb 2026 14:57:31 -0800 Subject: [PATCH 1/9] Hide inital callback error --- Dash_interface/chart_section_n.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Dash_interface/chart_section_n.py b/Dash_interface/chart_section_n.py index dcc4627..c6241ea 100644 --- a/Dash_interface/chart_section_n.py +++ b/Dash_interface/chart_section_n.py @@ -672,6 +672,7 @@ def apply_structure_filter(data, siteLocatorObj): Output("download_heatmap_svg", "data"), Input("download_svg_button", "n_clicks"), State("hidden_svg", "children"), + prevent_initial_call=True, ) def download_svg(n_clicks, svg): if n_clicks: From 02b90c62e2d219a34a767a93b53d45c3e5f97420 Mon Sep 17 00:00:00 2001 From: Michael Strobel Date: Mon, 23 Feb 2026 16:51:32 -0800 Subject: [PATCH 2/9] Prepare for MF Base API update. * Swap MF Base API calls for Dashboard worker API calls * Move spectrum filtering to Dashboard worker * Update Object instantiation to match new API : --- Dash_interface/computation_n.py | 449 +++++++++++++++++++++++++++++++- ModiFinder_base | 2 +- pages/visualizer.py | 3 +- 3 files changed, 438 insertions(+), 16 deletions(-) diff --git a/Dash_interface/computation_n.py b/Dash_interface/computation_n.py index 7c9d205..7c9fb66 100644 --- a/Dash_interface/computation_n.py +++ b/Dash_interface/computation_n.py @@ -1,10 +1,394 @@ +import traceback + from dash import Dash, html, dcc, Input, Output, State, dash_table import base64 import pickle import json import copy +from urllib.parse import quote +from typing import List +import sys + +import requests from modifinder import ModiFinder, Compound from rdkit import Chem +from datetime import datetime + +adduct_mapping = {'M+H': '[M+H]+', +'[M+H]': '[M+H]+', +'[M+H]+': '[M+H]+', +'M+H]': '[M+H]+', +'M+Na': '[M+Na]+', +'[M+Na]': '[M+Na]+', +'[M+Na]+': '[M+Na]+', +'2M+Na': '[2M+Na]+', +'M2+Na': '[2M+Na]+', +'[2M+Na]+': '[2M+Na]+', +'[2M+Na]': '[2M+Na]+', +'M+K': '[M+K]+', +'[M+K]': '[M+K]+', +'[M+K]+': '[M+K]+', +'[2M+K]+': '[2M+K]+', +'2M+K': '[2M+K]+', +'[2M+K]': '[2M+K]+', +'M+H-H20': '[M-H2O+H]+', +'M+H-H2O': '[M-H2O+H]+', +'[M-H2O+H]+': '[M-H2O+H]+', +'M-H20+H': '[M-H2O+H]+', +'[M+H-H2O]+': '[M-H2O+H]+', +'M-H2O+H': '[M-H2O+H]+', +'M+H-2H2O': '[M-2H2O+H]+', +'M-2H2O+H': '[M-2H2O+H]+', +'[M-2H2O+H]+': '[M-2H2O+H]+', +'M-2(H2O)+H': '[M-2H2O+H]+', +'2M+Na-2H': '[2M-2H+Na]-', +'2M-2H+Na': '[2M-2H+Na]-', +'M-H': '[M-H]-', +'[M-H]': '[M-H]-', +'[M-H]-': '[M-H]-', +'M-H-': '[M-H]-', +'M-H1': '[M-H]-', +'3M+Na': '[3M+Na]+', +'[3M+Na]+': '[3M+Na]+', +'[M]+': '[M]+', +'M+': '[M]+', +'M-e': '[M]+', +'M2+H': '[2M+H]+', +'2M+H': '[2M+H]+', +'[2M+H]+': '[2M+H]+', +'[2M+H]': '[2M+H]+', +'[M+2H]': '[M+2H]2+', +'[M+2H]2+': '[M+2H]2+', +'M+2H]': '[M+2H]2+', +'M+2H+2': '[M+2H]2+', +'M+2H': '[M+2H]2+', +'M+acetate': '[M+CH3COOH-H]-', +'M+CH3COOH-H': '[M+CH3COOH-H]-', +'M+CH3COO': '[M+CH3COOH-H]-', +'M+ACN+H': '[M+CH3CN+H]+', +'[M+ACN+H]+': '[M+CH3CN+H]+', +'[M+H+CH3CN]': '[M+CH3CN+H]+', +'M+2Na': '[M+2Na]2+', +'M+2Na]': '[M+2Na]2+', +'M+HCOO': '[M+HCOOH-H]-', +'[M-H+HCOOH]': '[M+HCOOH-H]-', +'M+FA-H': '[M+HCOOH-H]-', +'M+formate': '[M+HCOOH-H]-', +'[M+H+HCOOH]': '[M+HCOOH-H]-', +'2M+FA-H': '[2M+HCOOH-H]-', +'[2M-H+HCOOH]': '[2M+HCOOH-H]-', +'M+NH4': '[M+NH3+H]+', +'[M+NH4]+': '[M+NH3+H]+', +'[M+NH4]': '[M+NH3+H]+', +'2M+Hac-H': '[2M+CH3COOH-H]-', +'2M-H': '[2M-H]-', +'[2M-H]': '[2M-H]-', +'2M+NH4': '[2M+NH3+H]+', +'[2M+NH4]+': '[2M+NH3+H]+', +'[2M+NH4]': '[2M+NH3+H]+', +'[2M+Ca]2+': '[2M+Ca]2+', +'[M+Ca]2+': '[M+Ca]2+', +'[3M+Ca]2+': '[3M+Ca]2+', +'[2M+Ca-H]+': '[2M-H+Ca]+', +'[2M-H2O+H]+': '[2M-H2O+H]+', +'[4M+Ca]2+': '[4M+Ca]2+', +'[3M+NH4]+': '[3M+NH3+H]+', +'3M+NH4': '[3M+NH3+H]+', +'[2M-2H2O+H]+': '[2M-2H2O+H]+', +'[M+ACN+NH4]+': '[M+CH3CN+NH3+H]+', +'[5M+Ca]2+': '[5M+Ca]2+', +'[3M+K]+': '[3M+K]+', +'[3M+Ca-H]+': '[3M-H+Ca]2+', +'[M-H+2Na]+': '[M-H+2Na]+', +'M-H+2Na': '[M-H+2Na]+', +'[M-3H2O+H]+': '[M-3H2O+H]+', +'M-3H2O+H': '[M-3H2O+H]+', +'[M-3H2O+2H]2+': '[M-3H2O+2H]2+', +'[M-2H2O+2H]2+': '[M-2H2O+2H]2+', +'[M-4H2O+H]+': '[M-4H2O+H]+', +'[M-5H2O+H]+': '[M-5H2O+H]+', +'[M+Ca-H]+': '[M+Ca-H]+', +'[2M-H+2Na]+': '[2M-H+2Na]+', +'[2M-3H2O+H]+': '[2M-3H2O+H]+', +'[M+H+Na]2+': '[M+Na+H]2+', +'[M-2H2O+NH4]+': '[M-2H2O+NH3+H]+', +'[2M-2H+Na]': '[2M-2H+Na]-', +'[M-H+CH3OH]': '[M+CH3OH-H]-', +'M+MeOH-H': '[M+CH3OH-H]-', +'M-H2O-H': '[M-H2O-H]-', +'[M-H-H2O]': '[M-H2O-H]-', +'M+Cl-': '[M+Cl]-', +'M+Cl': '[M+Cl]-', +'[M+Cl]': '[M+Cl]-', +'M+K-2H': '[M-2H+K]-', +'[M-2H+K]': '[M-2H+K]-', +'M-2H]': '[M-2H]2-', +'M-2H': '[M-2H]2-', +'M-2H-': '[M-2H]2-', +'M+Na-2H': '[M-2H+Na]-', +'[M-2H+Na]': '[M-2H+Na]-', +'M+Br': '[M+Br]-', +'3M-H': '[3M-H]-', +'[3M-H]': '[3M-H]-', +'[M+H+CH3OH]': '[M+CH3OH+H]+', +'M+CH3OH+H': '[M+CH3OH+H]+', +'[2M+H+CH3CN]': '[2M+CH3CN+H]+', +'M-CO2-H': '[M-CO2-H]-', +'[2M-2H+K]': '[2M-2H+K]-', +'2M+K-2H': '[2M-2H+K]-', +'[M+Na+CH3CN]': '[M+CH3CN+Na]+', +'M-H2+H': '[M-H2+H]-', +'M-H+Cl]': '[M-H+Cl]2-', +'M-H+Cl': '[M-H+Cl]2-', +'3M+H': '[3M+H]+', +'[3M+H]': '[3M+H]+', +'M+H-NH3': '[M-NH3+H]+', +'M-NH3+H': '[M-NH3+H]+', +'M-H+C2H2O': '[M+C2H2O-H]-', +'M+H-C2H2O': '[M+C2H2O-H]-', +'M-H+CH2O2': '[M+CH2O2-H]-', +'M+CH2O2-H': '[M+CH2O2-H]-', +'M+TFA-H': '[M+C2HF3O2-H]-', +'M-C2HF3O2-H': '[M+C2HF3O2-H]-', +'[M]1+': '[M]1+'} + + +gnps_keys_mapping = { + ## precursor + "precursor_mz": "precursor_mz", + ## charge + "precursor_charge": "precursor_charge", + "charge": "precursor_charge", + ## smiles + "smiles": "smiles", + "smile": "smiles", + ## adduct + "adduct": "adduct", + ## peaks + "peaks": "peaks", + ## instrument + "instrument": "instrument", + ## name + "name": "name", + "compound_name": "name", + ## spectrum_id + "spectrum_id": "spectrum_id", + "spectrumid": "spectrum_id", + ## exact mass + "exact_mass": "exact_mass", + "exactmass": "exact_mass", + ## mz + "fragment_mz": "mz", + "mz": "mz", + "mzs": "mz", + ## intensity + "fragment_intensities": "intensity", + "intensities": "intensity", +} + +def filter_peaks_by_ratio_to_base_peak(spectrum, ratio_to_base_peak:float = 0.01): + """Remove peaks with intensity lower than a given ratio to the base peak. + + Parameters + ---------- + ratio_to_base_peak : float (0, 1), default is 0.01 + The ratio to the base peak. + change_spectrum : bool, default is True + If True, the peaks with intensity lower than the given ratio will be removed in place. + If False, a new Spectrum object with the peaks removed will be returned. + """ + + base_peak = max(spectrum.intensity) + new_mz = [] + new_intensity = [] + for index, intensity in enumerate(spectrum.intensity): + if intensity >= float(ratio_to_base_peak) * base_peak: + new_mz.append(spectrum.mz[index]) + new_intensity.append(intensity) + + spectrum.mz = new_mz + spectrum.intensity = new_intensity + + return spectrum + +def remove_larger_than_precursor_peaks(spectrum): + """ + Remove peaks that are larger than the precursor m/z value. + """ + + new_mz = [] + new_intensity = [] + for mz, intensity in zip(spectrum.mz, spectrum.intensity): + if mz < spectrum.precursor_mz * 0.99: + new_mz.append(mz) + new_intensity.append(intensity) + + spectrum.mz = new_mz + spectrum.intensity = new_intensity + + return spectrum + + + +def harmonize_spectrum_keys(data): + """ + Parse the data to a universal format. + + This function takes a dictionary of data and converts it into a universal format. + It processes specific keys like "peaks_json" and "Charge" differently, and attempts + to convert other values to floats. If the conversion to float is successful and the + key is "Charge", it further converts the value to an integer. + + Args: + :data (dict): The input data dictionary to be parsed. + + Returns: + :dict: A dictionary with keys converted to a universal format and values processed + accordingly. + """ + def _convert_to_universal_key(key: str) -> str: + """ + Convert different types of keys to universal keys. + This function standardizes various key names to a universal format. + + Args: + :key (str): The key to be converted. + + Returns: + :str: The converted key. + """ + key = key.lower() + key = key.replace(" ", "_") + return gnps_keys_mapping.get(key, key) + + res = {} + for key, value in data.items(): + converted_key = _convert_to_universal_key(key) + if key == "peaks_json": + res['peaks'] = json.loads(value) + elif converted_key == "adduct": + res[converted_key] = adduct_mapping.get(value, value) + else: + try: + if converted_key in ["precursor_charge", "precursor_mz", "ms_level", "scan", "exact_mass"]: + value = float(value) + if converted_key in ["precursor_charge", "charge", "ms_level"]: + value = int(value) + except Exception: + raise ValueError(f"Could not convert {key} to number") + res[converted_key] = value + return res + +def get_from_metabolomics_resolver(identifier: str) -> dict: + """ + Get partial data (ms2 data) from USI + param identifier: str - USI + return: dict - dictionary of data with keys: precursor_mz, precursor_charge, mz: list, intensity: list + """ + url = 'https://metabolomics-usi.gnps2.org/json/' + "?usi1=" + identifier + try: + r = requests.get(url) + data = json.loads(r.text) + except: + raise Exception("Error in retrieving data from GNPS for identifier: {}, link: {}".format(identifier, url)) + + data = harmonize_spectrum_keys(data) + return data + +def get_data(identifier: str) -> dict: + """ + Get data from GNPS, either from USI or Accession. if the identifier points to a known item in gnps, + it will return the full data, otherwise it will return partial data (ms2 data) + param identifier: str - USI or Accession + return: dict - dictionary of data + """ + + data = dict() + data['usi'] = None + + if "mzspec" in identifier: # It's a USI + data['usi'] = identifier + + if "accession" in identifier: # It's a library spectrum + original_identifier = str(identifier) + identifier = identifier.split(":")[-1] + else: # It's a USI that isn't a library spectrum + data = get_from_metabolomics_resolver(identifier) + data['id'] = identifier + data = harmonize_spectrum_keys(data) + + # Sort peaks if needed + if 'peaks' in data and isinstance(data['peaks'], list) and len(data['peaks']) > 0: + data['peaks'] = sorted(data['peaks'], key=lambda x: x[0]) + + return data + + link = "https://external.gnps2.org/gnpsspectrum?SpectrumID={}".format(identifier) + try: + res = requests.get(link) + parsed = res.json() + except Exception: + data = get_from_metabolomics_resolver(original_identifier) + data['usi'] = original_identifier + data['id'] = identifier + data = harmonize_spectrum_keys(data) + return data + + try: + data.update(parsed['annotations'][0]) + except KeyError: + pass + try: + data.update(parsed['spectruminfo']) + except KeyError: + pass + try: + data['comments'] = parsed['comments'] + except KeyError: + pass + + data = harmonize_spectrum_keys(data) + data['id'] = identifier + + # Ensure peaks are sorted + if 'peaks' in data and isinstance(data['peaks'], list) and len(data['peaks']) > 0: + data['peaks'] = sorted(data['peaks'], key=lambda x: x[0]) + + return data + +def load_helpers( + data: List[str], + ratio_to_base_peak: float = None, + remove_large_peaks: bool = True + ) -> List[Compound]: + """ Load helpers from a list of identifiers, failing gracefully if the smile string is invalid. + """ + loaded_helpers = [] + failed_helpers = [] + for h in data: + try: + lh = get_data(h) + ch = Compound( + spectrum=lh['peaks'], + precursor_mz=lh['precursor_mz'], + precursor_charge=lh['precursor_charge'], + adduct=lh.get('adduct', None), + smiles=lh.get('smiles', None) + ) + if ratio_to_base_peak: + ch.spectrum = filter_peaks_by_ratio_to_base_peak(ch.spectrum, ratio_to_base_peak=ratio_to_base_peak) + if remove_large_peaks: + ch.spectrum = remove_larger_than_precursor_peaks(ch.spectrum) + loaded_helpers.append(ch) + except Exception as e: + # Print the traceback + print(f"Error loading helper compound {h}: {str(e)}", flush=True) + traceback.print_exc(file=sys.stderr) + failed_helpers.append(h) + raise e + + print(f"Loaded {len(loaded_helpers)} helper compounds successfully. Failed to load {len(failed_helpers)} helper compounds: {failed_helpers}", flush=True) + return loaded_helpers def get_callbacks(app): @@ -23,11 +407,27 @@ def calculate_module(data): # remove SMILES and USI from args args.pop('SMILES1', None) args.pop('SMILES2', None) - args.pop('USI1', None) - args.pop('USI2', None) - args["normalize_peaks"] = True - args["remove_large_peaks"] = True - args["ratio_to_base_peak"] = float(args["filter_peaks_variable"]) + usi1 = args.pop('USI1', None) + usi2 = args.pop('USI2', None) + + spectrum1 = get_data(usi1) + spectrum2 = get_data(usi2) + if spectrum1['adduct'] is None: + # Replace with adduct from data + spectrum1['adduct'] = data.get('adduct', None) + if spectrum2['adduct'] is None: + # Replace with adduct from data + spectrum2['adduct'] = data.get('adduct', None) + + # TODO: What to do if adduct differs at this point? + + # TODO: Filter adducts in Helpers? + + # Options propagated out of ModiFinder + ratio_to_base_peak = args.pop('filter_peaks_variable', None) + remove_large_peaks = True + + # Args to pass to ModiFinder args['ppm_tolerance'] = float(args['ppm_tolerance']) helper_compounds = args.pop('Helpers', "").strip(' \t\n\r') helper_compounds = helper_compounds.replace(" ", "") @@ -36,6 +436,11 @@ def calculate_module(data): helper_compounds = list(filter(None, helper_compounds)) # remove "" strings helper_compounds = list(filter(lambda x: x != "", helper_compounds)) + helper_compounds = load_helpers( + helper_compounds, + ratio_to_base_peak=ratio_to_base_peak, + remove_large_peaks=remove_large_peaks + ) if data["SMILES1"] == "" or data["SMILES1"] is None: data["SMILES1"] = None @@ -46,15 +451,20 @@ def calculate_module(data): try: if data['adduct']: args['adduct'] = data['adduct'] - main_compound = Compound(data['USI1'], **args) - if data["SMILES1"] is not None: - main_compound.update(smiles=data["SMILES1"]) - mod_compound = Compound(data['USI2'], **args) - if data["SMILES2"] is not None: - if data["SMILES2"] != ".": - mod_compound.update(smiles=data["SMILES2"]) - if data["SMILES2"] is None: - mod_compound.structure = None + main_compound = Compound( + spectrum=spectrum1['peaks'], + precursor_mz=spectrum1['precursor_mz'], + precursor_charge=spectrum1['precursor_charge'], + adduct=spectrum1['adduct'], + smiles=data["SMILES1"] + ) + mod_compound = Compound( + spectrum=spectrum2['peaks'], + precursor_mz=spectrum2['precursor_mz'], + precursor_charge=spectrum2['precursor_charge'], + adduct=spectrum2['adduct'], + smiles=data["SMILES2"] if data["SMILES2"] is not None and data["SMILES2"] != "" else None + ) except Exception as e: raise e @@ -67,6 +477,17 @@ def calculate_module(data): if main_compound.structure is None: return None, None, None, None, "Error loading SMILES1" + + # Perform actions for ratio_to_base_peak filter + if ratio_to_base_peak: + ratio_to_base_peak = float(ratio_to_base_peak) + main_compound.spectrum = filter_peaks_by_ratio_to_base_peak(main_compound.spectrum, ratio_to_base_peak) + mod_compound.spectrum = filter_peaks_by_ratio_to_base_peak(mod_compound.spectrum, ratio_to_base_peak) + + # Perform actions for remove_large_peaks filter + if remove_large_peaks: + main_compound.spectrum = remove_larger_than_precursor_peaks(main_compound.spectrum) + mod_compound.spectrum = remove_larger_than_precursor_peaks(mod_compound.spectrum) siteLocator = ModiFinder(main_compound, mod_compound, helpers=helper_compounds, **args) diff --git a/ModiFinder_base b/ModiFinder_base index ecb5ecf..9001142 160000 --- a/ModiFinder_base +++ b/ModiFinder_base @@ -1 +1 @@ -Subproject commit ecb5ecfc9d92602fe917ba392e1874d0a6223177 +Subproject commit 9001142467cb1adf46de3d3e2dad85b7a38ac981 diff --git a/pages/visualizer.py b/pages/visualizer.py index c9ead0f..3652ce2 100644 --- a/pages/visualizer.py +++ b/pages/visualizer.py @@ -173,7 +173,8 @@ dbc.CardHeader(html.H5("Contributors")), dbc.CardBody( [ - "Reza Shahneh - UC Riverside", + "Reza Shahneh, Ph.D. - UC Riverside", html.Br(), + "Michael Strobel - UC Riverside", html.Br(), html.Br(), html.H5("Citation"), From 274843484a4974a5adf755adfc5e3048aad2fab8 Mon Sep 17 00:00:00 2001 From: Michael Strobel Date: Mon, 23 Feb 2026 21:30:09 -0800 Subject: [PATCH 3/9] Update dash interface to reflect switch the m/z rather than index annotation. --- Dash_interface/chart_section_n.py | 125 +++++++++++------------------- Dash_interface/computation_n.py | 6 +- 2 files changed, 48 insertions(+), 83 deletions(-) diff --git a/Dash_interface/chart_section_n.py b/Dash_interface/chart_section_n.py index c6241ea..a0ae17b 100644 --- a/Dash_interface/chart_section_n.py +++ b/Dash_interface/chart_section_n.py @@ -295,6 +295,7 @@ def update_peaks(data): # , slider_value): if data == None: return {}, {"display": "none"} peaksObj = pickle.loads(base64.b64decode(data)) + main_compound_peaks = peaksObj["main_compound_peaks"] mod_compound_peaks = peaksObj["mod_compound_peaks"] matched_peaks = peaksObj["matched_peaks"] @@ -304,13 +305,15 @@ def update_peaks(data): # , slider_value): fig = go.Figure() typesInxMain = {"matched_shifted": [], "matched_unshifted": [], "unmatched": []} + + ### Assemble matched and unmatched peaks for main compound + x1 = [] y1 = [] for peak in main_compound_peaks: x1.append(peak[0]) y1.append(peak[1]) - # topPeakCount = slider_value topPeakCount = max( len(main_compound_peaks), len(mod_compound_peaks), @@ -319,29 +322,32 @@ def update_peaks(data): # , slider_value): hoverData = {"main": [], "modified": []} for i in topPeaksInxModif: flag = False - for j in matched_peaks: - if j[0] == i: + for main_match_mz, mod_match_mz in matched_peaks: + if abs(main_compound_peaks[i][0] - main_match_mz) < 1e-6: # We have found a match for our specific peak if ( abs( main_compound_peaks[i][0] - - mod_compound_peaks[j[1]][0] + - mod_match_mz ) > args["mz_tolerance"] ): - typesInxMain["matched_shifted"].append(i) - hoverData["main"].append(j[1]) + typesInxMain["matched_shifted"].append([main_match_mz, y1[i], f"Shifted Matched ({mod_match_mz:.2f}, {main_compound_peaks[i][0]:.2f})"]) else: - typesInxMain["matched_unshifted"].append(i) + typesInxMain["matched_unshifted"].append([main_match_mz, y1[i], f"Matched ({mod_match_mz:.2f}, {main_compound_peaks[i][0]:.2f})"]) flag = True - break + break if not flag: - typesInxMain["unmatched"].append(i) + typesInxMain["unmatched"].append([main_compound_peaks[i][0], y1[i], "Unmatched"]) + typesInxModified = { "matched_shifted": [], "matched_unshifted": [], "unmatched": [], } + + ### Assemble matched and unmatched peaks for modified compound + x2 = [] y2 = [] for peak in mod_compound_peaks: @@ -351,42 +357,39 @@ def update_peaks(data): # , slider_value): topPeaksInxModif = sorted(range(len(y2)), key=lambda i: y2[i])[-topPeakCount:] for i in topPeaksInxModif: flag = False - for j in matched_peaks: - if j[1] == i: + for main_match_mz, mod_match_mz in matched_peaks: + + if abs(mod_compound_peaks[i][0] - mod_match_mz) < 1e-6: # We have found a match for our specific peak if ( abs( - main_compound_peaks[j[0]][0] - - mod_compound_peaks[j[1]][0] + mod_compound_peaks[i][0] + - main_match_mz ) - > 0.1 + > args["mz_tolerance"] ): - typesInxModified["matched_shifted"].append([i, j[0]]) - hoverData["modified"].append(j[0]) + typesInxModified["matched_shifted"].append([mod_match_mz, -y2[i], f"Shifted Matched ({main_match_mz:.2f}, {mod_compound_peaks[i][0]:.2f})"]) + # hoverData["modified"].append(j[0]) else: - typesInxModified["matched_unshifted"].append([i, j[0]]) + typesInxModified["matched_unshifted"].append([mod_match_mz, -y2[i], f"Matched ({main_match_mz:.2f}, {mod_compound_peaks[i][0]:.2f})"]) flag = True break if not flag: - typesInxModified["unmatched"].append([i, -1]) + typesInxModified["unmatched"].append([mod_compound_peaks[i][0], -y2[i], "Unmatched"]) minX = min(min(x1), min(x2)) maxX = max(max(x1), max(x2)) minX = min(minX, main_precursor_mz, mod_precursor_mz) maxX = max(maxX, main_precursor_mz, mod_precursor_mz) + ### Plotting + for inx_type in typesInxMain: - x_main = [round(x1[j], 4) for j in typesInxMain[inx_type]] - y1_ = [y1[j] for j in typesInxMain[inx_type]] - y_main = [y / max(y1_) * 100 for y in y1_] - x_modified = [round(x2[j[0]], 4) for j in typesInxModified[inx_type]] - y2_ = [y2[j[0]] for j in typesInxModified[inx_type]] - y_modified = [-j / max(y2_) * 100 for j in y2_] - indicis = typesInxMain[inx_type] + [ - j[0] for j in typesInxModified[inx_type] - ] - x_ = x_main + x_modified - y_ = y_main + y_modified - colors = [colorsInxMain[inx_type]] * len(x_) + + x = [j[0] for j in typesInxMain[inx_type]] + [j[0] for j in typesInxModified[inx_type]] + y = [j[1] for j in typesInxMain[inx_type]] + [j[1] for j in typesInxModified[inx_type]] + y = [y_i / max(y) * 100 for y_i in y] + hovertext = [j[2] for j in typesInxMain[inx_type]] + [j[2] for j in typesInxModified[inx_type]] + colors = [colorsInxMain[inx_type]] * len(x) if inx_type == "unmatched": visibility = "legendonly" if len(typesInxModified["matched_shifted"]) == 0 and len( @@ -396,36 +399,20 @@ def update_peaks(data): # , slider_value): fig.add_trace( go.Bar( - x=x_, - y=y_, + x=x, + y=y, width=(maxX - minX) / 500, - hovertext=indicis, + hovertext=hovertext, name=inx_type, visible=visibility, marker_color=colors, ) ) elif inx_type == "matched_shifted": - hovertext = [] - for i in range(len(x_main)): - hovertext.append( - str(indicis[i]) - + " " - + "matched to:" - + str(hoverData["main"][i]) - ) - for i in range(len(x_main), len(x_main) + len(x_modified)): - hovertext.append( - str(indicis[i]) - + " " - + "matched to:" - + str(hoverData["modified"][i - len(x_main)]) - ) - fig.add_trace( go.Bar( - x=x_, - y=y_, + x=x, + y=y, hovertext=hovertext, name=inx_type, width=(maxX - minX) / 500, @@ -435,9 +422,9 @@ def update_peaks(data): # , slider_value): else: fig.add_trace( go.Bar( - x=x_, - y=y_, - hovertext=indicis, + x=x, + y=y, + hovertext=hovertext, name=inx_type, width=(maxX - minX) / 500, marker_color=colors, @@ -452,8 +439,6 @@ def update_peaks(data): # , slider_value): mode="lines", line=go.scatter.Line(color="black", dash="dash", width= (maxX - minX) / 600), name='known precursor m/z', - # showlegend=False, - # hoverinfo='skip' ) ) fig.add_trace( @@ -463,17 +448,9 @@ def update_peaks(data): # , slider_value): mode="lines", line=go.scatter.Line(color="black", dash="dot", width= (maxX - minX) / 600), name='modified precursor m/z', - # showlegend=False, - # hoverinfo='skip' ) ) - # minX = min(minX, main_precursor_mz, mod_precursor_mz) - # maxX = max(maxX, main_precursor_mz, mod_precursor_mz) - - # fig.update_traces( - # width=(maxX - minX) / 400, - # ) fig.update_layout( title="Alignment of Peaks", bargap=0, @@ -501,18 +478,6 @@ def update_peaks(data): # , slider_value): "zIndex": "1", } - - # @app.callback( - # Output("peak_info", "children", allow_duplicate=True), - # Input("siteLocatorObj", "data"), - # prevent_initial_call=True, - # ) - # def clear_peak_info(data): - # if data == None: - # return "" - # else: - # return "Select a peak to see its fragments" - @app.callback( Output("peak_info", "children", allow_duplicate=True), Input("peaks", "clickData"), @@ -635,18 +600,18 @@ def apply_structure_filter(data, siteLocatorObj): main_compound_peaks = [(main_compound.spectrum.mz[i], main_compound.spectrum.intensity[i]) for i in range(len(main_compound.spectrum.mz))] modified_compound = siteLocator.network.nodes[modified_compound_id]['compound'] - ind = main_compound.spectrum.get_peak_indexes(data["mz"]) - main_compound.spectrum.peak_fragments_map[ind[0]] = [data["all_fragments"][i] for i in data["selected_fragments"]] + mzs = data["mz"] + main_compound.spectrum.peak_fragment_dict[mzs[0]] = [data["all_fragments"][i] for i in data["selected_fragments"]] fragmentsObj = { - "frags_map": main_compound.spectrum.peak_fragments_map, + "frags_map": main_compound.spectrum.peak_fragment_dict, "structure": main_compound.structure, "peaks": main_compound_peaks, "Precursor_MZ": main_compound.spectrum.precursor_mz, } - fragments = list(main_compound.spectrum.peak_fragments_map[ind[0]]) + fragments = list(main_compound.spectrum.peak_fragment_dict[mzs[0]]) result_posibility_indicies = [] for fragment in fragments: fragment_indicies = [] diff --git a/Dash_interface/computation_n.py b/Dash_interface/computation_n.py index 7c9fb66..9f411b6 100644 --- a/Dash_interface/computation_n.py +++ b/Dash_interface/computation_n.py @@ -412,10 +412,10 @@ def calculate_module(data): spectrum1 = get_data(usi1) spectrum2 = get_data(usi2) - if spectrum1['adduct'] is None: + if spectrum1.get('adduct') is None: # Replace with adduct from data spectrum1['adduct'] = data.get('adduct', None) - if spectrum2['adduct'] is None: + if spectrum2.get('adduct') is None: # Replace with adduct from data spectrum2['adduct'] = data.get('adduct', None) @@ -528,7 +528,7 @@ def calculate_module(data): } fragmentsObj = { - "frags_map": main_compound.spectrum.peak_fragments_map, + "frags_map": main_compound.spectrum.peak_fragment_dict, "structure": main_compound.structure, "peaks": main_compound_peaks, "Precursor_MZ": main_compound.spectrum.precursor_mz, From ee12d9d8254a6a7a1f9eb3c8158e06fabcf03e29 Mon Sep 17 00:00:00 2001 From: Michael Strobel Date: Tue, 24 Feb 2026 17:19:20 -0800 Subject: [PATCH 4/9] Update for new API. --- Dash_interface/chart_section_n.py | 62 +++++++++++++++-------- Dash_interface/computation_n.py | 74 +++++++++++----------------- Dash_interface/fragment_selection.py | 2 +- ModiFinder_base | 2 +- 4 files changed, 71 insertions(+), 69 deletions(-) diff --git a/Dash_interface/chart_section_n.py b/Dash_interface/chart_section_n.py index a0ae17b..af1456d 100644 --- a/Dash_interface/chart_section_n.py +++ b/Dash_interface/chart_section_n.py @@ -1,3 +1,5 @@ +import sys + from dash import Dash, html, dcc, Input, Output, State, dash_table, Patch from dash.exceptions import PreventUpdate import pickle @@ -303,6 +305,11 @@ def update_peaks(data): # , slider_value): main_precursor_mz = peaksObj["main_precursor_mz"] mod_precursor_mz = peaksObj["mod_precursor_mz"] + # Convert m/z values back down from keys + main_compound_peaks = [(mz/1e6, intensity) for mz, intensity in main_compound_peaks] + mod_compound_peaks = [(mz/1e6, intensity) for mz, intensity in mod_compound_peaks] + matched_peaks = [(main_mz/1e6, mod_mz/1e6) for main_mz, mod_mz in matched_peaks] + fig = go.Figure() typesInxMain = {"matched_shifted": [], "matched_unshifted": [], "unmatched": []} @@ -331,9 +338,9 @@ def update_peaks(data): # , slider_value): ) > args["mz_tolerance"] ): - typesInxMain["matched_shifted"].append([main_match_mz, y1[i], f"Shifted Matched ({mod_match_mz:.2f}, {main_compound_peaks[i][0]:.2f})"]) + typesInxMain["matched_shifted"].append([main_match_mz, y1[i], f"{mod_match_mz:.2f}:{main_compound_peaks[i][0]:.2f}"]) else: - typesInxMain["matched_unshifted"].append([main_match_mz, y1[i], f"Matched ({mod_match_mz:.2f}, {main_compound_peaks[i][0]:.2f})"]) + typesInxMain["matched_unshifted"].append([main_match_mz, y1[i], f"{mod_match_mz:.2f}:{main_compound_peaks[i][0]:.2f}"]) flag = True break if not flag: @@ -367,10 +374,10 @@ def update_peaks(data): # , slider_value): ) > args["mz_tolerance"] ): - typesInxModified["matched_shifted"].append([mod_match_mz, -y2[i], f"Shifted Matched ({main_match_mz:.2f}, {mod_compound_peaks[i][0]:.2f})"]) + typesInxModified["matched_shifted"].append([mod_match_mz, -y2[i], f"{main_match_mz:.2f}:{mod_compound_peaks[i][0]:.2f}"]) # hoverData["modified"].append(j[0]) else: - typesInxModified["matched_unshifted"].append([mod_match_mz, -y2[i], f"Matched ({main_match_mz:.2f}, {mod_compound_peaks[i][0]:.2f})"]) + typesInxModified["matched_unshifted"].append([mod_match_mz, -y2[i], f"{main_match_mz:.2f}:{mod_compound_peaks[i][0]:.2f}"]) flag = True break if not flag: @@ -387,7 +394,13 @@ def update_peaks(data): # , slider_value): x = [j[0] for j in typesInxMain[inx_type]] + [j[0] for j in typesInxModified[inx_type]] y = [j[1] for j in typesInxMain[inx_type]] + [j[1] for j in typesInxModified[inx_type]] - y = [y_i / max(y) * 100 for y_i in y] + # Separate norm constants for pos and neg y + if len(y) == 0: + continue + + max_y = max(y) if max(y) > 0 else 1 + min_y = min(y) if min(y) < 0 else -1 + y = [y_i / max_y * 100 if y_i > 0 else -(y_i / min_y) * 100 for y_i in y] hovertext = [j[2] for j in typesInxMain[inx_type]] + [j[2] for j in typesInxModified[inx_type]] colors = [colorsInxMain[inx_type]] * len(x) if inx_type == "unmatched": @@ -498,17 +511,24 @@ def display_click_data(clickData, fragmentsObj): structure = fragmentsObj["structure"] frags_map = fragmentsObj["frags_map"] - peaks = fragmentsObj["peaks"] + peak_keys = [int(x[0]) for x in fragmentsObj["peaks"]] - peak_index = -1 - for i, peak in enumerate(peaks): - if abs(peak[0]- clicked_peak_x)/clicked_peak_x*1000000 < 40: - peak_index = i + peak_key = None + for k in peak_keys: + if abs((k/1e6)- clicked_peak_x)/clicked_peak_x*1000000 < 40: + peak_key = k # Cast to int (numpy ints won't key) break - if peak_index == -1: - return "error in finding peak index" + if peak_key is None: + raise ValueError(f"Clicked peak not found in peaks list "f"(clicked_peak_x: {clicked_peak_x}, peaks: {peak_keys})") + + try: + fragments = list(frags_map[peak_key]) + except KeyError: + # Check for the closest key + closest_key = min(frags_map.keys(), key=lambda k: abs(k/1e6 - clicked_peak_x)) + + raise ValueError(f"Fragment map does not contain peak key {peak_key} (type {type(peak_key)}), closest key is {closest_key} (type {type(closest_key)} with m/z {closest_key/1e6}, clicked m/z was {clicked_peak_x}") - fragments = list(frags_map[peak_index]) result_posibility_indicies = [] for fragment in fragments: fragment_indicies = [] @@ -529,9 +549,9 @@ def display_click_data(clickData, fragmentsObj): ) except: import traceback - - traceback.print_exc() + traceback.print_exc(file=sys.stderr) return "siteLocator object not found" + return None # change the color of the bar when clicked @@ -564,9 +584,9 @@ def change_bar_color(clickData, figure): # figure["data"][i]["marker"]["color"][j] = "green" # if matched shifted peak, highlight the corresponding peak in the other bar if figure["data"][i]["name"] == "matched_shifted": - index = figure["data"][i]["hovertext"][j].split(":")[1] + peak_x = str(figure["data"][i]["hovertext"][j].split(":")[0]).strip() for l in range(len(figure["data"][i]["x"])): - if (figure["data"][i]["hovertext"][l].split(" ")[0] == index and figure["data"][i]["y"][l] < 0): + if (str(figure["data"][i]["hovertext"][l].split(':')[1]).strip() == peak_x and figure["data"][i]["y"][l] < 0): patched_figure["data"][i]["marker"]["color"][l] = "olive" break @@ -582,7 +602,7 @@ def change_bar_color(clickData, figure): @app.callback( [Output("siteLocatorObj", "data", allow_duplicate=True), Output("peak_info", "children", allow_duplicate=True), - Output('fragmentsObj', 'data', allow_duplicate=True)], + Output("fragmentsObj", "data", allow_duplicate=True)], Input(FragmentsDisplayAIO.ids.fragment_data("fragmentDisplay"), "data"), State("siteLocatorObj", "data"), prevent_initial_call=True, @@ -597,11 +617,11 @@ def apply_structure_filter(data, siteLocatorObj): modified_compound_id = siteLocator._get_unknown() main_compound_id = siteLocator._get_known_neighbor(modified_compound_id) main_compound = siteLocator.network.nodes[main_compound_id]['compound'] - main_compound_peaks = [(main_compound.spectrum.mz[i], main_compound.spectrum.intensity[i]) for i in range(len(main_compound.spectrum.mz))] + main_compound_peaks = [(main_compound.spectrum.mz_key[i], main_compound.spectrum.intensity[i]) for i in range(len(main_compound.spectrum.mz_key))] modified_compound = siteLocator.network.nodes[modified_compound_id]['compound'] mzs = data["mz"] - main_compound.spectrum.peak_fragment_dict[mzs[0]] = [data["all_fragments"][i] for i in data["selected_fragments"]] + main_compound.spectrum.peak_fragment_dict[int(mzs[0])] = [data["all_fragments"][i] for i in data["selected_fragments"]] fragmentsObj = { "frags_map": main_compound.spectrum.peak_fragment_dict, @@ -611,7 +631,7 @@ def apply_structure_filter(data, siteLocatorObj): } - fragments = list(main_compound.spectrum.peak_fragment_dict[mzs[0]]) + fragments = list(main_compound.spectrum.peak_fragment_dict[int(mzs[0])]) result_posibility_indicies = [] for fragment in fragments: fragment_indicies = [] diff --git a/Dash_interface/computation_n.py b/Dash_interface/computation_n.py index 9f411b6..296f2c1 100644 --- a/Dash_interface/computation_n.py +++ b/Dash_interface/computation_n.py @@ -17,10 +17,12 @@ adduct_mapping = {'M+H': '[M+H]+', '[M+H]': '[M+H]+', '[M+H]+': '[M+H]+', +'[M+H]1+': '[M+H]+', 'M+H]': '[M+H]+', 'M+Na': '[M+Na]+', '[M+Na]': '[M+Na]+', '[M+Na]+': '[M+Na]+', +'[M+Na]1+': '[M+Na]+', '2M+Na': '[2M+Na]+', 'M2+Na': '[2M+Na]+', '[2M+Na]+': '[2M+Na]+', @@ -28,6 +30,7 @@ 'M+K': '[M+K]+', '[M+K]': '[M+K]+', '[M+K]+': '[M+K]+', +'[M+K]1+': '[M+K]+', '[2M+K]+': '[2M+K]+', '2M+K': '[2M+K]+', '[2M+K]': '[2M+K]+', @@ -46,6 +49,7 @@ 'M-H': '[M-H]-', '[M-H]': '[M-H]-', '[M-H]-': '[M-H]-', +'[M-H]1-': '[M-H]-', 'M-H-': '[M-H]-', 'M-H1': '[M-H]-', '3M+Na': '[3M+Na]+', @@ -79,6 +83,7 @@ '[2M-H+HCOOH]': '[2M+HCOOH-H]-', 'M+NH4': '[M+NH3+H]+', '[M+NH4]+': '[M+NH3+H]+', +'[M+NH4]1+': '[M+NH3+H]+', '[M+NH4]': '[M+NH3+H]+', '2M+Hac-H': '[2M+CH3COOH-H]-', '2M-H': '[2M-H]-', @@ -120,6 +125,7 @@ 'M+Cl-': '[M+Cl]-', 'M+Cl': '[M+Cl]-', '[M+Cl]': '[M+Cl]-', +'[M+Cl]1-': '[M+Cl]-', 'M+K-2H': '[M-2H+K]-', '[M-2H+K]': '[M-2H+K]-', 'M-2H]': '[M-2H]2-', @@ -128,6 +134,7 @@ 'M+Na-2H': '[M-2H+Na]-', '[M-2H+Na]': '[M-2H+Na]-', 'M+Br': '[M+Br]-', +'[M+Br]1-': '[M+Br]-', '3M-H': '[3M-H]-', '[3M-H]': '[3M-H]-', '[M+H+CH3OH]': '[M+CH3OH+H]+', @@ -203,10 +210,10 @@ def filter_peaks_by_ratio_to_base_peak(spectrum, ratio_to_base_peak:float = 0.01 new_intensity = [] for index, intensity in enumerate(spectrum.intensity): if intensity >= float(ratio_to_base_peak) * base_peak: - new_mz.append(spectrum.mz[index]) + new_mz.append(spectrum.mz_key[index]) # TODO, swap back once mz_key full integrated new_intensity.append(intensity) - spectrum.mz = new_mz + spectrum.mz_key = new_mz spectrum.intensity = new_intensity return spectrum @@ -218,12 +225,12 @@ def remove_larger_than_precursor_peaks(spectrum): new_mz = [] new_intensity = [] - for mz, intensity in zip(spectrum.mz, spectrum.intensity): - if mz < spectrum.precursor_mz * 0.99: + for mz, intensity in zip(spectrum.mz_key, spectrum.intensity): # TODO, swap back once mz_key full integrated + if mz < (spectrum.precursor_mz * 1e6)* 0.99: new_mz.append(mz) new_intensity.append(intensity) - spectrum.mz = new_mz + spectrum.mz_key = new_mz spectrum.intensity = new_intensity return spectrum @@ -320,7 +327,6 @@ def get_data(identifier: str) -> dict: # Sort peaks if needed if 'peaks' in data and isinstance(data['peaks'], list) and len(data['peaks']) > 0: data['peaks'] = sorted(data['peaks'], key=lambda x: x[0]) - return data link = "https://external.gnps2.org/gnpsspectrum?SpectrumID={}".format(identifier) @@ -353,7 +359,6 @@ def get_data(identifier: str) -> dict: # Ensure peaks are sorted if 'peaks' in data and isinstance(data['peaks'], list) and len(data['peaks']) > 0: data['peaks'] = sorted(data['peaks'], key=lambda x: x[0]) - return data def load_helpers( @@ -393,7 +398,13 @@ def load_helpers( def get_callbacks(app): @app.callback( - [Output('siteLocatorObj', 'data'), Output('siriusData', 'children'), Output('peaksObj', 'data'), Output('fragmentsObj', 'data')], Output('error-input', 'children'), + [ + Output('siteLocatorObj', 'data'), + Output('siriusData', 'children'), + Output('peaksObj', 'data'), + Output('fragmentsObj', 'data') + ], + Output('error-input', 'children'), Input('InputData', 'data'), ) def calculate_module(data): @@ -414,10 +425,10 @@ def calculate_module(data): spectrum2 = get_data(usi2) if spectrum1.get('adduct') is None: # Replace with adduct from data - spectrum1['adduct'] = data.get('adduct', None) + spectrum1['adduct'] = adduct_mapping[data['adduct']] # Should raise error here if we don't know what it is if spectrum2.get('adduct') is None: # Replace with adduct from data - spectrum2['adduct'] = data.get('adduct', None) + spectrum2['adduct'] = adduct_mapping[data['adduct']] # TODO: What to do if adduct differs at this point? @@ -449,8 +460,9 @@ def calculate_module(data): data["SMILES2"] = None try: - if data['adduct']: - args['adduct'] = data['adduct'] + # Use known compound adduct + args['adduct'] = spectrum1.get('adduct', None) + main_compound = Compound( spectrum=spectrum1['peaks'], precursor_mz=spectrum1['precursor_mz'], @@ -490,48 +502,18 @@ def calculate_module(data): mod_compound.spectrum = remove_larger_than_precursor_peaks(mod_compound.spectrum) siteLocator = ModiFinder(main_compound, mod_compound, helpers=helper_compounds, **args) - - if mod_compound.structure is not None: - if not (mod_compound.structure.HasSubstructMatch(main_compound.structure) or main_compound.structure.HasSubstructMatch(mod_compound.structure)): - return None, None, None, None, "None of the structures are substructures of the other" - if mod_compound.structure.HasSubstructMatch(main_compound.structure) and main_compound.structure.HasSubstructMatch(mod_compound.structure): - return None, None, None, None, "Structures are the same" + peaksObj, fragmentsObj = siteLocator.get_result() siriusText = "SIRIUS data was not available" - # else: - # print("SIRIUS data was not available", data['USI1']) - # if siteLocator.main_compound.Precursor_MZ > siteLocator.modified_compound.Precursor_MZ: - # return None, "Molecule precursor mass is higher than modified precursor mass", siriusText - # else: + args = copy.deepcopy(data) # remove SMILES and USI from args args.pop('SMILES1', None) args.pop('SMILES2', None) args.pop('USI1', None) args.pop('USI2', None) - - main_compound_peaks = [(main_compound.spectrum.mz[i], main_compound.spectrum.intensity[i]) for i in range(len(main_compound.spectrum.mz))] - mod_compound_peaks = [(mod_compound.spectrum.mz[i], mod_compound.spectrum.intensity[i]) for i in range(len(mod_compound.spectrum.mz))] - matched_peaks = siteLocator.get_edge_detail(main_compound.id, mod_compound.id) - if matched_peaks is None: - matched_peaks = [] - else: - matched_peaks = matched_peaks.get_matches_pairs() - peaksObj = { - "main_compound_peaks": main_compound_peaks, - "mod_compound_peaks": mod_compound_peaks, - "matched_peaks": matched_peaks, - "args": args, - "main_precursor_mz": main_compound.spectrum.precursor_mz, - "mod_precursor_mz": mod_compound.spectrum.precursor_mz, - } - - fragmentsObj = { - "frags_map": main_compound.spectrum.peak_fragment_dict, - "structure": main_compound.structure, - "peaks": main_compound_peaks, - "Precursor_MZ": main_compound.spectrum.precursor_mz, - } + + peaksObj.update({"args": args}) return base64.b64encode(pickle.dumps(siteLocator)).decode(), siriusText, base64.b64encode(pickle.dumps(peaksObj)).decode(), base64.b64encode(pickle.dumps(fragmentsObj)).decode(), None \ No newline at end of file diff --git a/Dash_interface/fragment_selection.py b/Dash_interface/fragment_selection.py index 96ab102..0f370b6 100644 --- a/Dash_interface/fragment_selection.py +++ b/Dash_interface/fragment_selection.py @@ -88,7 +88,7 @@ def __init__(self, fragments_indicies, mol, info, aio_id=None, *args, **kwargs): ), ], ) - print("fragments_indicies", fragments_indicies) + self.aio_id = aio_id super().__init__( children=[ diff --git a/ModiFinder_base b/ModiFinder_base index 9001142..72f2455 160000 --- a/ModiFinder_base +++ b/ModiFinder_base @@ -1 +1 @@ -Subproject commit 9001142467cb1adf46de3d3e2dad85b7a38ac981 +Subproject commit 72f2455760453716abb85c1618f1c1f5752d3235 From f397350de9210314b5d8988fe27bbf99c2b1710c Mon Sep 17 00:00:00 2001 From: Michael Strobel Date: Tue, 24 Feb 2026 17:25:34 -0800 Subject: [PATCH 5/9] Deprecate USI and spectrum input for molcule drawer. --- pages/visualizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pages/visualizer.py b/pages/visualizer.py index 3652ce2..51c4647 100644 --- a/pages/visualizer.py +++ b/pages/visualizer.py @@ -117,11 +117,11 @@ html.H1("Molecule Drawer"), dbc.Card([ dbc.InputGroup( - [dbc.InputGroupText('Smiles1'), dbc.Input(placeholder='SMILES, InChI, Spectrum ID, or USI',id='Mol1', value = "")], + [dbc.InputGroupText('Smiles1'), dbc.Input(placeholder='SMILES or InChI',id='Mol1', value = "")], style = {'width': '90vw', 'margin': '1vh auto'} ), dbc.InputGroup( - [dbc.InputGroupText('Smiles2'), dbc.Input(placeholder='SMILES, InChI, Spectrum ID, or USI',id='Mol2', value = "")], + [dbc.InputGroupText('Smiles2'), dbc.Input(placeholder='SMILES or InChI',id='Mol2', value = "")], style = {'width': '90vw', 'margin': '1vh auto'} ), dbc.Checklist( From 952e612ebde478942d52e0dba29953caa967c8a7 Mon Sep 17 00:00:00 2001 From: Michael Strobel Date: Wed, 25 Feb 2026 11:27:36 -0800 Subject: [PATCH 6/9] Update visualizer. --- Dash_interface/computation_n.py | 412 +------------------------------- app_utils.py | 367 ++++++++++++++++++++++++++++ pages/visualizer.py | 16 +- 3 files changed, 390 insertions(+), 405 deletions(-) create mode 100644 app_utils.py diff --git a/Dash_interface/computation_n.py b/Dash_interface/computation_n.py index 296f2c1..6e0c2b9 100644 --- a/Dash_interface/computation_n.py +++ b/Dash_interface/computation_n.py @@ -3,397 +3,12 @@ from dash import Dash, html, dcc, Input, Output, State, dash_table import base64 import pickle -import json import copy from urllib.parse import quote -from typing import List -import sys -import requests from modifinder import ModiFinder, Compound -from rdkit import Chem -from datetime import datetime -adduct_mapping = {'M+H': '[M+H]+', -'[M+H]': '[M+H]+', -'[M+H]+': '[M+H]+', -'[M+H]1+': '[M+H]+', -'M+H]': '[M+H]+', -'M+Na': '[M+Na]+', -'[M+Na]': '[M+Na]+', -'[M+Na]+': '[M+Na]+', -'[M+Na]1+': '[M+Na]+', -'2M+Na': '[2M+Na]+', -'M2+Na': '[2M+Na]+', -'[2M+Na]+': '[2M+Na]+', -'[2M+Na]': '[2M+Na]+', -'M+K': '[M+K]+', -'[M+K]': '[M+K]+', -'[M+K]+': '[M+K]+', -'[M+K]1+': '[M+K]+', -'[2M+K]+': '[2M+K]+', -'2M+K': '[2M+K]+', -'[2M+K]': '[2M+K]+', -'M+H-H20': '[M-H2O+H]+', -'M+H-H2O': '[M-H2O+H]+', -'[M-H2O+H]+': '[M-H2O+H]+', -'M-H20+H': '[M-H2O+H]+', -'[M+H-H2O]+': '[M-H2O+H]+', -'M-H2O+H': '[M-H2O+H]+', -'M+H-2H2O': '[M-2H2O+H]+', -'M-2H2O+H': '[M-2H2O+H]+', -'[M-2H2O+H]+': '[M-2H2O+H]+', -'M-2(H2O)+H': '[M-2H2O+H]+', -'2M+Na-2H': '[2M-2H+Na]-', -'2M-2H+Na': '[2M-2H+Na]-', -'M-H': '[M-H]-', -'[M-H]': '[M-H]-', -'[M-H]-': '[M-H]-', -'[M-H]1-': '[M-H]-', -'M-H-': '[M-H]-', -'M-H1': '[M-H]-', -'3M+Na': '[3M+Na]+', -'[3M+Na]+': '[3M+Na]+', -'[M]+': '[M]+', -'M+': '[M]+', -'M-e': '[M]+', -'M2+H': '[2M+H]+', -'2M+H': '[2M+H]+', -'[2M+H]+': '[2M+H]+', -'[2M+H]': '[2M+H]+', -'[M+2H]': '[M+2H]2+', -'[M+2H]2+': '[M+2H]2+', -'M+2H]': '[M+2H]2+', -'M+2H+2': '[M+2H]2+', -'M+2H': '[M+2H]2+', -'M+acetate': '[M+CH3COOH-H]-', -'M+CH3COOH-H': '[M+CH3COOH-H]-', -'M+CH3COO': '[M+CH3COOH-H]-', -'M+ACN+H': '[M+CH3CN+H]+', -'[M+ACN+H]+': '[M+CH3CN+H]+', -'[M+H+CH3CN]': '[M+CH3CN+H]+', -'M+2Na': '[M+2Na]2+', -'M+2Na]': '[M+2Na]2+', -'M+HCOO': '[M+HCOOH-H]-', -'[M-H+HCOOH]': '[M+HCOOH-H]-', -'M+FA-H': '[M+HCOOH-H]-', -'M+formate': '[M+HCOOH-H]-', -'[M+H+HCOOH]': '[M+HCOOH-H]-', -'2M+FA-H': '[2M+HCOOH-H]-', -'[2M-H+HCOOH]': '[2M+HCOOH-H]-', -'M+NH4': '[M+NH3+H]+', -'[M+NH4]+': '[M+NH3+H]+', -'[M+NH4]1+': '[M+NH3+H]+', -'[M+NH4]': '[M+NH3+H]+', -'2M+Hac-H': '[2M+CH3COOH-H]-', -'2M-H': '[2M-H]-', -'[2M-H]': '[2M-H]-', -'2M+NH4': '[2M+NH3+H]+', -'[2M+NH4]+': '[2M+NH3+H]+', -'[2M+NH4]': '[2M+NH3+H]+', -'[2M+Ca]2+': '[2M+Ca]2+', -'[M+Ca]2+': '[M+Ca]2+', -'[3M+Ca]2+': '[3M+Ca]2+', -'[2M+Ca-H]+': '[2M-H+Ca]+', -'[2M-H2O+H]+': '[2M-H2O+H]+', -'[4M+Ca]2+': '[4M+Ca]2+', -'[3M+NH4]+': '[3M+NH3+H]+', -'3M+NH4': '[3M+NH3+H]+', -'[2M-2H2O+H]+': '[2M-2H2O+H]+', -'[M+ACN+NH4]+': '[M+CH3CN+NH3+H]+', -'[5M+Ca]2+': '[5M+Ca]2+', -'[3M+K]+': '[3M+K]+', -'[3M+Ca-H]+': '[3M-H+Ca]2+', -'[M-H+2Na]+': '[M-H+2Na]+', -'M-H+2Na': '[M-H+2Na]+', -'[M-3H2O+H]+': '[M-3H2O+H]+', -'M-3H2O+H': '[M-3H2O+H]+', -'[M-3H2O+2H]2+': '[M-3H2O+2H]2+', -'[M-2H2O+2H]2+': '[M-2H2O+2H]2+', -'[M-4H2O+H]+': '[M-4H2O+H]+', -'[M-5H2O+H]+': '[M-5H2O+H]+', -'[M+Ca-H]+': '[M+Ca-H]+', -'[2M-H+2Na]+': '[2M-H+2Na]+', -'[2M-3H2O+H]+': '[2M-3H2O+H]+', -'[M+H+Na]2+': '[M+Na+H]2+', -'[M-2H2O+NH4]+': '[M-2H2O+NH3+H]+', -'[2M-2H+Na]': '[2M-2H+Na]-', -'[M-H+CH3OH]': '[M+CH3OH-H]-', -'M+MeOH-H': '[M+CH3OH-H]-', -'M-H2O-H': '[M-H2O-H]-', -'[M-H-H2O]': '[M-H2O-H]-', -'M+Cl-': '[M+Cl]-', -'M+Cl': '[M+Cl]-', -'[M+Cl]': '[M+Cl]-', -'[M+Cl]1-': '[M+Cl]-', -'M+K-2H': '[M-2H+K]-', -'[M-2H+K]': '[M-2H+K]-', -'M-2H]': '[M-2H]2-', -'M-2H': '[M-2H]2-', -'M-2H-': '[M-2H]2-', -'M+Na-2H': '[M-2H+Na]-', -'[M-2H+Na]': '[M-2H+Na]-', -'M+Br': '[M+Br]-', -'[M+Br]1-': '[M+Br]-', -'3M-H': '[3M-H]-', -'[3M-H]': '[3M-H]-', -'[M+H+CH3OH]': '[M+CH3OH+H]+', -'M+CH3OH+H': '[M+CH3OH+H]+', -'[2M+H+CH3CN]': '[2M+CH3CN+H]+', -'M-CO2-H': '[M-CO2-H]-', -'[2M-2H+K]': '[2M-2H+K]-', -'2M+K-2H': '[2M-2H+K]-', -'[M+Na+CH3CN]': '[M+CH3CN+Na]+', -'M-H2+H': '[M-H2+H]-', -'M-H+Cl]': '[M-H+Cl]2-', -'M-H+Cl': '[M-H+Cl]2-', -'3M+H': '[3M+H]+', -'[3M+H]': '[3M+H]+', -'M+H-NH3': '[M-NH3+H]+', -'M-NH3+H': '[M-NH3+H]+', -'M-H+C2H2O': '[M+C2H2O-H]-', -'M+H-C2H2O': '[M+C2H2O-H]-', -'M-H+CH2O2': '[M+CH2O2-H]-', -'M+CH2O2-H': '[M+CH2O2-H]-', -'M+TFA-H': '[M+C2HF3O2-H]-', -'M-C2HF3O2-H': '[M+C2HF3O2-H]-', -'[M]1+': '[M]1+'} - - -gnps_keys_mapping = { - ## precursor - "precursor_mz": "precursor_mz", - ## charge - "precursor_charge": "precursor_charge", - "charge": "precursor_charge", - ## smiles - "smiles": "smiles", - "smile": "smiles", - ## adduct - "adduct": "adduct", - ## peaks - "peaks": "peaks", - ## instrument - "instrument": "instrument", - ## name - "name": "name", - "compound_name": "name", - ## spectrum_id - "spectrum_id": "spectrum_id", - "spectrumid": "spectrum_id", - ## exact mass - "exact_mass": "exact_mass", - "exactmass": "exact_mass", - ## mz - "fragment_mz": "mz", - "mz": "mz", - "mzs": "mz", - ## intensity - "fragment_intensities": "intensity", - "intensities": "intensity", -} - -def filter_peaks_by_ratio_to_base_peak(spectrum, ratio_to_base_peak:float = 0.01): - """Remove peaks with intensity lower than a given ratio to the base peak. - - Parameters - ---------- - ratio_to_base_peak : float (0, 1), default is 0.01 - The ratio to the base peak. - change_spectrum : bool, default is True - If True, the peaks with intensity lower than the given ratio will be removed in place. - If False, a new Spectrum object with the peaks removed will be returned. - """ - - base_peak = max(spectrum.intensity) - new_mz = [] - new_intensity = [] - for index, intensity in enumerate(spectrum.intensity): - if intensity >= float(ratio_to_base_peak) * base_peak: - new_mz.append(spectrum.mz_key[index]) # TODO, swap back once mz_key full integrated - new_intensity.append(intensity) - - spectrum.mz_key = new_mz - spectrum.intensity = new_intensity - - return spectrum - -def remove_larger_than_precursor_peaks(spectrum): - """ - Remove peaks that are larger than the precursor m/z value. - """ - - new_mz = [] - new_intensity = [] - for mz, intensity in zip(spectrum.mz_key, spectrum.intensity): # TODO, swap back once mz_key full integrated - if mz < (spectrum.precursor_mz * 1e6)* 0.99: - new_mz.append(mz) - new_intensity.append(intensity) - - spectrum.mz_key = new_mz - spectrum.intensity = new_intensity - - return spectrum - - - -def harmonize_spectrum_keys(data): - """ - Parse the data to a universal format. - - This function takes a dictionary of data and converts it into a universal format. - It processes specific keys like "peaks_json" and "Charge" differently, and attempts - to convert other values to floats. If the conversion to float is successful and the - key is "Charge", it further converts the value to an integer. - - Args: - :data (dict): The input data dictionary to be parsed. - - Returns: - :dict: A dictionary with keys converted to a universal format and values processed - accordingly. - """ - def _convert_to_universal_key(key: str) -> str: - """ - Convert different types of keys to universal keys. - This function standardizes various key names to a universal format. - - Args: - :key (str): The key to be converted. - - Returns: - :str: The converted key. - """ - key = key.lower() - key = key.replace(" ", "_") - return gnps_keys_mapping.get(key, key) - - res = {} - for key, value in data.items(): - converted_key = _convert_to_universal_key(key) - if key == "peaks_json": - res['peaks'] = json.loads(value) - elif converted_key == "adduct": - res[converted_key] = adduct_mapping.get(value, value) - else: - try: - if converted_key in ["precursor_charge", "precursor_mz", "ms_level", "scan", "exact_mass"]: - value = float(value) - if converted_key in ["precursor_charge", "charge", "ms_level"]: - value = int(value) - except Exception: - raise ValueError(f"Could not convert {key} to number") - res[converted_key] = value - return res - -def get_from_metabolomics_resolver(identifier: str) -> dict: - """ - Get partial data (ms2 data) from USI - param identifier: str - USI - return: dict - dictionary of data with keys: precursor_mz, precursor_charge, mz: list, intensity: list - """ - url = 'https://metabolomics-usi.gnps2.org/json/' + "?usi1=" + identifier - try: - r = requests.get(url) - data = json.loads(r.text) - except: - raise Exception("Error in retrieving data from GNPS for identifier: {}, link: {}".format(identifier, url)) - - data = harmonize_spectrum_keys(data) - return data - -def get_data(identifier: str) -> dict: - """ - Get data from GNPS, either from USI or Accession. if the identifier points to a known item in gnps, - it will return the full data, otherwise it will return partial data (ms2 data) - param identifier: str - USI or Accession - return: dict - dictionary of data - """ - - data = dict() - data['usi'] = None - - if "mzspec" in identifier: # It's a USI - data['usi'] = identifier - - if "accession" in identifier: # It's a library spectrum - original_identifier = str(identifier) - identifier = identifier.split(":")[-1] - else: # It's a USI that isn't a library spectrum - data = get_from_metabolomics_resolver(identifier) - data['id'] = identifier - data = harmonize_spectrum_keys(data) - - # Sort peaks if needed - if 'peaks' in data and isinstance(data['peaks'], list) and len(data['peaks']) > 0: - data['peaks'] = sorted(data['peaks'], key=lambda x: x[0]) - return data - - link = "https://external.gnps2.org/gnpsspectrum?SpectrumID={}".format(identifier) - try: - res = requests.get(link) - parsed = res.json() - except Exception: - data = get_from_metabolomics_resolver(original_identifier) - data['usi'] = original_identifier - data['id'] = identifier - data = harmonize_spectrum_keys(data) - return data - - try: - data.update(parsed['annotations'][0]) - except KeyError: - pass - try: - data.update(parsed['spectruminfo']) - except KeyError: - pass - try: - data['comments'] = parsed['comments'] - except KeyError: - pass - - data = harmonize_spectrum_keys(data) - data['id'] = identifier - - # Ensure peaks are sorted - if 'peaks' in data and isinstance(data['peaks'], list) and len(data['peaks']) > 0: - data['peaks'] = sorted(data['peaks'], key=lambda x: x[0]) - return data - -def load_helpers( - data: List[str], - ratio_to_base_peak: float = None, - remove_large_peaks: bool = True - ) -> List[Compound]: - """ Load helpers from a list of identifiers, failing gracefully if the smile string is invalid. - """ - loaded_helpers = [] - failed_helpers = [] - for h in data: - try: - lh = get_data(h) - ch = Compound( - spectrum=lh['peaks'], - precursor_mz=lh['precursor_mz'], - precursor_charge=lh['precursor_charge'], - adduct=lh.get('adduct', None), - smiles=lh.get('smiles', None) - ) - if ratio_to_base_peak: - ch.spectrum = filter_peaks_by_ratio_to_base_peak(ch.spectrum, ratio_to_base_peak=ratio_to_base_peak) - if remove_large_peaks: - ch.spectrum = remove_larger_than_precursor_peaks(ch.spectrum) - loaded_helpers.append(ch) - except Exception as e: - # Print the traceback - print(f"Error loading helper compound {h}: {str(e)}", flush=True) - traceback.print_exc(file=sys.stderr) - failed_helpers.append(h) - raise e - - print(f"Loaded {len(loaded_helpers)} helper compounds successfully. Failed to load {len(failed_helpers)} helper compounds: {failed_helpers}", flush=True) - return loaded_helpers +from app_utils import get_data, load_helpers, filter_peaks_by_ratio_to_base_peak, adduct_mapping def get_callbacks(app): @@ -436,7 +51,6 @@ def calculate_module(data): # Options propagated out of ModiFinder ratio_to_base_peak = args.pop('filter_peaks_variable', None) - remove_large_peaks = True # Args to pass to ModiFinder args['ppm_tolerance'] = float(args['ppm_tolerance']) @@ -450,7 +64,6 @@ def calculate_module(data): helper_compounds = load_helpers( helper_compounds, ratio_to_base_peak=ratio_to_base_peak, - remove_large_peaks=remove_large_peaks ) if data["SMILES1"] == "" or data["SMILES1"] is None: @@ -463,15 +76,22 @@ def calculate_module(data): # Use known compound adduct args['adduct'] = spectrum1.get('adduct', None) + spectrum1_peaks = spectrum1['peaks'] + spectrum2_peaks = spectrum2['peaks'] + + if ratio_to_base_peak: + spectrum1_peaks = filter_peaks_by_ratio_to_base_peak(spectrum1_peaks, ratio_to_base_peak=ratio_to_base_peak) + spectrum2_peaks = filter_peaks_by_ratio_to_base_peak(spectrum2_peaks, ratio_to_base_peak=ratio_to_base_peak) + main_compound = Compound( - spectrum=spectrum1['peaks'], + spectrum=spectrum1_peaks, precursor_mz=spectrum1['precursor_mz'], precursor_charge=spectrum1['precursor_charge'], adduct=spectrum1['adduct'], smiles=data["SMILES1"] ) mod_compound = Compound( - spectrum=spectrum2['peaks'], + spectrum=spectrum2_peaks, precursor_mz=spectrum2['precursor_mz'], precursor_charge=spectrum2['precursor_charge'], adduct=spectrum2['adduct'], @@ -479,7 +99,6 @@ def calculate_module(data): ) except Exception as e: - raise e # if exception is of type value error, return the error message if type(e) == ValueError: return None, None, None, None, str(e) @@ -489,17 +108,6 @@ def calculate_module(data): if main_compound.structure is None: return None, None, None, None, "Error loading SMILES1" - - # Perform actions for ratio_to_base_peak filter - if ratio_to_base_peak: - ratio_to_base_peak = float(ratio_to_base_peak) - main_compound.spectrum = filter_peaks_by_ratio_to_base_peak(main_compound.spectrum, ratio_to_base_peak) - mod_compound.spectrum = filter_peaks_by_ratio_to_base_peak(mod_compound.spectrum, ratio_to_base_peak) - - # Perform actions for remove_large_peaks filter - if remove_large_peaks: - main_compound.spectrum = remove_larger_than_precursor_peaks(main_compound.spectrum) - mod_compound.spectrum = remove_larger_than_precursor_peaks(mod_compound.spectrum) siteLocator = ModiFinder(main_compound, mod_compound, helpers=helper_compounds, **args) diff --git a/app_utils.py b/app_utils.py new file mode 100644 index 0000000..456b89a --- /dev/null +++ b/app_utils.py @@ -0,0 +1,367 @@ +import sys +import traceback +import requests +import json +from typing import List, Tuple + +from modifinder import Compound + +adduct_mapping = {'M+H': '[M+H]+', +'[M+H]': '[M+H]+', +'[M+H]+': '[M+H]+', +'[M+H]1+': '[M+H]+', +'M+H]': '[M+H]+', +'M+Na': '[M+Na]+', +'[M+Na]': '[M+Na]+', +'[M+Na]+': '[M+Na]+', +'[M+Na]1+': '[M+Na]+', +'2M+Na': '[2M+Na]+', +'M2+Na': '[2M+Na]+', +'[2M+Na]+': '[2M+Na]+', +'[2M+Na]': '[2M+Na]+', +'M+K': '[M+K]+', +'[M+K]': '[M+K]+', +'[M+K]+': '[M+K]+', +'[M+K]1+': '[M+K]+', +'[2M+K]+': '[2M+K]+', +'2M+K': '[2M+K]+', +'[2M+K]': '[2M+K]+', +'M+H-H20': '[M-H2O+H]+', +'M+H-H2O': '[M-H2O+H]+', +'[M-H2O+H]+': '[M-H2O+H]+', +'M-H20+H': '[M-H2O+H]+', +'[M+H-H2O]+': '[M-H2O+H]+', +'M-H2O+H': '[M-H2O+H]+', +'M+H-2H2O': '[M-2H2O+H]+', +'M-2H2O+H': '[M-2H2O+H]+', +'[M-2H2O+H]+': '[M-2H2O+H]+', +'M-2(H2O)+H': '[M-2H2O+H]+', +'2M+Na-2H': '[2M-2H+Na]-', +'2M-2H+Na': '[2M-2H+Na]-', +'M-H': '[M-H]-', +'[M-H]': '[M-H]-', +'[M-H]-': '[M-H]-', +'[M-H]1-': '[M-H]-', +'M-H-': '[M-H]-', +'M-H1': '[M-H]-', +'3M+Na': '[3M+Na]+', +'[3M+Na]+': '[3M+Na]+', +'[M]+': '[M]+', +'M+': '[M]+', +'M-e': '[M]+', +'M2+H': '[2M+H]+', +'2M+H': '[2M+H]+', +'[2M+H]+': '[2M+H]+', +'[2M+H]': '[2M+H]+', +'[M+2H]': '[M+2H]2+', +'[M+2H]2+': '[M+2H]2+', +'M+2H]': '[M+2H]2+', +'M+2H+2': '[M+2H]2+', +'M+2H': '[M+2H]2+', +'M+acetate': '[M+CH3COOH-H]-', +'M+CH3COOH-H': '[M+CH3COOH-H]-', +'M+CH3COO': '[M+CH3COOH-H]-', +'M+ACN+H': '[M+CH3CN+H]+', +'[M+ACN+H]+': '[M+CH3CN+H]+', +'[M+H+CH3CN]': '[M+CH3CN+H]+', +'M+2Na': '[M+2Na]2+', +'M+2Na]': '[M+2Na]2+', +'M+HCOO': '[M+HCOOH-H]-', +'[M-H+HCOOH]': '[M+HCOOH-H]-', +'M+FA-H': '[M+HCOOH-H]-', +'M+formate': '[M+HCOOH-H]-', +'[M+H+HCOOH]': '[M+HCOOH-H]-', +'2M+FA-H': '[2M+HCOOH-H]-', +'[2M-H+HCOOH]': '[2M+HCOOH-H]-', +'M+NH4': '[M+NH3+H]+', +'[M+NH4]+': '[M+NH3+H]+', +'[M+NH4]1+': '[M+NH3+H]+', +'[M+NH4]': '[M+NH3+H]+', +'2M+Hac-H': '[2M+CH3COOH-H]-', +'2M-H': '[2M-H]-', +'[2M-H]': '[2M-H]-', +'2M+NH4': '[2M+NH3+H]+', +'[2M+NH4]+': '[2M+NH3+H]+', +'[2M+NH4]': '[2M+NH3+H]+', +'[2M+Ca]2+': '[2M+Ca]2+', +'[M+Ca]2+': '[M+Ca]2+', +'[3M+Ca]2+': '[3M+Ca]2+', +'[2M+Ca-H]+': '[2M-H+Ca]+', +'[2M-H2O+H]+': '[2M-H2O+H]+', +'[4M+Ca]2+': '[4M+Ca]2+', +'[3M+NH4]+': '[3M+NH3+H]+', +'3M+NH4': '[3M+NH3+H]+', +'[2M-2H2O+H]+': '[2M-2H2O+H]+', +'[M+ACN+NH4]+': '[M+CH3CN+NH3+H]+', +'[5M+Ca]2+': '[5M+Ca]2+', +'[3M+K]+': '[3M+K]+', +'[3M+Ca-H]+': '[3M-H+Ca]2+', +'[M-H+2Na]+': '[M-H+2Na]+', +'M-H+2Na': '[M-H+2Na]+', +'[M-3H2O+H]+': '[M-3H2O+H]+', +'M-3H2O+H': '[M-3H2O+H]+', +'[M-3H2O+2H]2+': '[M-3H2O+2H]2+', +'[M-2H2O+2H]2+': '[M-2H2O+2H]2+', +'[M-4H2O+H]+': '[M-4H2O+H]+', +'[M-5H2O+H]+': '[M-5H2O+H]+', +'[M+Ca-H]+': '[M+Ca-H]+', +'[2M-H+2Na]+': '[2M-H+2Na]+', +'[2M-3H2O+H]+': '[2M-3H2O+H]+', +'[M+H+Na]2+': '[M+Na+H]2+', +'[M-2H2O+NH4]+': '[M-2H2O+NH3+H]+', +'[2M-2H+Na]': '[2M-2H+Na]-', +'[M-H+CH3OH]': '[M+CH3OH-H]-', +'M+MeOH-H': '[M+CH3OH-H]-', +'M-H2O-H': '[M-H2O-H]-', +'[M-H-H2O]': '[M-H2O-H]-', +'M+Cl-': '[M+Cl]-', +'M+Cl': '[M+Cl]-', +'[M+Cl]': '[M+Cl]-', +'[M+Cl]1-': '[M+Cl]-', +'M+K-2H': '[M-2H+K]-', +'[M-2H+K]': '[M-2H+K]-', +'M-2H]': '[M-2H]2-', +'M-2H': '[M-2H]2-', +'M-2H-': '[M-2H]2-', +'M+Na-2H': '[M-2H+Na]-', +'[M-2H+Na]': '[M-2H+Na]-', +'M+Br': '[M+Br]-', +'[M+Br]1-': '[M+Br]-', +'3M-H': '[3M-H]-', +'[3M-H]': '[3M-H]-', +'[M+H+CH3OH]': '[M+CH3OH+H]+', +'M+CH3OH+H': '[M+CH3OH+H]+', +'[2M+H+CH3CN]': '[2M+CH3CN+H]+', +'M-CO2-H': '[M-CO2-H]-', +'[2M-2H+K]': '[2M-2H+K]-', +'2M+K-2H': '[2M-2H+K]-', +'[M+Na+CH3CN]': '[M+CH3CN+Na]+', +'M-H2+H': '[M-H2+H]-', +'M-H+Cl]': '[M-H+Cl]2-', +'M-H+Cl': '[M-H+Cl]2-', +'3M+H': '[3M+H]+', +'[3M+H]': '[3M+H]+', +'M+H-NH3': '[M-NH3+H]+', +'M-NH3+H': '[M-NH3+H]+', +'M-H+C2H2O': '[M+C2H2O-H]-', +'M+H-C2H2O': '[M+C2H2O-H]-', +'M-H+CH2O2': '[M+CH2O2-H]-', +'M+CH2O2-H': '[M+CH2O2-H]-', +'M+TFA-H': '[M+C2HF3O2-H]-', +'M-C2HF3O2-H': '[M+C2HF3O2-H]-', +'[M]1+': '[M]1+'} + + +gnps_keys_mapping = { + ## precursor + "precursor_mz": "precursor_mz", + ## charge + "precursor_charge": "precursor_charge", + "charge": "precursor_charge", + ## smiles + "smiles": "smiles", + "smile": "smiles", + ## adduct + "adduct": "adduct", + ## peaks + "peaks": "peaks", + ## instrument + "instrument": "instrument", + ## name + "name": "name", + "compound_name": "name", + ## spectrum_id + "spectrum_id": "spectrum_id", + "spectrumid": "spectrum_id", + ## exact mass + "exact_mass": "exact_mass", + "exactmass": "exact_mass", + ## mz + "fragment_mz": "mz", + "mz": "mz", + "mzs": "mz", + ## intensity + "fragment_intensities": "intensity", + "intensities": "intensity", +} + +def filter_peaks_by_ratio_to_base_peak(peaks:List[Tuple[float,float]], ratio_to_base_peak:float = 0.01): + """Remove peaks with intensity lower than a given ratio to the base peak. + + Parameters + ---------- + peaks : List[Tuple[float, float]] + List of (mz, intensity) tuples representing the spectrum peaks. + ratio_to_base_peak : float (0, 1), default is 0.01 + The ratio to the base peak. + """ + + base_peak_intensity = max([intensity for (mz, intensity) in peaks]) + new_mz = [] + new_intensity = [] + for index, intensity in enumerate([peak[1] for peak in peaks]): + if intensity >= float(ratio_to_base_peak) * base_peak_intensity: + new_mz.append(peaks[index][0]) + new_intensity.append(intensity) + + return list(zip(new_mz, new_intensity)) + + + +def get_data(identifier: str) -> dict: + """ + Get data from GNPS, either from USI or Accession. if the identifier points to a known item in gnps, + it will return the full data, otherwise it will return partial data (ms2 data) + param identifier: str - USI or Accession + return: dict - dictionary of data + """ + + data = dict() + data['usi'] = None + + if "mzspec" in identifier: # It's a USI + data['usi'] = identifier + + if "accession" in identifier: # It's a library spectrum + original_identifier = str(identifier) + identifier = identifier.split(":")[-1] + else: # It's a USI that isn't a library spectrum + data = _get_from_metabolomics_resolver(identifier) + data['id'] = identifier + data = _harmonize_spectrum_keys(data) + + # Sort peaks if needed + if 'peaks' in data and isinstance(data['peaks'], list) and len(data['peaks']) > 0: + data['peaks'] = sorted(data['peaks'], key=lambda x: x[0]) + return data + + link = "https://external.gnps2.org/gnpsspectrum?SpectrumID={}".format(identifier) + try: + res = requests.get(link) + parsed = res.json() + except Exception: + data = _get_from_metabolomics_resolver(original_identifier) + data['usi'] = original_identifier + data['id'] = identifier + data = _harmonize_spectrum_keys(data) + return data + + try: + data.update(parsed['annotations'][0]) + except KeyError: + pass + try: + data.update(parsed['spectruminfo']) + except KeyError: + pass + try: + data['comments'] = parsed['comments'] + except KeyError: + pass + + data = _harmonize_spectrum_keys(data) + data['id'] = identifier + + # Ensure peaks are sorted + if 'peaks' in data and isinstance(data['peaks'], list) and len(data['peaks']) > 0: + data['peaks'] = sorted(data['peaks'], key=lambda x: x[0]) + return data + +def load_helpers( + data: List[str], + ratio_to_base_peak: float = None, + ) -> List[Compound]: + """ Load helpers from a list of identifiers, failing gracefully if the smile string is invalid. + """ + loaded_helpers = [] + failed_helpers = [] + for h in data: + try: + lh = get_data(h) + + if ratio_to_base_peak: + lh['peaks'] = filter_peaks_by_ratio_to_base_peak(lh['peaks'], ratio_to_base_peak=ratio_to_base_peak) + + ch = Compound( + spectrum=lh['peaks'], + precursor_mz=lh['precursor_mz'], + precursor_charge=lh['precursor_charge'], + adduct=lh.get('adduct', None), + smiles=lh.get('smiles', None) + ) + + loaded_helpers.append(ch) + except Exception as e: + # Print the traceback + print(f"Error loading helper compound {h}: {str(e)}", flush=True) + traceback.print_exc(file=sys.stderr) + failed_helpers.append(h) + raise e + + print(f"Loaded {len(loaded_helpers)} helper compounds successfully. Failed to load {len(failed_helpers)} helper compounds: {failed_helpers}", flush=True) + return loaded_helpers + +def _harmonize_spectrum_keys(data): + """ + Parse the data to a universal format. + + This function takes a dictionary of data and converts it into a universal format. + It processes specific keys like "peaks_json" and "Charge" differently, and attempts + to convert other values to floats. If the conversion to float is successful and the + key is "Charge", it further converts the value to an integer. + + Args: + :data (dict): The input data dictionary to be parsed. + + Returns: + :dict: A dictionary with keys converted to a universal format and values processed + accordingly. + """ + def _convert_to_universal_key(key: str) -> str: + """ + Convert different types of keys to universal keys. + This function standardizes various key names to a universal format. + + Args: + :key (str): The key to be converted. + + Returns: + :str: The converted key. + """ + key = key.lower() + key = key.replace(" ", "_") + return gnps_keys_mapping.get(key, key) + + res = {} + for key, value in data.items(): + converted_key = _convert_to_universal_key(key) + if key == "peaks_json": + res['peaks'] = json.loads(value) + elif converted_key == "adduct": + res[converted_key] = adduct_mapping.get(value, value) + else: + try: + if converted_key in ["precursor_charge", "precursor_mz", "ms_level", "scan", "exact_mass"]: + value = float(value) + if converted_key in ["precursor_charge", "charge", "ms_level"]: + value = int(value) + except Exception: + raise ValueError(f"Could not convert {key} to number") + res[converted_key] = value + return res + +def _get_from_metabolomics_resolver(identifier: str) -> dict: + """ + Get partial data (ms2 data) from USI + param identifier: str - USI + return: dict - dictionary of data with keys: precursor_mz, precursor_charge, mz: list, intensity: list + """ + url = 'https://metabolomics-usi.gnps2.org/json/' + "?usi1=" + identifier + try: + r = requests.get(url) + data = json.loads(r.text) + except: + raise Exception("Error in retrieving data from GNPS for identifier: {}, link: {}".format(identifier, url)) + + data = _harmonize_spectrum_keys(data) + return data \ No newline at end of file diff --git a/pages/visualizer.py b/pages/visualizer.py index 51c4647..625605d 100644 --- a/pages/visualizer.py +++ b/pages/visualizer.py @@ -28,6 +28,8 @@ from flask import Flask, send_file, request, jsonify import json from app import app +from app_utils import get_data +import traceback from furl import furl from myopic_mces import MCES @@ -413,17 +415,25 @@ def update_spectra_output(Spec1, Spec2, boolean_inputs): input = Spec2 else: input = Spec1 - input = Spectrum(input, ignore_adduct_format=True) + + data = get_data(input) + input = Spectrum(**data, ignore_adduct_format=True) + png = mf_vis.draw_spectrum(input, **kwargs) else: - Spec1 = Spectrum(Spec1, ignore_adduct_format=True) - Spec2 = Spectrum(Spec2, ignore_adduct_format=True) + + data1 = get_data(Spec1) + data2 = get_data(Spec2) + + Spec1 = Spectrum(**data1, ignore_adduct_format=True) + Spec2 = Spectrum(**data2, ignore_adduct_format=True) cosine, matches = _cosine_fast(Spec1, Spec2, 0.1, 40, True) png = mf_vis.draw_alignment([Spec1, Spec2], [matches], **kwargs) img = png_to_showable_src(png) return html.Img(src=img, style={'margin': 'auto', 'height': '50vh'}) except Exception as e: + print(traceback.format_exc(), file=sys.stderr) return str(e) From 1659c14b3622078370bee5b544c50593c0866a3e Mon Sep 17 00:00:00 2001 From: Michael Strobel Date: Wed, 25 Feb 2026 11:28:21 -0800 Subject: [PATCH 7/9] Bump base version on remote. --- ModiFinder_base | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ModiFinder_base b/ModiFinder_base index 72f2455..fff4fde 160000 --- a/ModiFinder_base +++ b/ModiFinder_base @@ -1 +1 @@ -Subproject commit 72f2455760453716abb85c1618f1c1f5752d3235 +Subproject commit fff4fde7e6a69ffdf0d7955fc3208f964379af62 From 94bd53d7d9cecd8bcf1f9af4f59300198be06161 Mon Sep 17 00:00:00 2001 From: Michael Strobel Date: Wed, 25 Feb 2026 11:43:30 -0800 Subject: [PATCH 8/9] Sync base version. --- ModiFinder_base | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ModiFinder_base b/ModiFinder_base index fff4fde..a511460 160000 --- a/ModiFinder_base +++ b/ModiFinder_base @@ -1 +1 @@ -Subproject commit fff4fde7e6a69ffdf0d7955fc3208f964379af62 +Subproject commit a511460410faf57f9fb5303f5843d4cc2094c3bf From f8ef7e5f29f86f47c7b080ff833d3cfe5942faba Mon Sep 17 00:00:00 2001 From: Michael Strobel Date: Wed, 25 Feb 2026 11:51:29 -0800 Subject: [PATCH 9/9] Update prod data path. --- docker-compose-prod.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose-prod.yml b/docker-compose-prod.yml index d90a97f..ae5b0c9 100644 --- a/docker-compose-prod.yml +++ b/docker-compose-prod.yml @@ -2,5 +2,5 @@ version: '3' services: mod-site: volumes: - - /home/user/LabData/Reza/data:/app/data:rw + - /nas-services/data_resources/modifinder:/app/data:ro \ No newline at end of file