|
| 1 | +/* |
| 2 | + * Copyright (C) 2007-2026 Syed Asad Rahman <asad.rahman@bioinceptionlabs.com>. |
| 3 | + * |
| 4 | + * This library is free software; you can redistribute it and/or |
| 5 | + * modify it under the terms of the GNU Lesser General Public |
| 6 | + * License as published by the Free Software Foundation; either |
| 7 | + * version 2.1 of the License, or (at your option) any later version. |
| 8 | + * |
| 9 | + * This library is distributed in the hope that it will be useful, |
| 10 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 12 | + * Lesser General Public License for more details. |
| 13 | + * |
| 14 | + * You should have received a copy of the GNU Lesser General Public |
| 15 | + * License along with this library; if not, write to the Free Software |
| 16 | + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, |
| 17 | + * MA 02110-1301 USA |
| 18 | + */ |
| 19 | +package com.bioinceptionlabs.reactionblast.api; |
| 20 | + |
| 21 | +import java.security.MessageDigest; |
| 22 | +import java.security.NoSuchAlgorithmException; |
| 23 | +import java.util.ArrayList; |
| 24 | +import java.util.Collections; |
| 25 | +import java.util.HashMap; |
| 26 | +import java.util.List; |
| 27 | +import java.util.Map; |
| 28 | +import java.util.TreeMap; |
| 29 | + |
| 30 | +/** |
| 31 | + * Canonical reaction signature generator using Weisfeiler-Lehman (WL) graph |
| 32 | + * hashing on the Imaginary Transition State (ITS) graph. |
| 33 | + * |
| 34 | + * The ITS graph merges reactant and product molecular graphs, with edge labels |
| 35 | + * encoding bond changes (formed, cleaved, order changed). The WL hash produces |
| 36 | + * a canonical, invariant fingerprint that is: |
| 37 | + * - Deterministic (same reaction always gives same hash) |
| 38 | + * - Permutation-invariant (independent of atom ordering) |
| 39 | + * - Hierarchical (deeper iterations capture wider neighborhood) |
| 40 | + * |
| 41 | + * Based on the Weisfeiler-Lehman graph isomorphism test (1968) and its |
| 42 | + * application to molecular graphs. Implementation is IP-free (public domain |
| 43 | + * algorithm, no dependency on external tools like Nauty). |
| 44 | + * |
| 45 | + * References: |
| 46 | + * - Weisfeiler, Lehman (1968): "A reduction of a graph to a canonical form" |
| 47 | + * - Shervashidze et al. (2011): "Weisfeiler-Lehman Graph Kernels" (JMLR) |
| 48 | + * - Leber (2008): R-matrix canonicalization for enzymatic reactions |
| 49 | + * - Phan et al. (2025): SynKit graph-based reaction canonicalization |
| 50 | + * |
| 51 | + * @author Syed Asad Rahman <asad.rahman@bioinceptionlabs.com> |
| 52 | + */ |
| 53 | +public final class ReactionCanonicalizer { |
| 54 | + |
| 55 | + private ReactionCanonicalizer() {} |
| 56 | + |
| 57 | + private static final int WL_ITERATIONS = 3; |
| 58 | + |
| 59 | + /** |
| 60 | + * Compute a canonical hash for a reaction based on its bond changes. |
| 61 | + * The hash is invariant to atom ordering and deterministic. |
| 62 | + * |
| 63 | + * @param formedCleavedBonds bond formation/cleavage patterns (e.g., "C-O:1") |
| 64 | + * @param orderChangedBonds bond order change patterns (e.g., "C=C:1") |
| 65 | + * @param stereoChangedBonds stereo change patterns |
| 66 | + * @param reactionCentreFP reaction centre fingerprint patterns |
| 67 | + * @return canonical hex hash string (SHA-256 based) |
| 68 | + */ |
| 69 | + public static String computeCanonicalHash( |
| 70 | + List<String> formedCleavedBonds, |
| 71 | + List<String> orderChangedBonds, |
| 72 | + List<String> stereoChangedBonds, |
| 73 | + List<String> reactionCentreFP) { |
| 74 | + |
| 75 | + // Build ITS graph as adjacency representation |
| 76 | + // Nodes = unique atom types at reaction centre |
| 77 | + // Edges = bond changes with labels |
| 78 | + ITSGraph its = buildITSGraph(formedCleavedBonds, orderChangedBonds, |
| 79 | + stereoChangedBonds, reactionCentreFP); |
| 80 | + |
| 81 | + // Apply WL hash iterations |
| 82 | + String wlHash = wlGraphHash(its, WL_ITERATIONS); |
| 83 | + |
| 84 | + return wlHash; |
| 85 | + } |
| 86 | + |
| 87 | + /** |
| 88 | + * Build an Imaginary Transition State graph from bond change fingerprints. |
| 89 | + * The ITS graph encodes the reaction centre as a labeled graph where: |
| 90 | + * - Nodes are atom types involved in changes |
| 91 | + * - Edges are bond changes with labels (FORMED, CLEAVED, ORDER_CHANGE) |
| 92 | + */ |
| 93 | + static ITSGraph buildITSGraph( |
| 94 | + List<String> formedCleaved, |
| 95 | + List<String> orderChanges, |
| 96 | + List<String> stereoChanges, |
| 97 | + List<String> reactionCentre) { |
| 98 | + |
| 99 | + ITSGraph graph = new ITSGraph(); |
| 100 | + |
| 101 | + // Parse bond change patterns: "X-Y:weight" or "X=Y:weight" |
| 102 | + for (String pattern : formedCleaved) { |
| 103 | + addBondChange(graph, pattern, "FC"); |
| 104 | + } |
| 105 | + for (String pattern : orderChanges) { |
| 106 | + addBondChange(graph, pattern, "OC"); |
| 107 | + } |
| 108 | + for (String pattern : stereoChanges) { |
| 109 | + addBondChange(graph, pattern, "SC"); |
| 110 | + } |
| 111 | + for (String pattern : reactionCentre) { |
| 112 | + addBondChange(graph, pattern, "RC"); |
| 113 | + } |
| 114 | + |
| 115 | + return graph; |
| 116 | + } |
| 117 | + |
| 118 | + /** |
| 119 | + * Parse a bond change pattern like "C-O:1" or "C=C:2" and add to graph. |
| 120 | + */ |
| 121 | + private static void addBondChange(ITSGraph graph, String pattern, String changeType) { |
| 122 | + // Strip weight suffix |
| 123 | + int colon = pattern.lastIndexOf(':'); |
| 124 | + String bondPattern = colon > 0 ? pattern.substring(0, colon) : pattern; |
| 125 | + String weight = colon > 0 ? pattern.substring(colon + 1) : "1"; |
| 126 | + |
| 127 | + // Parse atom pair from bond pattern: "X-Y", "X=Y", "X#Y", "X%Y", "X@Y" |
| 128 | + String[] atoms = bondPattern.split("[-=#%@]"); |
| 129 | + if (atoms.length == 2) { |
| 130 | + // Extract bond type symbol |
| 131 | + char bondType = '-'; |
| 132 | + for (char c : bondPattern.toCharArray()) { |
| 133 | + if (c == '-' || c == '=' || c == '#' || c == '%' || c == '@') { |
| 134 | + bondType = c; |
| 135 | + break; |
| 136 | + } |
| 137 | + } |
| 138 | + |
| 139 | + String nodeA = atoms[0].trim(); |
| 140 | + String nodeB = atoms[1].trim(); |
| 141 | + String edgeLabel = changeType + ":" + bondType + ":" + weight; |
| 142 | + |
| 143 | + graph.addNode(nodeA); |
| 144 | + graph.addNode(nodeB); |
| 145 | + graph.addEdge(nodeA, nodeB, edgeLabel); |
| 146 | + } |
| 147 | + } |
| 148 | + |
| 149 | + /** |
| 150 | + * Weisfeiler-Lehman graph hash. |
| 151 | + * Iteratively refines node labels by aggregating sorted neighbor labels. |
| 152 | + * The final hash is a sorted concatenation of all refined labels. |
| 153 | + * |
| 154 | + * @param graph the ITS graph |
| 155 | + * @param iterations number of WL refinement iterations |
| 156 | + * @return canonical hash string |
| 157 | + */ |
| 158 | + static String wlGraphHash(ITSGraph graph, int iterations) { |
| 159 | + if (graph.nodes.isEmpty()) return "EMPTY"; |
| 160 | + |
| 161 | + // Initial labels = node type (atom symbol) |
| 162 | + Map<String, String> labels = new HashMap<>(); |
| 163 | + for (String node : graph.nodes.keySet()) { |
| 164 | + labels.put(node, graph.nodes.get(node)); |
| 165 | + } |
| 166 | + |
| 167 | + // Collect multiset labels at each iteration |
| 168 | + List<String> allLabels = new ArrayList<>(); |
| 169 | + |
| 170 | + for (int iter = 0; iter < iterations; iter++) { |
| 171 | + Map<String, String> newLabels = new HashMap<>(); |
| 172 | + |
| 173 | + for (String node : graph.nodes.keySet()) { |
| 174 | + // Get sorted neighbor labels with edge labels |
| 175 | + List<String> neighborInfo = new ArrayList<>(); |
| 176 | + for (ITSGraph.Edge edge : graph.getEdges(node)) { |
| 177 | + String neighborLabel = labels.get(edge.target); |
| 178 | + neighborInfo.add(edge.label + "|" + neighborLabel); |
| 179 | + } |
| 180 | + Collections.sort(neighborInfo); |
| 181 | + |
| 182 | + // New label = old label + sorted neighbor info |
| 183 | + String newLabel = labels.get(node) + "(" + String.join(",", neighborInfo) + ")"; |
| 184 | + newLabels.put(node, newLabel); |
| 185 | + } |
| 186 | + |
| 187 | + labels = newLabels; |
| 188 | + |
| 189 | + // Collect all labels at this iteration |
| 190 | + List<String> iterLabels = new ArrayList<>(labels.values()); |
| 191 | + Collections.sort(iterLabels); |
| 192 | + allLabels.addAll(iterLabels); |
| 193 | + } |
| 194 | + |
| 195 | + // Final canonical string = sorted concatenation of all iteration labels |
| 196 | + Collections.sort(allLabels); |
| 197 | + String canonical = String.join(";", allLabels); |
| 198 | + |
| 199 | + // Hash to fixed-length string |
| 200 | + return sha256Hex(canonical); |
| 201 | + } |
| 202 | + |
| 203 | + /** |
| 204 | + * SHA-256 hash of a string, returned as hex. |
| 205 | + */ |
| 206 | + private static String sha256Hex(String input) { |
| 207 | + try { |
| 208 | + MessageDigest md = MessageDigest.getInstance("SHA-256"); |
| 209 | + byte[] hash = md.digest(input.getBytes()); |
| 210 | + StringBuilder hex = new StringBuilder(); |
| 211 | + for (byte b : hash) { |
| 212 | + hex.append(String.format("%02x", b)); |
| 213 | + } |
| 214 | + return hex.toString(); |
| 215 | + } catch (NoSuchAlgorithmException e) { |
| 216 | + // SHA-256 is always available in Java |
| 217 | + throw new RuntimeException(e); |
| 218 | + } |
| 219 | + } |
| 220 | + |
| 221 | + /** |
| 222 | + * Internal ITS graph representation. |
| 223 | + * Nodes are atom types, edges are labeled bond changes. |
| 224 | + */ |
| 225 | + static class ITSGraph { |
| 226 | + final Map<String, String> nodes = new TreeMap<>(); // id → label |
| 227 | + final List<Edge> edges = new ArrayList<>(); |
| 228 | + |
| 229 | + void addNode(String id) { |
| 230 | + nodes.putIfAbsent(id, id); |
| 231 | + } |
| 232 | + |
| 233 | + void addEdge(String source, String target, String label) { |
| 234 | + edges.add(new Edge(source, target, label)); |
| 235 | + edges.add(new Edge(target, source, label)); // undirected |
| 236 | + } |
| 237 | + |
| 238 | + List<Edge> getEdges(String node) { |
| 239 | + List<Edge> result = new ArrayList<>(); |
| 240 | + for (Edge e : edges) { |
| 241 | + if (e.source.equals(node)) result.add(e); |
| 242 | + } |
| 243 | + return result; |
| 244 | + } |
| 245 | + |
| 246 | + static class Edge { |
| 247 | + final String source, target, label; |
| 248 | + Edge(String source, String target, String label) { |
| 249 | + this.source = source; |
| 250 | + this.target = target; |
| 251 | + this.label = label; |
| 252 | + } |
| 253 | + } |
| 254 | + } |
| 255 | +} |
0 commit comments