From a03df01adb3e86bed11e07c5ea08cd443519f920 Mon Sep 17 00:00:00 2001 From: Devendra Parkar Date: Fri, 11 Jul 2025 20:44:35 +0530 Subject: [PATCH 1/8] Semi-optimized canonization function --- .cargo/config.toml | 11 + .gitignore | 3 + Cargo.lock | 16 ++ Cargo.toml | 1 + src/canonize.rs | 566 +++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 3 + src/molecule.rs | 23 +- 7 files changed, 621 insertions(+), 2 deletions(-) create mode 100644 .cargo/config.toml create mode 100644 src/canonize.rs diff --git a/.cargo/config.toml b/.cargo/config.toml new file mode 100644 index 00000000..d47f983e --- /dev/null +++ b/.cargo/config.toml @@ -0,0 +1,11 @@ +[target.x86_64-apple-darwin] +rustflags = [ + "-C", "link-arg=-undefined", + "-C", "link-arg=dynamic_lookup", +] + +[target.aarch64-apple-darwin] +rustflags = [ + "-C", "link-arg=-undefined", + "-C", "link-arg=dynamic_lookup", +] diff --git a/.gitignore b/.gitignore index 5767e533..a96bd7b5 100644 --- a/.gitignore +++ b/.gitignore @@ -142,3 +142,6 @@ dmypy.json # Pyre type checker .pyre/ + +# MacOS DS files +.DS_Store \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 6347195a..de1204fa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -61,6 +61,12 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "any_ascii" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70033777eb8b5124a81a1889416543dddef2de240019b674c81285a2635a7e1e" + [[package]] name = "anyhow" version = "1.0.98" @@ -77,6 +83,7 @@ dependencies = [ "criterion", "csv", "graph-canon", + "lexical-sort", "petgraph", "pyo3", "rayon", @@ -551,6 +558,15 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" +[[package]] +name = "lexical-sort" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c09e4591611e231daf4d4c685a66cb0410cc1e502027a20ae55f2bb9e997207a" +dependencies = [ + "any_ascii", +] + [[package]] name = "libc" version = "0.2.174" diff --git a/Cargo.toml b/Cargo.toml index 31feb2f5..8137174d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,6 +14,7 @@ graph-canon = { git = "https://github.com/AgentElement/graph-canon", version = " petgraph = "0.6.5" pyo3 = { version = "0.24.1", features = ["abi3-py38", "extension-module"]} rayon = "1.10.0" +lexical-sort = "0.3.1" [dev-dependencies] criterion = "0.3" diff --git a/src/canonize.rs b/src/canonize.rs new file mode 100644 index 00000000..219a7b88 --- /dev/null +++ b/src/canonize.rs @@ -0,0 +1,566 @@ +use std::{collections::{HashMap, HashSet, VecDeque}}; +use lexical_sort::{lexical_cmp, lexical_only_alnum_cmp, natural_cmp, StringSort}; +use crate::molecule::{AtomOrBond, CGraph, Molecule}; +use petgraph::{graph::{NodeIndex}, Direction::{Incoming, Outgoing}, Graph}; + +#[derive(Debug, Clone)] +struct DAGVert { + atom_idx: NodeIndex, + inv: u32, + order: String, + parents: Vec, + level: u32 +} + +impl DAGVert { + pub fn new(atom_idx: NodeIndex, parents: Vec, level: u32) -> Self { + DAGVert { + atom_idx, + inv: 0, + parents, + level, + order: String::new() + } + } +} + +#[derive(Debug, Clone)] +struct MolAtomNode { + color: u32, + inv: u32, + order: String, + num_parents: u32 +} + +impl MolAtomNode { + pub fn new(color: u32, inv: u32, order: String, num_parents: u32) -> Self { + MolAtomNode {color, inv, order, num_parents} + } +} + +// Compute the assembly index of a molecule +pub fn canonize(molecule: &Molecule) -> String { + let mgraph = molecule.graph(); + let mut mol_graph = CGraph::new_undirected(); + let mut vtx_map = vec![NodeIndex::default(); mgraph.node_count()]; + + for bond_idx in mgraph.edge_indices() { + let bond = mgraph.edge_weight(bond_idx).unwrap(); + let (start_atom_idx, end_atom_idx) = mgraph.edge_endpoints(bond_idx).unwrap(); + let start_atom = 
mgraph.node_weight(start_atom_idx).unwrap(); + let end_atom = mgraph.node_weight(end_atom_idx).unwrap(); + + let new_bond_node_idx = mol_graph.add_node(AtomOrBond::Bond(*bond)); + + if *vtx_map.get(start_atom_idx.index() as usize).unwrap() == NodeIndex::default() { + vtx_map[start_atom_idx.index() as usize] = mol_graph.add_node(AtomOrBond::Atom(*start_atom)) + } + + mol_graph.add_edge(*vtx_map.get(start_atom_idx.index() as usize).unwrap(), new_bond_node_idx, ()); + + if *vtx_map.get(end_atom_idx.index() as usize).unwrap() == NodeIndex::default() { + vtx_map[end_atom_idx.index() as usize] = mol_graph.add_node(AtomOrBond::Atom(*end_atom)) + } + + mol_graph.add_edge(*vtx_map.get(end_atom_idx.index() as usize).unwrap(), new_bond_node_idx, ()); + + } + + let mut max_string = String::new(); + for root in mol_graph.node_indices() { + // for each node in the molecule graph create a signature + /* + 1. create a DAG from each start node + */ + let mut DAG = Graph::::new(); + let mut DAG_vertex_map: HashMap<(NodeIndex, u32), NodeIndex> = HashMap::new(); + let mut mol_g_dag_vertex_map: Vec> = vec![vec![]; mol_graph.node_count()]; + let mut dag_level_list: Vec> = vec![vec![]; mol_graph.node_count()]; + let mut max_level: u32 = 0; + + { + let mut seen_edges_cache: HashMap<(NodeIndex,NodeIndex), u32> = HashMap::new(); + let mut visited: VecDeque<(NodeIndex,u32)> = VecDeque::new(); + + visited.push_back((root, 0)); + seen_edges_cache.insert((root, root), 0); + + let root_vertex_id = DAG.add_node(DAGVert::new(root, [].to_vec(), 0)); + + DAG_vertex_map.insert((root, 0), root_vertex_id); + dag_level_list[0].push(root_vertex_id); + mol_g_dag_vertex_map[root.index() as usize].push(root_vertex_id); + + loop { + + let (curr, level) = visited.pop_front().unwrap(); + + for neigh in mol_graph.neighbors(curr) { + let mut add_node_to_dag = false; + + //check if curr -> neigh or neigh -> curr already exists + match seen_edges_cache.get(&(curr, neigh)) { + Some(seen_at_level) => { + // edge 
already exists at a level above + if *seen_at_level < (level+1) { + continue; + } + else { + //edge at the same level + add_node_to_dag = true; + } + }, + None => { + //No edge found + add_node_to_dag = true; + } + } + + if add_node_to_dag { + + //check if a atom has already been processed during this current level's processing + match DAG_vertex_map.get(&(neigh, (level+1))) { + Some(present_node_idx) => { + seen_edges_cache.insert((curr, neigh), level+1); + seen_edges_cache.insert((neigh, curr), level+1); + //get parent node's NodeIndex + match DAG_vertex_map.get(&(curr, level)) { + Some(parent_node_idx) => { + DAG.add_edge(*parent_node_idx, *present_node_idx, ""); + // add as parent in the DAGvert + (&mut DAG[*present_node_idx]).parents.push(*parent_node_idx); + } + None => {} + } + //skip rest of the processing for the atom + continue; + } + None => {} + } + + // haven't seen the atom before so add it to DAG + max_level = level + 1; + seen_edges_cache.insert((curr, neigh), level+1); + seen_edges_cache.insert((neigh, curr), level+1); + let child_node_idx = DAG.add_node(DAGVert::new(neigh, [].to_vec(), level+1)); + DAG_vertex_map.insert((neigh, level+1), child_node_idx); + + // Overriding the map!!! neigh can be seen before in previous layer + mol_g_dag_vertex_map[neigh.index() as usize].push(child_node_idx); + + // Insert into a level by level hashmap of dag nodes + dag_level_list[(level+1) as usize].push(child_node_idx); + + visited.push_back((neigh, level+1)); + //get parent node's NodeIndex + match DAG_vertex_map.get(&(curr, level)) { + Some(parent_node_idx) => { + DAG.add_edge(*parent_node_idx, child_node_idx, ""); + // add as parent in the DAGvert + (&mut DAG[child_node_idx]).parents.push(*parent_node_idx); + } + None => {} + } + } + } + + if visited.len() == 0 { + break; + } + } + } + + /* + 2.1. Initialize the molecule graph with color = 0 and invariant no. for each atom from (atom_type,#parents in DAG) + 2.2. 
Do lexicographical ordering of the (atom_type, #parents in DAG) + */ + let mut extended_molg_atom_map: Vec = Vec::with_capacity(mol_graph.node_count()); + let mut order_str_set: HashSet = HashSet::new(); + + // Each atom does not have just one vertex in dag!!! + for atom_node in mol_graph.node_indices() { + // find unique parents for an atom's associated vertices in DAG + let atom_assoc_vert_list = &mol_g_dag_vertex_map[atom_node.index() as usize]; + let mut parents= HashSet::new(); + for vert_id in atom_assoc_vert_list { + for parent in &DAG[*vert_id].parents { + parents.insert(parent); + } + } + let parent_len = parents.len(); + let atom_str = mol_graph[atom_node].to_string(); + let atom_order_str = format!("{}{}", atom_str, parent_len); + order_str_set.insert(atom_order_str.clone()); + extended_molg_atom_map.insert(atom_node.index() as usize, MolAtomNode::new(0, 0, atom_order_str, parent_len as u32)); + } + + // lexico-sort + let mut ordered_vec: Vec<_> = order_str_set.into_iter().collect(); + ordered_vec.string_sort_unstable(lexical_only_alnum_cmp); + + let mut order_idx: HashMap = HashMap::new(); + + for (idx, order_str) in ordered_vec.iter().enumerate() { + order_idx.insert(order_str.clone(), (idx as u32)+1); + } + + // update the molecule graph invariant based on order idx of lexico-sort of (atom_type,#parents in DAG) + for atom_node in mol_graph.node_indices() { + extended_molg_atom_map[atom_node.index() as usize].inv = *order_idx.get(&extended_molg_atom_map[atom_node.index() as usize].order).unwrap(); + } + + // get the canonized string for current root atom + let canon_string = canonize_signature(&mol_graph, &mut DAG, &mut extended_molg_atom_map, &dag_level_list, max_level, 1, "".to_string()); + + // lexico-compare strings to save the max one. 
+ if lexical_cmp(&max_string, &canon_string).is_lt() { + max_string = canon_string + } + } + return max_string; +} + +fn canonize_signature( + mol_graph: &CGraph, + mut DAG: &mut Graph::, + mut extended_molg_atom_map: &mut Vec, + dag_level_list: &Vec>, + max_level: u32, + color_c: u32, + s_max: String, +) -> String { + // 1. get the invariants for each atom + invariant_atom(&mol_graph, &mut DAG, &mut extended_molg_atom_map, &dag_level_list, max_level); + + // 2. generate orbits based on atom's invariant values + let mut orbits: HashMap> = HashMap::new(); + + for atom in mol_graph.node_indices() { + // let extended_atom = extended_molg_atom_map.get(&atom).unwrap(); + let extended_atom = &extended_molg_atom_map[atom.index() as usize]; + let atom_inv = extended_atom.inv; + let parent_len = extended_atom.num_parents; + // only add atoms which have 2 or more parents in DAG + if parent_len >= 2 { + orbits.entry(atom_inv).and_modify(|atom_list| atom_list.push(atom)).or_insert([atom].to_vec()); + } + } + + // 3. max length of any orbit + let mut max_orbit_len = 0; + orbits.values().for_each(|orbit| if orbit.len() > max_orbit_len {max_orbit_len = orbit.len()}); + + if max_orbit_len >= 2 { + // find the orbits with max len of atoms + let max_orbits = orbits.keys().filter(|orbit| orbits.get(&orbit).unwrap().len() == max_orbit_len).collect::>(); + // if multiple then use orbit with min value + let min_orbit = (if (&max_orbits.len()).clone() > 1 {max_orbits.iter().min()} else {max_orbits.first()}).unwrap(); + + let mut local_smax = s_max.clone(); + // recurse further for each of the atom in such a orbit and generate a canonized signature by diff. 
the atoms in same orbit + for atom in orbits.get(&min_orbit).unwrap() { + extended_molg_atom_map[atom.index() as usize].color = color_c as u32; + local_smax = canonize_signature(&mol_graph, &mut DAG, &mut extended_molg_atom_map, &dag_level_list, max_level, color_c+1, local_smax); + extended_molg_atom_map[atom.index() as usize].color = 0; + } + return local_smax; + } + else { + // no need to recurse further and print the signature-string + for atom in mol_graph.node_indices() { + let extended_atom = &extended_molg_atom_map[atom.index() as usize]; + let atom_inv = extended_atom.inv; + let atom_color = extended_atom.color; + let parent_len = extended_atom.num_parents; + // first update any atom without a color to be same as its invariant value + if (atom_color == 0) && (parent_len >= 2) { + extended_molg_atom_map[atom.index() as usize].color = atom_inv; + } + } + // start from root node of the DAG + let root_node = DAG.node_indices().find(|vert| DAG.neighbors_directed(*vert, Incoming).count() == 0).unwrap(); + let local_smax = print_signature_string(root_node, &DAG, &mol_graph, &extended_molg_atom_map, &mut vec![]); + if local_smax.len() > s_max.len() { + return local_smax; + } + else { + return s_max; + } + } +} + +fn print_signature_string( + vertex: NodeIndex, + DAG: &Graph::, + mol_graph: &CGraph, + extended_molg_atom_map: &Vec, + edges: &mut Vec<(NodeIndex, NodeIndex)> +) -> String { + let mut print_sign = String::new(); + print_sign.push('['); + let atom_idx = DAG[vertex].atom_idx; + let atom = &mol_graph[DAG[vertex].atom_idx]; + print_sign.push_str(&atom.to_string()); + let atom_color = extended_molg_atom_map[atom_idx.index() as usize].color; + if atom_color != 0 { + print_sign.push(','); + print_sign.push_str(&atom_color.to_string()); + } + print_sign.push(']'); + + let mut child_vec = DAG.neighbors_directed(vertex, Outgoing).collect::>(); + if child_vec.len() == 0 { return print_sign; } + else { + // sort children in descending order of inv + 
child_vec.sort_by(|vert_a, vert_b| DAG[*vert_b].inv.cmp(&DAG[*vert_a].inv)); + + let mut sub_print_sign = String::new(); + + for child in child_vec { + if let Some(_edge) = edges.iter().find(|egde| (egde.0 == vertex) && (egde.1 == child)) {} + else { + // if the edge is not already seen then add it to seen and generate signature-string for the child + edges.push((vertex, child)); + sub_print_sign.push_str(&print_signature_string(child, &DAG, &mol_graph, &extended_molg_atom_map, edges)); + } + } + if sub_print_sign.len() > 0 { + print_sign.push('('); + print_sign.push_str(&sub_print_sign); + print_sign.push(')'); + } + return print_sign; + } +} + +/* + 3. Generate Invariant for Atoms + */ +fn invariant_atom( + mol_graph: &CGraph, + mut DAG: &mut Graph::, + extended_molg_atom_map: &mut Vec, + dag_level_list: &Vec>, + max_level: u32, +) { + let mut count = 0; + let mut initial = true; + loop { + // Unique invariant values + let start_inv_atoms = HashSet::::from_iter(mol_graph.node_indices() + .into_iter() + .map(|atom_idx| extended_molg_atom_map[atom_idx.index() as usize].inv)).len(); + + /* + 3.1 Generate Invariants for DAG vertex + */ + + // first bottom-up + invariant_dag_vert(&mut DAG, &extended_molg_atom_map, &dag_level_list, max_level, true, initial); + + initial = false; + + // then top-down + invariant_dag_vert(&mut DAG, &extended_molg_atom_map, &dag_level_list, max_level, false, initial); + + // Create a vector for each atom in molecule graph based on associated vertex in DAG + let mut order_map_vert_atom: Vec> = vec![vec![0;(max_level+1).try_into().unwrap()]; mol_graph.node_count()]; + + //for reverse sorting use: max_level - DAG[vert].level as per paper + for vert in DAG.node_indices() { + order_map_vert_atom[DAG[vert].atom_idx.index() as usize][(max_level - DAG[vert].level) as usize] = DAG[vert].inv; + } + + let mut order_to_atom: HashMap> = HashMap::new(); + + // turn vectors into strings for sorting + for atom in mol_graph.node_indices() { + let 
order_str = order_map_vert_atom[atom.index() as usize].clone().into_iter().map(|i| i.to_string()).collect::(); + order_to_atom.entry(order_str).and_modify(|atom_list| atom_list.push(atom)).or_insert([atom].to_vec()); + } + + // lexico-sort the vectors-strings + let mut atom_ordered_vec: Vec<_> = order_to_atom.keys().into_iter().collect(); + atom_ordered_vec.string_sort_unstable(lexical_only_alnum_cmp); + // atom_ordered_vec.string_sort_unstable(natural_cmp); + // descend sort + atom_ordered_vec.reverse(); + + // assign the invariant of atom as the order of vectors-strings + for (idx, order) in atom_ordered_vec.iter().enumerate() { + for atom in order_to_atom.get(*order).unwrap() { + // extended_molg_atom_map.entry(*atom).and_modify(|atom_node| atom_node.inv = (idx as u32)+1); + extended_molg_atom_map[atom.index() as usize].inv = (idx as u32)+1; + } + } + + + let end_inv_atoms = HashSet::::from_iter(mol_graph.node_indices() + .into_iter() + .map(|atom_idx| extended_molg_atom_map[atom_idx.index() as usize].inv)).len(); + + // compare the no. of invariants of all the atoms with the one's they started from + if start_inv_atoms == end_inv_atoms { + break; + } + + // Naive way of stopping + if count > mol_graph.node_count() { + println!("breaking out because reached upper limit!"); + break; + } + count +=1; + } +} + +/* + 3. Generate Invariant for Vertices + */ +fn invariant_dag_vert( + DAG: &mut Graph::, + extended_molg_atom_map: &Vec, + dag_level_list: &Vec>, + max_level: u32, + bottom: bool, + initial: bool) { + // top-down or bottom-up calculation of invariants for each vertex in DAG + let mut curr_lvl_range = if bottom {max_level} else {0}; + loop { + // for each vertex generate a invariant-string based on assoc. 
atom color and atom invariant + directed neighbors + let mut order_str_set: HashSet = HashSet::new(); + for vert in &dag_level_list[curr_lvl_range as usize] { + let atom_idx_for_vert = DAG[*vert].atom_idx; + let atom_node = &extended_molg_atom_map[atom_idx_for_vert.index() as usize]; + let (atom_color, atom_inv) = (atom_node.color, atom_node.inv); + let vert_inv = DAG[*vert].inv; + let mut vert_order; + let mut child_inv_set: Vec = Vec::new(); + vert_order = if initial { format!("{}{}", atom_color, atom_inv) } else { format!("{}{}", atom_color, vert_inv) }; + if bottom { + // vert_order = format!("{}{}", atom_color, atom_inv); + for vert_neigh in DAG.neighbors_directed(*vert, Outgoing) { + child_inv_set.push(DAG[vert_neigh].inv); + } + } + else { + // vert_order = format!("{}{}", atom_color, vert_inv); + for vert_neigh in DAG.neighbors_directed(*vert, Incoming) { + child_inv_set.push(DAG[vert_neigh].inv); + } + } + + while child_inv_set.len() < 10 { + child_inv_set.push(0); + } + + child_inv_set.sort(); + child_inv_set.reverse(); + child_inv_set.iter().for_each(|val| vert_order.push_str(&format!("{}", *val))); + + let vec_string = format!("{:0>20}", vert_order); + DAG[*vert].order = vec_string.clone(); + order_str_set.insert(vec_string); + } + + // lexico-sort the invariant-strings in descending order + let mut ordered_vec: Vec = order_str_set.into_iter().collect(); + ordered_vec.string_sort_unstable(natural_cmp); + ordered_vec.reverse(); + + let mut order_idx: HashMap = HashMap::new(); + + for (idx, order_str) in ordered_vec.iter().enumerate() { + order_idx.insert(order_str.clone(), idx as u32); + } + + // assign the invariant of vertex as the order of invariant-strings + for vert in &dag_level_list[curr_lvl_range as usize] { + DAG[*vert].inv = (*order_idx.get(&DAG[*vert].order).unwrap())+1; + } + + if bottom { + if curr_lvl_range == 0 {break}; + curr_lvl_range -= 1; + } + else { + if curr_lvl_range == max_level {break}; + curr_lvl_range += 1; + } + } +} + + +mod 
tests { + #![allow(unused_imports)] + use std::fs; + use std::path::PathBuf; + use super::*; + use crate::loader; + + #[test] + fn canonize_benzene() { + let path = PathBuf::from(format!("./data/checks/benzene.mol")); + let molfile = fs::read_to_string(path).expect("Cannot read the data file"); + let molecule = loader::parse_molfile_str(&molfile).expect("Cannot parse molfile."); + let canonical_repr = canonize(&molecule); + + println!("{}", canonical_repr); + + assert_eq!(canonical_repr, "[C]([2]([C]([1]([C]([2]([C,1])))))[1]([C]([2]([C]([1]([C,1]))))))") + } + + #[test] + fn canonize_anthracene() { + let path = PathBuf::from(format!("./data/checks/anthracene.mol")); + let molfile = fs::read_to_string(path).expect("Cannot read the data file"); + let molecule = loader::parse_molfile_str(&molfile).expect("Cannot parse molfile."); + let canonical_repr = canonize(&molecule); + + println!("{}", canonical_repr); + + assert_eq!(canonical_repr, "[C]([2]([C]([1]([C]([2]([C]([1]([C]([2]([C]([1]([C]([2]([C,1])))))[1]([C,8]([2]([C]([1]([C,1])))))))))[1]([C,17]([2]([C]([1]([C,8])))))))))[1]([C]([2]([C]([1]([C,17]))))))") + } + + // Dummy Molecule for testing + /* + fn canonize_dummy() { + + let mut mol_graph: Graph = Graph::::new_undirected(); + + let mut vec_nodes: Vec = Vec::new(); + vec_nodes.push(NodeIndex::new(99999)); + for i in 0..16 { + vec_nodes.push(mol_graph.add_node(Atom::new(Element::Carbon))); + } + mol_graph.add_edge(vec_nodes[1],vec_nodes[2],Bond::Single); + mol_graph.add_edge(vec_nodes[2],vec_nodes[3],Bond::Single); + mol_graph.add_edge(vec_nodes[3],vec_nodes[4],Bond::Single); + mol_graph.add_edge(vec_nodes[4],vec_nodes[1],Bond::Single); + + mol_graph.add_edge(vec_nodes[1],vec_nodes[5],Bond::Single); + mol_graph.add_edge(vec_nodes[2],vec_nodes[7],Bond::Single); + mol_graph.add_edge(vec_nodes[3],vec_nodes[9],Bond::Single); + mol_graph.add_edge(vec_nodes[4],vec_nodes[11],Bond::Single); + + mol_graph.add_edge(vec_nodes[13],vec_nodes[14],Bond::Single); + 
mol_graph.add_edge(vec_nodes[14],vec_nodes[15],Bond::Single); + mol_graph.add_edge(vec_nodes[15],vec_nodes[16],Bond::Single); + mol_graph.add_edge(vec_nodes[16],vec_nodes[13],Bond::Single); + + mol_graph.add_edge(vec_nodes[13],vec_nodes[6],Bond::Single); + mol_graph.add_edge(vec_nodes[14],vec_nodes[8],Bond::Single); + mol_graph.add_edge(vec_nodes[15],vec_nodes[10],Bond::Single); + mol_graph.add_edge(vec_nodes[16],vec_nodes[12],Bond::Single); + + mol_graph.add_edge(vec_nodes[12],vec_nodes[5],Bond::Single); + mol_graph.add_edge(vec_nodes[6],vec_nodes[7],Bond::Single); + mol_graph.add_edge(vec_nodes[8],vec_nodes[9],Bond::Single); + mol_graph.add_edge(vec_nodes[10],vec_nodes[11],Bond::Single); + + mol_graph.add_edge(vec_nodes[12],vec_nodes[11],Bond::Single); + mol_graph.add_edge(vec_nodes[6],vec_nodes[5],Bond::Single); + mol_graph.add_edge(vec_nodes[8],vec_nodes[7],Bond::Single); + mol_graph.add_edge(vec_nodes[10],vec_nodes[9],Bond::Single); + } + */ +} \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 29f3fdb1..dc13579a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -43,6 +43,9 @@ pub mod assembly; // Utility functions mod utils; +// Canonization function +pub mod canonize; + // Python library #[cfg(feature = "python")] pub mod python; diff --git a/src/molecule.rs b/src/molecule.rs index 6f19a90d..9a2c9fb3 100644 --- a/src/molecule.rs +++ b/src/molecule.rs @@ -22,7 +22,7 @@ use crate::utils::{edge_induced_subgraph, is_subset_connected}; pub(crate) type Index = u32; pub(crate) type MGraph = Graph; -type CGraph = Graph; +pub(crate) type CGraph = Graph; type EdgeSet = BTreeSet>; type NodeSet = BTreeSet>; @@ -200,12 +200,31 @@ pub enum Bond { Triple, } +impl Display for Bond { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match &self { + Bond::Single => write!(f, "1"), + Bond::Double => write!(f, "2"), + Bond::Triple => write!(f, "3"), + } + } +} + #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] -enum 
AtomOrBond { +pub enum AtomOrBond { Atom(Atom), Bond(Bond), } +impl Display for AtomOrBond { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match &self { + AtomOrBond::Atom(atom) => write!(f, "{}", atom.element().to_string()), + AtomOrBond::Bond(bond) => write!(f, "{}", bond.to_string()), + } + } +} + /// Thrown when `from::()` does not recieve a 1, 2, or 3. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct ParseBondError; From 2e26c54cca630041f14582d5eef21e903988ecc7 Mon Sep 17 00:00:00 2001 From: Devendra Parkar Date: Fri, 11 Jul 2025 23:45:00 +0530 Subject: [PATCH 2/8] incorporated clippy changes --- src/canonize.rs | 233 ++++++++++++++++++++++-------------------------- src/molecule.rs | 4 +- 2 files changed, 108 insertions(+), 129 deletions(-) diff --git a/src/canonize.rs b/src/canonize.rs index 219a7b88..e33e74d9 100644 --- a/src/canonize.rs +++ b/src/canonize.rs @@ -52,17 +52,17 @@ pub fn canonize(molecule: &Molecule) -> String { let new_bond_node_idx = mol_graph.add_node(AtomOrBond::Bond(*bond)); - if *vtx_map.get(start_atom_idx.index() as usize).unwrap() == NodeIndex::default() { - vtx_map[start_atom_idx.index() as usize] = mol_graph.add_node(AtomOrBond::Atom(*start_atom)) + if *vtx_map.get(start_atom_idx.index()).unwrap() == NodeIndex::default() { + vtx_map[start_atom_idx.index()] = mol_graph.add_node(AtomOrBond::Atom(*start_atom)) } - mol_graph.add_edge(*vtx_map.get(start_atom_idx.index() as usize).unwrap(), new_bond_node_idx, ()); + mol_graph.add_edge(*vtx_map.get(start_atom_idx.index()).unwrap(), new_bond_node_idx, ()); - if *vtx_map.get(end_atom_idx.index() as usize).unwrap() == NodeIndex::default() { - vtx_map[end_atom_idx.index() as usize] = mol_graph.add_node(AtomOrBond::Atom(*end_atom)) + if *vtx_map.get(end_atom_idx.index()).unwrap() == NodeIndex::default() { + vtx_map[end_atom_idx.index()] = mol_graph.add_node(AtomOrBond::Atom(*end_atom)) } - mol_graph.add_edge(*vtx_map.get(end_atom_idx.index() as usize).unwrap(), 
new_bond_node_idx, ()); + mol_graph.add_edge(*vtx_map.get(end_atom_idx.index()).unwrap(), new_bond_node_idx, ()); } @@ -70,10 +70,10 @@ pub fn canonize(molecule: &Molecule) -> String { for root in mol_graph.node_indices() { // for each node in the molecule graph create a signature /* - 1. create a DAG from each start node + 1. create a dag from each start node */ - let mut DAG = Graph::::new(); - let mut DAG_vertex_map: HashMap<(NodeIndex, u32), NodeIndex> = HashMap::new(); + let mut dag = Graph::::new(); + let mut dag_vertex_map: HashMap<(NodeIndex, u32), NodeIndex> = HashMap::new(); let mut mol_g_dag_vertex_map: Vec> = vec![vec![]; mol_graph.node_count()]; let mut dag_level_list: Vec> = vec![vec![]; mol_graph.node_count()]; let mut max_level: u32 = 0; @@ -85,113 +85,94 @@ pub fn canonize(molecule: &Molecule) -> String { visited.push_back((root, 0)); seen_edges_cache.insert((root, root), 0); - let root_vertex_id = DAG.add_node(DAGVert::new(root, [].to_vec(), 0)); + let root_vertex_id = dag.add_node(DAGVert::new(root, [].to_vec(), 0)); - DAG_vertex_map.insert((root, 0), root_vertex_id); + dag_vertex_map.insert((root, 0), root_vertex_id); dag_level_list[0].push(root_vertex_id); - mol_g_dag_vertex_map[root.index() as usize].push(root_vertex_id); + mol_g_dag_vertex_map[root.index()].push(root_vertex_id); loop { let (curr, level) = visited.pop_front().unwrap(); for neigh in mol_graph.neighbors(curr) { - let mut add_node_to_dag = false; + // let mut add_node_to_dag = false; //check if curr -> neigh or neigh -> curr already exists - match seen_edges_cache.get(&(curr, neigh)) { - Some(seen_at_level) => { - // edge already exists at a level above - if *seen_at_level < (level+1) { - continue; - } - else { - //edge at the same level - add_node_to_dag = true; - } - }, - None => { - //No edge found - add_node_to_dag = true; + if let Some(seen_at_level) = seen_edges_cache.get(&(curr, neigh)) { + // edge already exists at a level above + if *seen_at_level < (level+1) { + 
continue; } } - if add_node_to_dag { + // if add_node_to_dag { //check if a atom has already been processed during this current level's processing - match DAG_vertex_map.get(&(neigh, (level+1))) { - Some(present_node_idx) => { - seen_edges_cache.insert((curr, neigh), level+1); - seen_edges_cache.insert((neigh, curr), level+1); - //get parent node's NodeIndex - match DAG_vertex_map.get(&(curr, level)) { - Some(parent_node_idx) => { - DAG.add_edge(*parent_node_idx, *present_node_idx, ""); - // add as parent in the DAGvert - (&mut DAG[*present_node_idx]).parents.push(*parent_node_idx); - } - None => {} - } - //skip rest of the processing for the atom - continue; + if let Some(present_node_idx) = dag_vertex_map.get(&(neigh, (level+1))) { + seen_edges_cache.insert((curr, neigh), level+1); + seen_edges_cache.insert((neigh, curr), level+1); + //get parent node's NodeIndex + if let Some(parent_node_idx) = dag_vertex_map.get(&(curr, level)) { + dag.add_edge(*parent_node_idx, *present_node_idx, ""); + // add as parent in the DAGvert + dag[*present_node_idx].parents.push(*parent_node_idx); } - None => {} + //skip rest of the processing for the atom + continue; } - // haven't seen the atom before so add it to DAG + // haven't seen the atom before so add it to dag max_level = level + 1; seen_edges_cache.insert((curr, neigh), level+1); seen_edges_cache.insert((neigh, curr), level+1); - let child_node_idx = DAG.add_node(DAGVert::new(neigh, [].to_vec(), level+1)); - DAG_vertex_map.insert((neigh, level+1), child_node_idx); + let child_node_idx = dag.add_node(DAGVert::new(neigh, [].to_vec(), level+1)); + dag_vertex_map.insert((neigh, level+1), child_node_idx); // Overriding the map!!! 
neigh can be seen before in previous layer - mol_g_dag_vertex_map[neigh.index() as usize].push(child_node_idx); + mol_g_dag_vertex_map[neigh.index()].push(child_node_idx); // Insert into a level by level hashmap of dag nodes dag_level_list[(level+1) as usize].push(child_node_idx); visited.push_back((neigh, level+1)); //get parent node's NodeIndex - match DAG_vertex_map.get(&(curr, level)) { - Some(parent_node_idx) => { - DAG.add_edge(*parent_node_idx, child_node_idx, ""); - // add as parent in the DAGvert - (&mut DAG[child_node_idx]).parents.push(*parent_node_idx); - } - None => {} + if let Some(parent_node_idx) = dag_vertex_map.get(&(curr, level)) { + dag.add_edge(*parent_node_idx, child_node_idx, ""); + // add as parent in the DAGvert + dag[child_node_idx].parents.push(*parent_node_idx); } - } + // } } - if visited.len() == 0 { + if visited.is_empty() { break; } } } /* - 2.1. Initialize the molecule graph with color = 0 and invariant no. for each atom from (atom_type,#parents in DAG) - 2.2. Do lexicographical ordering of the (atom_type, #parents in DAG) + 2.1. Initialize the molecule graph with color = 0 and invariant no. for each atom from (atom_type,#parents in dag) + 2.2. Do lexicographical ordering of the (atom_type, #parents in dag) */ let mut extended_molg_atom_map: Vec = Vec::with_capacity(mol_graph.node_count()); let mut order_str_set: HashSet = HashSet::new(); // Each atom does not have just one vertex in dag!!! 
for atom_node in mol_graph.node_indices() { - // find unique parents for an atom's associated vertices in DAG - let atom_assoc_vert_list = &mol_g_dag_vertex_map[atom_node.index() as usize]; + // find unique parents for an atom's associated vertices in dag + let atom_assoc_vert_list = &mol_g_dag_vertex_map[atom_node.index()]; let mut parents= HashSet::new(); for vert_id in atom_assoc_vert_list { - for parent in &DAG[*vert_id].parents { + for parent in &dag[*vert_id].parents { parents.insert(parent); } } let parent_len = parents.len(); let atom_str = mol_graph[atom_node].to_string(); - let atom_order_str = format!("{}{}", atom_str, parent_len); + let atom_order_str = format!("{atom_str}{parent_len}"); order_str_set.insert(atom_order_str.clone()); - extended_molg_atom_map.insert(atom_node.index() as usize, MolAtomNode::new(0, 0, atom_order_str, parent_len as u32)); + extended_molg_atom_map.insert(atom_node.index(), MolAtomNode::new(0, 0, atom_order_str, parent_len as u32)); } // lexico-sort @@ -204,43 +185,43 @@ pub fn canonize(molecule: &Molecule) -> String { order_idx.insert(order_str.clone(), (idx as u32)+1); } - // update the molecule graph invariant based on order idx of lexico-sort of (atom_type,#parents in DAG) + // update the molecule graph invariant based on order idx of lexico-sort of (atom_type,#parents in dag) for atom_node in mol_graph.node_indices() { - extended_molg_atom_map[atom_node.index() as usize].inv = *order_idx.get(&extended_molg_atom_map[atom_node.index() as usize].order).unwrap(); + extended_molg_atom_map[atom_node.index()].inv = *order_idx.get(&extended_molg_atom_map[atom_node.index()].order).unwrap(); } // get the canonized string for current root atom - let canon_string = canonize_signature(&mol_graph, &mut DAG, &mut extended_molg_atom_map, &dag_level_list, max_level, 1, "".to_string()); + let canon_string = canonize_signature(&mol_graph, &mut dag, &mut extended_molg_atom_map, &dag_level_list, max_level, 1, "".to_string()); // 
lexico-compare strings to save the max one. if lexical_cmp(&max_string, &canon_string).is_lt() { max_string = canon_string } } - return max_string; + max_string } fn canonize_signature( mol_graph: &CGraph, - mut DAG: &mut Graph::, - mut extended_molg_atom_map: &mut Vec, + dag: &mut Graph::, + extended_molg_atom_map: &mut Vec, dag_level_list: &Vec>, max_level: u32, color_c: u32, s_max: String, ) -> String { // 1. get the invariants for each atom - invariant_atom(&mol_graph, &mut DAG, &mut extended_molg_atom_map, &dag_level_list, max_level); + invariant_atom(mol_graph, dag, extended_molg_atom_map, dag_level_list, max_level); // 2. generate orbits based on atom's invariant values let mut orbits: HashMap> = HashMap::new(); for atom in mol_graph.node_indices() { // let extended_atom = extended_molg_atom_map.get(&atom).unwrap(); - let extended_atom = &extended_molg_atom_map[atom.index() as usize]; + let extended_atom = &extended_molg_atom_map[atom.index()]; let atom_inv = extended_atom.inv; let parent_len = extended_atom.num_parents; - // only add atoms which have 2 or more parents in DAG + // only add atoms which have 2 or more parents in dag if parent_len >= 2 { orbits.entry(atom_inv).and_modify(|atom_list| atom_list.push(atom)).or_insert([atom].to_vec()); } @@ -252,67 +233,67 @@ fn canonize_signature( if max_orbit_len >= 2 { // find the orbits with max len of atoms - let max_orbits = orbits.keys().filter(|orbit| orbits.get(&orbit).unwrap().len() == max_orbit_len).collect::>(); + let max_orbits = orbits.keys().filter(|orbit| orbits.get(orbit).unwrap().len() == max_orbit_len).collect::>(); // if multiple then use orbit with min value - let min_orbit = (if (&max_orbits.len()).clone() > 1 {max_orbits.iter().min()} else {max_orbits.first()}).unwrap(); + let min_orbit = (if max_orbits.len() > 1 {max_orbits.iter().min()} else {max_orbits.first()}).unwrap(); let mut local_smax = s_max.clone(); // recurse further for each of the atom in such a orbit and generate a canonized 
signature by diff. the atoms in same orbit - for atom in orbits.get(&min_orbit).unwrap() { - extended_molg_atom_map[atom.index() as usize].color = color_c as u32; - local_smax = canonize_signature(&mol_graph, &mut DAG, &mut extended_molg_atom_map, &dag_level_list, max_level, color_c+1, local_smax); - extended_molg_atom_map[atom.index() as usize].color = 0; + for atom in orbits.get(min_orbit).unwrap() { + extended_molg_atom_map[atom.index()].color = color_c; + local_smax = canonize_signature(mol_graph, dag, extended_molg_atom_map, dag_level_list, max_level, color_c+1, local_smax); + extended_molg_atom_map[atom.index()].color = 0; } - return local_smax; + local_smax } else { // no need to recurse further and print the signature-string for atom in mol_graph.node_indices() { - let extended_atom = &extended_molg_atom_map[atom.index() as usize]; + let extended_atom = &extended_molg_atom_map[atom.index()]; let atom_inv = extended_atom.inv; let atom_color = extended_atom.color; let parent_len = extended_atom.num_parents; // first update any atom without a color to be same as its invariant value if (atom_color == 0) && (parent_len >= 2) { - extended_molg_atom_map[atom.index() as usize].color = atom_inv; + extended_molg_atom_map[atom.index()].color = atom_inv; } } - // start from root node of the DAG - let root_node = DAG.node_indices().find(|vert| DAG.neighbors_directed(*vert, Incoming).count() == 0).unwrap(); - let local_smax = print_signature_string(root_node, &DAG, &mol_graph, &extended_molg_atom_map, &mut vec![]); + // start from root node of the dag + let root_node = dag.node_indices().find(|vert| dag.neighbors_directed(*vert, Incoming).count() == 0).unwrap(); + let local_smax = print_signature_string(root_node, dag, mol_graph, extended_molg_atom_map, &mut vec![]); if local_smax.len() > s_max.len() { - return local_smax; + local_smax } else { - return s_max; + s_max } } } fn print_signature_string( vertex: NodeIndex, - DAG: &Graph::, + dag: &Graph::, mol_graph: 
&CGraph, extended_molg_atom_map: &Vec, edges: &mut Vec<(NodeIndex, NodeIndex)> ) -> String { let mut print_sign = String::new(); print_sign.push('['); - let atom_idx = DAG[vertex].atom_idx; - let atom = &mol_graph[DAG[vertex].atom_idx]; + let atom_idx = dag[vertex].atom_idx; + let atom = &mol_graph[dag[vertex].atom_idx]; print_sign.push_str(&atom.to_string()); - let atom_color = extended_molg_atom_map[atom_idx.index() as usize].color; + let atom_color = extended_molg_atom_map[atom_idx.index()].color; if atom_color != 0 { print_sign.push(','); print_sign.push_str(&atom_color.to_string()); } print_sign.push(']'); - let mut child_vec = DAG.neighbors_directed(vertex, Outgoing).collect::>(); - if child_vec.len() == 0 { return print_sign; } + let mut child_vec = dag.neighbors_directed(vertex, Outgoing).collect::>(); + if child_vec.is_empty() { print_sign} else { // sort children in descending order of inv - child_vec.sort_by(|vert_a, vert_b| DAG[*vert_b].inv.cmp(&DAG[*vert_a].inv)); + child_vec.sort_by(|vert_a, vert_b| dag[*vert_b].inv.cmp(&dag[*vert_a].inv)); let mut sub_print_sign = String::new(); @@ -321,15 +302,15 @@ fn print_signature_string( else { // if the edge is not already seen then add it to seen and generate signature-string for the child edges.push((vertex, child)); - sub_print_sign.push_str(&print_signature_string(child, &DAG, &mol_graph, &extended_molg_atom_map, edges)); + sub_print_sign.push_str(&print_signature_string(child, dag, mol_graph, extended_molg_atom_map, edges)); } } - if sub_print_sign.len() > 0 { + if !sub_print_sign.is_empty() { print_sign.push('('); print_sign.push_str(&sub_print_sign); print_sign.push(')'); } - return print_sign; + print_sign } } @@ -338,7 +319,7 @@ fn print_signature_string( */ fn invariant_atom( mol_graph: &CGraph, - mut DAG: &mut Graph::, + dag: &mut Graph::, extended_molg_atom_map: &mut Vec, dag_level_list: &Vec>, max_level: u32, @@ -348,39 +329,38 @@ fn invariant_atom( loop { // Unique invariant values let 
start_inv_atoms = HashSet::::from_iter(mol_graph.node_indices() - .into_iter() - .map(|atom_idx| extended_molg_atom_map[atom_idx.index() as usize].inv)).len(); + .map(|atom_idx| extended_molg_atom_map[atom_idx.index()].inv)).len(); /* - 3.1 Generate Invariants for DAG vertex + 3.1 Generate Invariants for dag vertex */ // first bottom-up - invariant_dag_vert(&mut DAG, &extended_molg_atom_map, &dag_level_list, max_level, true, initial); + invariant_dag_vert(dag, extended_molg_atom_map, dag_level_list, max_level, true, initial); initial = false; // then top-down - invariant_dag_vert(&mut DAG, &extended_molg_atom_map, &dag_level_list, max_level, false, initial); + invariant_dag_vert(dag, extended_molg_atom_map, dag_level_list, max_level, false, initial); - // Create a vector for each atom in molecule graph based on associated vertex in DAG + // Create a vector for each atom in molecule graph based on associated vertex in dag let mut order_map_vert_atom: Vec> = vec![vec![0;(max_level+1).try_into().unwrap()]; mol_graph.node_count()]; - //for reverse sorting use: max_level - DAG[vert].level as per paper - for vert in DAG.node_indices() { - order_map_vert_atom[DAG[vert].atom_idx.index() as usize][(max_level - DAG[vert].level) as usize] = DAG[vert].inv; + //for reverse sorting use: max_level - dag[vert].level as per paper + for vert in dag.node_indices() { + order_map_vert_atom[dag[vert].atom_idx.index()][(max_level - dag[vert].level) as usize] = dag[vert].inv; } let mut order_to_atom: HashMap> = HashMap::new(); // turn vectors into strings for sorting for atom in mol_graph.node_indices() { - let order_str = order_map_vert_atom[atom.index() as usize].clone().into_iter().map(|i| i.to_string()).collect::(); + let order_str = order_map_vert_atom[atom.index()].clone().into_iter().map(|i| i.to_string()).collect::(); order_to_atom.entry(order_str).and_modify(|atom_list| atom_list.push(atom)).or_insert([atom].to_vec()); } // lexico-sort the vectors-strings - let mut 
atom_ordered_vec: Vec<_> = order_to_atom.keys().into_iter().collect(); + let mut atom_ordered_vec: Vec<_> = order_to_atom.keys().collect(); atom_ordered_vec.string_sort_unstable(lexical_only_alnum_cmp); // atom_ordered_vec.string_sort_unstable(natural_cmp); // descend sort @@ -390,14 +370,13 @@ fn invariant_atom( for (idx, order) in atom_ordered_vec.iter().enumerate() { for atom in order_to_atom.get(*order).unwrap() { // extended_molg_atom_map.entry(*atom).and_modify(|atom_node| atom_node.inv = (idx as u32)+1); - extended_molg_atom_map[atom.index() as usize].inv = (idx as u32)+1; + extended_molg_atom_map[atom.index()].inv = (idx as u32)+1; } } let end_inv_atoms = HashSet::::from_iter(mol_graph.node_indices() - .into_iter() - .map(|atom_idx| extended_molg_atom_map[atom_idx.index() as usize].inv)).len(); + .map(|atom_idx| extended_molg_atom_map[atom_idx.index()].inv)).len(); // compare the no. of invariants of all the atoms with the one's they started from if start_inv_atoms == end_inv_atoms { @@ -417,35 +396,35 @@ fn invariant_atom( 3. Generate Invariant for Vertices */ fn invariant_dag_vert( - DAG: &mut Graph::, - extended_molg_atom_map: &Vec, - dag_level_list: &Vec>, + dag: &mut Graph::, + extended_molg_atom_map: &[MolAtomNode], + dag_level_list: &[Vec], max_level: u32, bottom: bool, initial: bool) { - // top-down or bottom-up calculation of invariants for each vertex in DAG + // top-down or bottom-up calculation of invariants for each vertex in dag let mut curr_lvl_range = if bottom {max_level} else {0}; loop { // for each vertex generate a invariant-string based on assoc. 
atom color and atom invariant + directed neighbors let mut order_str_set: HashSet = HashSet::new(); for vert in &dag_level_list[curr_lvl_range as usize] { - let atom_idx_for_vert = DAG[*vert].atom_idx; - let atom_node = &extended_molg_atom_map[atom_idx_for_vert.index() as usize]; + let atom_idx_for_vert = dag[*vert].atom_idx; + let atom_node = &extended_molg_atom_map[atom_idx_for_vert.index()]; let (atom_color, atom_inv) = (atom_node.color, atom_node.inv); - let vert_inv = DAG[*vert].inv; + let vert_inv = dag[*vert].inv; let mut vert_order; let mut child_inv_set: Vec = Vec::new(); - vert_order = if initial { format!("{}{}", atom_color, atom_inv) } else { format!("{}{}", atom_color, vert_inv) }; + vert_order = if initial { format!("{atom_color}{atom_inv}") } else { format!("{atom_color}{vert_inv}") }; if bottom { // vert_order = format!("{}{}", atom_color, atom_inv); - for vert_neigh in DAG.neighbors_directed(*vert, Outgoing) { - child_inv_set.push(DAG[vert_neigh].inv); + for vert_neigh in dag.neighbors_directed(*vert, Outgoing) { + child_inv_set.push(dag[vert_neigh].inv); } } else { // vert_order = format!("{}{}", atom_color, vert_inv); - for vert_neigh in DAG.neighbors_directed(*vert, Incoming) { - child_inv_set.push(DAG[vert_neigh].inv); + for vert_neigh in dag.neighbors_directed(*vert, Incoming) { + child_inv_set.push(dag[vert_neigh].inv); } } @@ -457,8 +436,8 @@ fn invariant_dag_vert( child_inv_set.reverse(); child_inv_set.iter().for_each(|val| vert_order.push_str(&format!("{}", *val))); - let vec_string = format!("{:0>20}", vert_order); - DAG[*vert].order = vec_string.clone(); + let vec_string = format!("{vert_order:0>20}"); + dag[*vert].order = vec_string.clone(); order_str_set.insert(vec_string); } @@ -475,7 +454,7 @@ fn invariant_dag_vert( // assign the invariant of vertex as the order of invariant-strings for vert in &dag_level_list[curr_lvl_range as usize] { - DAG[*vert].inv = (*order_idx.get(&DAG[*vert].order).unwrap())+1; + dag[*vert].inv = 
(*order_idx.get(&dag[*vert].order).unwrap())+1; } if bottom { diff --git a/src/molecule.rs b/src/molecule.rs index 9a2c9fb3..b9269095 100644 --- a/src/molecule.rs +++ b/src/molecule.rs @@ -219,8 +219,8 @@ pub enum AtomOrBond { impl Display for AtomOrBond { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match &self { - AtomOrBond::Atom(atom) => write!(f, "{}", atom.element().to_string()), - AtomOrBond::Bond(bond) => write!(f, "{}", bond.to_string()), + AtomOrBond::Atom(atom) => write!(f, "{}", atom.element()), + AtomOrBond::Bond(bond) => write!(f, "{}", bond), } } } From 38d99cdd036ded953bb5d1f53f6975d5e91f4d6a Mon Sep 17 00:00:00 2001 From: Devendra Parkar Date: Sat, 12 Jul 2025 00:00:26 +0530 Subject: [PATCH 3/8] incorporating clippy changes --- src/canonize.rs | 10 +++++----- src/molecule.rs | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/canonize.rs b/src/canonize.rs index e33e74d9..7f1d7aab 100644 --- a/src/canonize.rs +++ b/src/canonize.rs @@ -204,8 +204,8 @@ pub fn canonize(molecule: &Molecule) -> String { fn canonize_signature( mol_graph: &CGraph, dag: &mut Graph::, - extended_molg_atom_map: &mut Vec, - dag_level_list: &Vec>, + extended_molg_atom_map: &mut [MolAtomNode], + dag_level_list: &[Vec], max_level: u32, color_c: u32, s_max: String, @@ -274,7 +274,7 @@ fn print_signature_string( vertex: NodeIndex, dag: &Graph::, mol_graph: &CGraph, - extended_molg_atom_map: &Vec, + extended_molg_atom_map: &[MolAtomNode], edges: &mut Vec<(NodeIndex, NodeIndex)> ) -> String { let mut print_sign = String::new(); @@ -320,8 +320,8 @@ fn print_signature_string( fn invariant_atom( mol_graph: &CGraph, dag: &mut Graph::, - extended_molg_atom_map: &mut Vec, - dag_level_list: &Vec>, + extended_molg_atom_map: &mut [MolAtomNode], + dag_level_list: &[Vec], max_level: u32, ) { let mut count = 0; diff --git a/src/molecule.rs b/src/molecule.rs index b9269095..d332ad70 100644 --- a/src/molecule.rs +++ b/src/molecule.rs @@ -220,7 +220,7 @@ 
impl Display for AtomOrBond { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match &self { AtomOrBond::Atom(atom) => write!(f, "{}", atom.element()), - AtomOrBond::Bond(bond) => write!(f, "{}", bond), + AtomOrBond::Bond(bond) => write!(f, "{bond}"), } } } From babbf262f2881d22ac065f4e548dda4d156076cc Mon Sep 17 00:00:00 2001 From: Devendra Parkar Date: Sat, 12 Jul 2025 00:08:03 +0530 Subject: [PATCH 4/8] incorporated fmt changes --- src/canonize.rs | 454 ++++++++++++++++++++++++++++++------------------ 1 file changed, 286 insertions(+), 168 deletions(-) diff --git a/src/canonize.rs b/src/canonize.rs index 7f1d7aab..d4c79445 100644 --- a/src/canonize.rs +++ b/src/canonize.rs @@ -1,7 +1,11 @@ -use std::{collections::{HashMap, HashSet, VecDeque}}; -use lexical_sort::{lexical_cmp, lexical_only_alnum_cmp, natural_cmp, StringSort}; use crate::molecule::{AtomOrBond, CGraph, Molecule}; -use petgraph::{graph::{NodeIndex}, Direction::{Incoming, Outgoing}, Graph}; +use lexical_sort::{lexical_cmp, lexical_only_alnum_cmp, natural_cmp, StringSort}; +use petgraph::{ + graph::NodeIndex, + Direction::{Incoming, Outgoing}, + Graph, +}; +use std::collections::{HashMap, HashSet, VecDeque}; #[derive(Debug, Clone)] struct DAGVert { @@ -9,7 +13,7 @@ struct DAGVert { inv: u32, order: String, parents: Vec, - level: u32 + level: u32, } impl DAGVert { @@ -19,7 +23,7 @@ impl DAGVert { inv: 0, parents, level, - order: String::new() + order: String::new(), } } } @@ -29,12 +33,17 @@ struct MolAtomNode { color: u32, inv: u32, order: String, - num_parents: u32 + num_parents: u32, } impl MolAtomNode { pub fn new(color: u32, inv: u32, order: String, num_parents: u32) -> Self { - MolAtomNode {color, inv, order, num_parents} + MolAtomNode { + color, + inv, + order, + num_parents, + } } } @@ -56,14 +65,21 @@ pub fn canonize(molecule: &Molecule) -> String { vtx_map[start_atom_idx.index()] = mol_graph.add_node(AtomOrBond::Atom(*start_atom)) } - 
mol_graph.add_edge(*vtx_map.get(start_atom_idx.index()).unwrap(), new_bond_node_idx, ()); + mol_graph.add_edge( + *vtx_map.get(start_atom_idx.index()).unwrap(), + new_bond_node_idx, + (), + ); if *vtx_map.get(end_atom_idx.index()).unwrap() == NodeIndex::default() { vtx_map[end_atom_idx.index()] = mol_graph.add_node(AtomOrBond::Atom(*end_atom)) } - mol_graph.add_edge(*vtx_map.get(end_atom_idx.index()).unwrap(), new_bond_node_idx, ()); - + mol_graph.add_edge( + *vtx_map.get(end_atom_idx.index()).unwrap(), + new_bond_node_idx, + (), + ); } let mut max_string = String::new(); @@ -79,39 +95,38 @@ pub fn canonize(molecule: &Molecule) -> String { let mut max_level: u32 = 0; { - let mut seen_edges_cache: HashMap<(NodeIndex,NodeIndex), u32> = HashMap::new(); - let mut visited: VecDeque<(NodeIndex,u32)> = VecDeque::new(); - - visited.push_back((root, 0)); - seen_edges_cache.insert((root, root), 0); - - let root_vertex_id = dag.add_node(DAGVert::new(root, [].to_vec(), 0)); - - dag_vertex_map.insert((root, 0), root_vertex_id); - dag_level_list[0].push(root_vertex_id); - mol_g_dag_vertex_map[root.index()].push(root_vertex_id); - - loop { - - let (curr, level) = visited.pop_front().unwrap(); - - for neigh in mol_graph.neighbors(curr) { - // let mut add_node_to_dag = false; - - //check if curr -> neigh or neigh -> curr already exists - if let Some(seen_at_level) = seen_edges_cache.get(&(curr, neigh)) { - // edge already exists at a level above - if *seen_at_level < (level+1) { - continue; + let mut seen_edges_cache: HashMap<(NodeIndex, NodeIndex), u32> = HashMap::new(); + let mut visited: VecDeque<(NodeIndex, u32)> = VecDeque::new(); + + visited.push_back((root, 0)); + seen_edges_cache.insert((root, root), 0); + + let root_vertex_id = dag.add_node(DAGVert::new(root, [].to_vec(), 0)); + + dag_vertex_map.insert((root, 0), root_vertex_id); + dag_level_list[0].push(root_vertex_id); + mol_g_dag_vertex_map[root.index()].push(root_vertex_id); + + loop { + let (curr, level) = 
visited.pop_front().unwrap(); + + for neigh in mol_graph.neighbors(curr) { + // let mut add_node_to_dag = false; + + //check if curr -> neigh or neigh -> curr already exists + if let Some(seen_at_level) = seen_edges_cache.get(&(curr, neigh)) { + // edge already exists at a level above + if *seen_at_level < (level + 1) { + continue; + } } - } - // if add_node_to_dag { + // if add_node_to_dag { //check if a atom has already been processed during this current level's processing - if let Some(present_node_idx) = dag_vertex_map.get(&(neigh, (level+1))) { - seen_edges_cache.insert((curr, neigh), level+1); - seen_edges_cache.insert((neigh, curr), level+1); + if let Some(present_node_idx) = dag_vertex_map.get(&(neigh, (level + 1))) { + seen_edges_cache.insert((curr, neigh), level + 1); + seen_edges_cache.insert((neigh, curr), level + 1); //get parent node's NodeIndex if let Some(parent_node_idx) = dag_vertex_map.get(&(curr, level)) { dag.add_edge(*parent_node_idx, *present_node_idx, ""); @@ -121,48 +136,49 @@ pub fn canonize(molecule: &Molecule) -> String { //skip rest of the processing for the atom continue; } - + // haven't seen the atom before so add it to dag max_level = level + 1; - seen_edges_cache.insert((curr, neigh), level+1); - seen_edges_cache.insert((neigh, curr), level+1); - let child_node_idx = dag.add_node(DAGVert::new(neigh, [].to_vec(), level+1)); - dag_vertex_map.insert((neigh, level+1), child_node_idx); + seen_edges_cache.insert((curr, neigh), level + 1); + seen_edges_cache.insert((neigh, curr), level + 1); + let child_node_idx = dag.add_node(DAGVert::new(neigh, [].to_vec(), level + 1)); + dag_vertex_map.insert((neigh, level + 1), child_node_idx); // Overriding the map!!! 
neigh can be seen before in previous layer mol_g_dag_vertex_map[neigh.index()].push(child_node_idx); // Insert into a level by level hashmap of dag nodes - dag_level_list[(level+1) as usize].push(child_node_idx); - - visited.push_back((neigh, level+1)); + dag_level_list[(level + 1) as usize].push(child_node_idx); + + visited.push_back((neigh, level + 1)); //get parent node's NodeIndex if let Some(parent_node_idx) = dag_vertex_map.get(&(curr, level)) { dag.add_edge(*parent_node_idx, child_node_idx, ""); // add as parent in the DAGvert dag[child_node_idx].parents.push(*parent_node_idx); } - // } - } + // } + } - if visited.is_empty() { - break; + if visited.is_empty() { + break; + } } } - } /* 2.1. Initialize the molecule graph with color = 0 and invariant no. for each atom from (atom_type,#parents in dag) 2.2. Do lexicographical ordering of the (atom_type, #parents in dag) */ - let mut extended_molg_atom_map: Vec = Vec::with_capacity(mol_graph.node_count()); + let mut extended_molg_atom_map: Vec = + Vec::with_capacity(mol_graph.node_count()); let mut order_str_set: HashSet = HashSet::new(); // Each atom does not have just one vertex in dag!!! 
for atom_node in mol_graph.node_indices() { // find unique parents for an atom's associated vertices in dag let atom_assoc_vert_list = &mol_g_dag_vertex_map[atom_node.index()]; - let mut parents= HashSet::new(); + let mut parents = HashSet::new(); for vert_id in atom_assoc_vert_list { for parent in &dag[*vert_id].parents { parents.insert(parent); @@ -172,7 +188,10 @@ pub fn canonize(molecule: &Molecule) -> String { let atom_str = mol_graph[atom_node].to_string(); let atom_order_str = format!("{atom_str}{parent_len}"); order_str_set.insert(atom_order_str.clone()); - extended_molg_atom_map.insert(atom_node.index(), MolAtomNode::new(0, 0, atom_order_str, parent_len as u32)); + extended_molg_atom_map.insert( + atom_node.index(), + MolAtomNode::new(0, 0, atom_order_str, parent_len as u32), + ); } // lexico-sort @@ -182,16 +201,26 @@ pub fn canonize(molecule: &Molecule) -> String { let mut order_idx: HashMap = HashMap::new(); for (idx, order_str) in ordered_vec.iter().enumerate() { - order_idx.insert(order_str.clone(), (idx as u32)+1); + order_idx.insert(order_str.clone(), (idx as u32) + 1); } // update the molecule graph invariant based on order idx of lexico-sort of (atom_type,#parents in dag) for atom_node in mol_graph.node_indices() { - extended_molg_atom_map[atom_node.index()].inv = *order_idx.get(&extended_molg_atom_map[atom_node.index()].order).unwrap(); + extended_molg_atom_map[atom_node.index()].inv = *order_idx + .get(&extended_molg_atom_map[atom_node.index()].order) + .unwrap(); } // get the canonized string for current root atom - let canon_string = canonize_signature(&mol_graph, &mut dag, &mut extended_molg_atom_map, &dag_level_list, max_level, 1, "".to_string()); + let canon_string = canonize_signature( + &mol_graph, + &mut dag, + &mut extended_molg_atom_map, + &dag_level_list, + max_level, + 1, + "".to_string(), + ); // lexico-compare strings to save the max one. 
if lexical_cmp(&max_string, &canon_string).is_lt() { @@ -203,7 +232,7 @@ pub fn canonize(molecule: &Molecule) -> String { fn canonize_signature( mol_graph: &CGraph, - dag: &mut Graph::, + dag: &mut Graph, extended_molg_atom_map: &mut [MolAtomNode], dag_level_list: &[Vec], max_level: u32, @@ -211,7 +240,13 @@ fn canonize_signature( s_max: String, ) -> String { // 1. get the invariants for each atom - invariant_atom(mol_graph, dag, extended_molg_atom_map, dag_level_list, max_level); + invariant_atom( + mol_graph, + dag, + extended_molg_atom_map, + dag_level_list, + max_level, + ); // 2. generate orbits based on atom's invariant values let mut orbits: HashMap> = HashMap::new(); @@ -223,30 +258,52 @@ fn canonize_signature( let parent_len = extended_atom.num_parents; // only add atoms which have 2 or more parents in dag if parent_len >= 2 { - orbits.entry(atom_inv).and_modify(|atom_list| atom_list.push(atom)).or_insert([atom].to_vec()); + orbits + .entry(atom_inv) + .and_modify(|atom_list| atom_list.push(atom)) + .or_insert([atom].to_vec()); } } // 3. 
max length of any orbit let mut max_orbit_len = 0; - orbits.values().for_each(|orbit| if orbit.len() > max_orbit_len {max_orbit_len = orbit.len()}); + orbits.values().for_each(|orbit| { + if orbit.len() > max_orbit_len { + max_orbit_len = orbit.len() + } + }); if max_orbit_len >= 2 { // find the orbits with max len of atoms - let max_orbits = orbits.keys().filter(|orbit| orbits.get(orbit).unwrap().len() == max_orbit_len).collect::>(); + let max_orbits = orbits + .keys() + .filter(|orbit| orbits.get(orbit).unwrap().len() == max_orbit_len) + .collect::>(); // if multiple then use orbit with min value - let min_orbit = (if max_orbits.len() > 1 {max_orbits.iter().min()} else {max_orbits.first()}).unwrap(); - + let min_orbit = (if max_orbits.len() > 1 { + max_orbits.iter().min() + } else { + max_orbits.first() + }) + .unwrap(); + let mut local_smax = s_max.clone(); // recurse further for each of the atom in such a orbit and generate a canonized signature by diff. the atoms in same orbit for atom in orbits.get(min_orbit).unwrap() { extended_molg_atom_map[atom.index()].color = color_c; - local_smax = canonize_signature(mol_graph, dag, extended_molg_atom_map, dag_level_list, max_level, color_c+1, local_smax); + local_smax = canonize_signature( + mol_graph, + dag, + extended_molg_atom_map, + dag_level_list, + max_level, + color_c + 1, + local_smax, + ); extended_molg_atom_map[atom.index()].color = 0; } local_smax - } - else { + } else { // no need to recurse further and print the signature-string for atom in mol_graph.node_indices() { let extended_atom = &extended_molg_atom_map[atom.index()]; @@ -259,23 +316,31 @@ fn canonize_signature( } } // start from root node of the dag - let root_node = dag.node_indices().find(|vert| dag.neighbors_directed(*vert, Incoming).count() == 0).unwrap(); - let local_smax = print_signature_string(root_node, dag, mol_graph, extended_molg_atom_map, &mut vec![]); + let root_node = dag + .node_indices() + .find(|vert| dag.neighbors_directed(*vert, 
Incoming).count() == 0) + .unwrap(); + let local_smax = print_signature_string( + root_node, + dag, + mol_graph, + extended_molg_atom_map, + &mut vec![], + ); if local_smax.len() > s_max.len() { local_smax - } - else { - s_max + } else { + s_max } } } fn print_signature_string( vertex: NodeIndex, - dag: &Graph::, + dag: &Graph, mol_graph: &CGraph, extended_molg_atom_map: &[MolAtomNode], - edges: &mut Vec<(NodeIndex, NodeIndex)> + edges: &mut Vec<(NodeIndex, NodeIndex)>, ) -> String { let mut print_sign = String::new(); print_sign.push('['); @@ -289,20 +354,32 @@ fn print_signature_string( } print_sign.push(']'); - let mut child_vec = dag.neighbors_directed(vertex, Outgoing).collect::>(); - if child_vec.is_empty() { print_sign} - else { + let mut child_vec = dag + .neighbors_directed(vertex, Outgoing) + .collect::>(); + if child_vec.is_empty() { + print_sign + } else { // sort children in descending order of inv child_vec.sort_by(|vert_a, vert_b| dag[*vert_b].inv.cmp(&dag[*vert_a].inv)); - + let mut sub_print_sign = String::new(); - + for child in child_vec { - if let Some(_edge) = edges.iter().find(|egde| (egde.0 == vertex) && (egde.1 == child)) {} - else { + if let Some(_edge) = edges + .iter() + .find(|egde| (egde.0 == vertex) && (egde.1 == child)) + { + } else { // if the edge is not already seen then add it to seen and generate signature-string for the child edges.push((vertex, child)); - sub_print_sign.push_str(&print_signature_string(child, dag, mol_graph, extended_molg_atom_map, edges)); + sub_print_sign.push_str(&print_signature_string( + child, + dag, + mol_graph, + extended_molg_atom_map, + edges, + )); } } if !sub_print_sign.is_empty() { @@ -315,11 +392,11 @@ fn print_signature_string( } /* - 3. Generate Invariant for Atoms - */ +3. 
Generate Invariant for Atoms + */ fn invariant_atom( mol_graph: &CGraph, - dag: &mut Graph::, + dag: &mut Graph, extended_molg_atom_map: &mut [MolAtomNode], dag_level_list: &[Vec], max_level: u32, @@ -328,35 +405,62 @@ fn invariant_atom( let mut initial = true; loop { // Unique invariant values - let start_inv_atoms = HashSet::::from_iter(mol_graph.node_indices() - .map(|atom_idx| extended_molg_atom_map[atom_idx.index()].inv)).len(); + let start_inv_atoms = HashSet::::from_iter( + mol_graph + .node_indices() + .map(|atom_idx| extended_molg_atom_map[atom_idx.index()].inv), + ) + .len(); /* 3.1 Generate Invariants for dag vertex */ // first bottom-up - invariant_dag_vert(dag, extended_molg_atom_map, dag_level_list, max_level, true, initial); + invariant_dag_vert( + dag, + extended_molg_atom_map, + dag_level_list, + max_level, + true, + initial, + ); initial = false; // then top-down - invariant_dag_vert(dag, extended_molg_atom_map, dag_level_list, max_level, false, initial); + invariant_dag_vert( + dag, + extended_molg_atom_map, + dag_level_list, + max_level, + false, + initial, + ); // Create a vector for each atom in molecule graph based on associated vertex in dag - let mut order_map_vert_atom: Vec> = vec![vec![0;(max_level+1).try_into().unwrap()]; mol_graph.node_count()]; + let mut order_map_vert_atom: Vec> = + vec![vec![0; (max_level + 1).try_into().unwrap()]; mol_graph.node_count()]; //for reverse sorting use: max_level - dag[vert].level as per paper for vert in dag.node_indices() { - order_map_vert_atom[dag[vert].atom_idx.index()][(max_level - dag[vert].level) as usize] = dag[vert].inv; + order_map_vert_atom[dag[vert].atom_idx.index()] + [(max_level - dag[vert].level) as usize] = dag[vert].inv; } let mut order_to_atom: HashMap> = HashMap::new(); // turn vectors into strings for sorting for atom in mol_graph.node_indices() { - let order_str = order_map_vert_atom[atom.index()].clone().into_iter().map(|i| i.to_string()).collect::(); - 
order_to_atom.entry(order_str).and_modify(|atom_list| atom_list.push(atom)).or_insert([atom].to_vec()); + let order_str = order_map_vert_atom[atom.index()] + .clone() + .into_iter() + .map(|i| i.to_string()) + .collect::(); + order_to_atom + .entry(order_str) + .and_modify(|atom_list| atom_list.push(atom)) + .or_insert([atom].to_vec()); } // lexico-sort the vectors-strings @@ -370,13 +474,16 @@ fn invariant_atom( for (idx, order) in atom_ordered_vec.iter().enumerate() { for atom in order_to_atom.get(*order).unwrap() { // extended_molg_atom_map.entry(*atom).and_modify(|atom_node| atom_node.inv = (idx as u32)+1); - extended_molg_atom_map[atom.index()].inv = (idx as u32)+1; + extended_molg_atom_map[atom.index()].inv = (idx as u32) + 1; } } - - let end_inv_atoms = HashSet::::from_iter(mol_graph.node_indices() - .map(|atom_idx| extended_molg_atom_map[atom_idx.index()].inv)).len(); + let end_inv_atoms = HashSet::::from_iter( + mol_graph + .node_indices() + .map(|atom_idx| extended_molg_atom_map[atom_idx.index()].inv), + ) + .len(); // compare the no. of invariants of all the atoms with the one's they started from if start_inv_atoms == end_inv_atoms { @@ -388,93 +495,101 @@ fn invariant_atom( println!("breaking out because reached upper limit!"); break; } - count +=1; + count += 1; } } /* - 3. Generate Invariant for Vertices - */ +3. Generate Invariant for Vertices + */ fn invariant_dag_vert( - dag: &mut Graph::, + dag: &mut Graph, extended_molg_atom_map: &[MolAtomNode], dag_level_list: &[Vec], max_level: u32, bottom: bool, - initial: bool) { - // top-down or bottom-up calculation of invariants for each vertex in dag - let mut curr_lvl_range = if bottom {max_level} else {0}; - loop { - // for each vertex generate a invariant-string based on assoc. 
atom color and atom invariant + directed neighbors - let mut order_str_set: HashSet = HashSet::new(); - for vert in &dag_level_list[curr_lvl_range as usize] { - let atom_idx_for_vert = dag[*vert].atom_idx; - let atom_node = &extended_molg_atom_map[atom_idx_for_vert.index()]; - let (atom_color, atom_inv) = (atom_node.color, atom_node.inv); - let vert_inv = dag[*vert].inv; - let mut vert_order; - let mut child_inv_set: Vec = Vec::new(); - vert_order = if initial { format!("{atom_color}{atom_inv}") } else { format!("{atom_color}{vert_inv}") }; - if bottom { - // vert_order = format!("{}{}", atom_color, atom_inv); - for vert_neigh in dag.neighbors_directed(*vert, Outgoing) { - child_inv_set.push(dag[vert_neigh].inv); - } + initial: bool, +) { + // top-down or bottom-up calculation of invariants for each vertex in dag + let mut curr_lvl_range = if bottom { max_level } else { 0 }; + loop { + // for each vertex generate a invariant-string based on assoc. atom color and atom invariant + directed neighbors + let mut order_str_set: HashSet = HashSet::new(); + for vert in &dag_level_list[curr_lvl_range as usize] { + let atom_idx_for_vert = dag[*vert].atom_idx; + let atom_node = &extended_molg_atom_map[atom_idx_for_vert.index()]; + let (atom_color, atom_inv) = (atom_node.color, atom_node.inv); + let vert_inv = dag[*vert].inv; + let mut vert_order; + let mut child_inv_set: Vec = Vec::new(); + vert_order = if initial { + format!("{atom_color}{atom_inv}") + } else { + format!("{atom_color}{vert_inv}") + }; + if bottom { + // vert_order = format!("{}{}", atom_color, atom_inv); + for vert_neigh in dag.neighbors_directed(*vert, Outgoing) { + child_inv_set.push(dag[vert_neigh].inv); } - else { - // vert_order = format!("{}{}", atom_color, vert_inv); - for vert_neigh in dag.neighbors_directed(*vert, Incoming) { - child_inv_set.push(dag[vert_neigh].inv); - } + } else { + // vert_order = format!("{}{}", atom_color, vert_inv); + for vert_neigh in dag.neighbors_directed(*vert, Incoming) { 
+ child_inv_set.push(dag[vert_neigh].inv); } + } - while child_inv_set.len() < 10 { - child_inv_set.push(0); - } + while child_inv_set.len() < 10 { + child_inv_set.push(0); + } - child_inv_set.sort(); - child_inv_set.reverse(); - child_inv_set.iter().for_each(|val| vert_order.push_str(&format!("{}", *val))); + child_inv_set.sort(); + child_inv_set.reverse(); + child_inv_set + .iter() + .for_each(|val| vert_order.push_str(&format!("{}", *val))); - let vec_string = format!("{vert_order:0>20}"); - dag[*vert].order = vec_string.clone(); - order_str_set.insert(vec_string); - } + let vec_string = format!("{vert_order:0>20}"); + dag[*vert].order = vec_string.clone(); + order_str_set.insert(vec_string); + } - // lexico-sort the invariant-strings in descending order - let mut ordered_vec: Vec = order_str_set.into_iter().collect(); - ordered_vec.string_sort_unstable(natural_cmp); - ordered_vec.reverse(); + // lexico-sort the invariant-strings in descending order + let mut ordered_vec: Vec = order_str_set.into_iter().collect(); + ordered_vec.string_sort_unstable(natural_cmp); + ordered_vec.reverse(); - let mut order_idx: HashMap = HashMap::new(); + let mut order_idx: HashMap = HashMap::new(); - for (idx, order_str) in ordered_vec.iter().enumerate() { - order_idx.insert(order_str.clone(), idx as u32); - } - - // assign the invariant of vertex as the order of invariant-strings - for vert in &dag_level_list[curr_lvl_range as usize] { - dag[*vert].inv = (*order_idx.get(&dag[*vert].order).unwrap())+1; - } + for (idx, order_str) in ordered_vec.iter().enumerate() { + order_idx.insert(order_str.clone(), idx as u32); + } - if bottom { - if curr_lvl_range == 0 {break}; - curr_lvl_range -= 1; - } - else { - if curr_lvl_range == max_level {break}; - curr_lvl_range += 1; - } + // assign the invariant of vertex as the order of invariant-strings + for vert in &dag_level_list[curr_lvl_range as usize] { + dag[*vert].inv = (*order_idx.get(&dag[*vert].order).unwrap()) + 1; } -} + if bottom { + 
if curr_lvl_range == 0 { + break; + }; + curr_lvl_range -= 1; + } else { + if curr_lvl_range == max_level { + break; + }; + curr_lvl_range += 1; + } + } +} mod tests { #![allow(unused_imports)] - use std::fs; - use std::path::PathBuf; use super::*; use crate::loader; + use std::fs; + use std::path::PathBuf; #[test] fn canonize_benzene() { @@ -482,10 +597,13 @@ mod tests { let molfile = fs::read_to_string(path).expect("Cannot read the data file"); let molecule = loader::parse_molfile_str(&molfile).expect("Cannot parse molfile."); let canonical_repr = canonize(&molecule); - + println!("{}", canonical_repr); - - assert_eq!(canonical_repr, "[C]([2]([C]([1]([C]([2]([C,1])))))[1]([C]([2]([C]([1]([C,1]))))))") + + assert_eq!( + canonical_repr, + "[C]([2]([C]([1]([C]([2]([C,1])))))[1]([C]([2]([C]([1]([C,1]))))))" + ) } #[test] @@ -494,22 +612,22 @@ mod tests { let molfile = fs::read_to_string(path).expect("Cannot read the data file"); let molecule = loader::parse_molfile_str(&molfile).expect("Cannot parse molfile."); let canonical_repr = canonize(&molecule); - + println!("{}", canonical_repr); - + assert_eq!(canonical_repr, "[C]([2]([C]([1]([C]([2]([C]([1]([C]([2]([C]([1]([C]([2]([C,1])))))[1]([C,8]([2]([C]([1]([C,1])))))))))[1]([C,17]([2]([C]([1]([C,8])))))))))[1]([C]([2]([C]([1]([C,17]))))))") } // Dummy Molecule for testing /* fn canonize_dummy() { - + let mut mol_graph: Graph = Graph::::new_undirected(); let mut vec_nodes: Vec = Vec::new(); vec_nodes.push(NodeIndex::new(99999)); for i in 0..16 { - vec_nodes.push(mol_graph.add_node(Atom::new(Element::Carbon))); + vec_nodes.push(mol_graph.add_node(Atom::new(Element::Carbon))); } mol_graph.add_edge(vec_nodes[1],vec_nodes[2],Bond::Single); mol_graph.add_edge(vec_nodes[2],vec_nodes[3],Bond::Single); @@ -542,4 +660,4 @@ mod tests { mol_graph.add_edge(vec_nodes[10],vec_nodes[9],Bond::Single); } */ -} \ No newline at end of file +} From e0b1c7741f1fefed4e84b6ec415748e333352e5c Mon Sep 17 00:00:00 2001 From: agentelement Date: 
Fri, 11 Jul 2025 14:03:23 -0700 Subject: [PATCH 5/8] feat: swap out nauty canonize for handwritten canonize --- src/canonize.rs | 19 +++++++++++------ src/molecule.rs | 55 +++---------------------------------------------- 2 files changed, 16 insertions(+), 58 deletions(-) diff --git a/src/canonize.rs b/src/canonize.rs index d4c79445..970fe624 100644 --- a/src/canonize.rs +++ b/src/canonize.rs @@ -1,7 +1,8 @@ use crate::molecule::{AtomOrBond, CGraph, Molecule}; +use bit_set::BitSet; use lexical_sort::{lexical_cmp, lexical_only_alnum_cmp, natural_cmp, StringSort}; use petgraph::{ - graph::NodeIndex, + graph::{EdgeIndex, NodeIndex}, Direction::{Incoming, Outgoing}, Graph, }; @@ -47,13 +48,13 @@ impl MolAtomNode { } } -// Compute the assembly index of a molecule -pub fn canonize(molecule: &Molecule) -> String { +pub fn canonize(molecule: &Molecule, subgraph: &BitSet) -> String { let mgraph = molecule.graph(); let mut mol_graph = CGraph::new_undirected(); let mut vtx_map = vec![NodeIndex::default(); mgraph.node_count()]; - for bond_idx in mgraph.edge_indices() { + for subgraph_bond_idx in subgraph { + let bond_idx = EdgeIndex::new(subgraph_bond_idx); let bond = mgraph.edge_weight(bond_idx).unwrap(); let (start_atom_idx, end_atom_idx) = mgraph.edge_endpoints(bond_idx).unwrap(); let start_atom = mgraph.node_weight(start_atom_idx).unwrap(); @@ -596,7 +597,10 @@ mod tests { let path = PathBuf::from(format!("./data/checks/benzene.mol")); let molfile = fs::read_to_string(path).expect("Cannot read the data file"); let molecule = loader::parse_molfile_str(&molfile).expect("Cannot parse molfile."); - let canonical_repr = canonize(&molecule); + let canonical_repr = canonize( + &molecule, + &BitSet::from_iter(molecule.graph().edge_indices().map(|e| e.index())), + ); println!("{}", canonical_repr); @@ -611,7 +615,10 @@ mod tests { let path = PathBuf::from(format!("./data/checks/anthracene.mol")); let molfile = fs::read_to_string(path).expect("Cannot read the data file"); let 
molecule = loader::parse_molfile_str(&molfile).expect("Cannot parse molfile."); - let canonical_repr = canonize(&molecule); + let canonical_repr = canonize( + &molecule, + &BitSet::from_iter(molecule.graph().edge_indices().map(|e| e.index())), + ); println!("{}", canonical_repr); diff --git a/src/molecule.rs b/src/molecule.rs index d332ad70..1a27a8e6 100644 --- a/src/molecule.rs +++ b/src/molecule.rs @@ -10,7 +10,6 @@ use std::{ }; use bit_set::BitSet; -use graph_canon::CanonLabeling; use petgraph::{ algo::{is_isomorphic, is_isomorphic_subgraph}, dot::Dot, @@ -18,7 +17,7 @@ use petgraph::{ Undirected, }; -use crate::utils::{edge_induced_subgraph, is_subset_connected}; +use crate::{canonize::canonize, utils::{edge_induced_subgraph, is_subset_connected}}; pub(crate) type Index = u32; pub(crate) type MGraph = Graph; @@ -455,38 +454,12 @@ impl Molecule { ); } - fn subgraph_to_cgraph(&self, subgraph: &BitSet) -> CGraph { - let mut h = CGraph::with_capacity(subgraph.len(), 2 * subgraph.len()); - let mut vtx_map = HashMap::::new(); - for e in subgraph { - let eix = EdgeIndex::new(e); - let (src, dst) = self.graph.edge_endpoints(eix).unwrap(); - let src_w = self.graph.node_weight(src).unwrap(); - let dst_w = self.graph.node_weight(dst).unwrap(); - let e_w = self.graph.edge_weight(eix).unwrap(); - - let h_enode = h.add_node(AtomOrBond::Bond(*e_w)); - - let h_src = vtx_map - .entry(src) - .or_insert(h.add_node(AtomOrBond::Atom(*src_w))); - h.add_edge(*h_src, h_enode, ()); - - let h_dst = vtx_map - .entry(dst) - .or_insert(h.add_node(AtomOrBond::Atom(*dst_w))); - h.add_edge(*h_dst, h_enode, ()); - } - h - } - /// Return an iterator of bitsets from self containing all duplicate and /// non-overlapping pairs of isomorphic subgraphs pub fn matches(&self) -> impl Iterator { - let mut isomorphic_map = HashMap::, Vec>::new(); + let mut isomorphic_map = HashMap::>::new(); for subgraph in self.enumerate_noninduced_subgraphs() { - let cgraph = self.subgraph_to_cgraph(&subgraph); - let 
repr = CanonLabeling::new(&cgraph); + let repr = canonize(&self, &subgraph); isomorphic_map .entry(repr) @@ -595,28 +568,6 @@ mod tests { assert!(str::parse::("Foo").is_err()); } - #[test] - fn noncanonical() { - let mut p3_010 = Graph::::new_undirected(); - let n0 = p3_010.add_node(0); - let n1 = p3_010.add_node(1); - let n2 = p3_010.add_node(0); - p3_010.add_edge(n0, n1, ()); - p3_010.add_edge(n1, n2, ()); - - let mut p3_001 = Graph::::new_undirected(); - let n0 = p3_001.add_node(0); - let n1 = p3_001.add_node(0); - let n2 = p3_001.add_node(1); - p3_001.add_edge(n0, n1, ()); - p3_001.add_edge(n1, n2, ()); - - let repr_a = CanonLabeling::new(&p3_010); - let repr_b = CanonLabeling::new(&p3_001); - - assert_ne!(repr_a, repr_b); - } - #[test] fn nonisomorphic() { let mut p3_010 = Graph::::new_undirected(); From 178517f97918434f5275defa77d1d5a24a391113 Mon Sep 17 00:00:00 2001 From: Devendra Parkar Date: Thu, 17 Jul 2025 15:56:23 +0530 Subject: [PATCH 6/8] changed the canonize fn to return bytes vector --- src/canonize.rs | 18 ++++++++---------- src/molecule.rs | 2 +- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/src/canonize.rs b/src/canonize.rs index 970fe624..eeb5af03 100644 --- a/src/canonize.rs +++ b/src/canonize.rs @@ -48,7 +48,7 @@ impl MolAtomNode { } } -pub fn canonize(molecule: &Molecule, subgraph: &BitSet) -> String { +pub fn canonize(molecule: &Molecule, subgraph: &BitSet) -> Option> { let mgraph = molecule.graph(); let mut mol_graph = CGraph::new_undirected(); let mut vtx_map = vec![NodeIndex::default(); mgraph.node_count()]; @@ -228,7 +228,7 @@ pub fn canonize(molecule: &Molecule, subgraph: &BitSet) -> String { max_string = canon_string } } - max_string + Some(max_string.as_bytes().to_vec()) } fn canonize_signature( @@ -600,12 +600,11 @@ mod tests { let canonical_repr = canonize( &molecule, &BitSet::from_iter(molecule.graph().edge_indices().map(|e| e.index())), - ); - - println!("{}", canonical_repr); + ) + .unwrap(); assert_eq!( - 
canonical_repr, + String::from_utf8(canonical_repr).unwrap(), "[C]([2]([C]([1]([C]([2]([C,1])))))[1]([C]([2]([C]([1]([C,1]))))))" ) } @@ -618,11 +617,10 @@ mod tests { let canonical_repr = canonize( &molecule, &BitSet::from_iter(molecule.graph().edge_indices().map(|e| e.index())), - ); - - println!("{}", canonical_repr); + ) + .unwrap(); - assert_eq!(canonical_repr, "[C]([2]([C]([1]([C]([2]([C]([1]([C]([2]([C]([1]([C]([2]([C,1])))))[1]([C,8]([2]([C]([1]([C,1])))))))))[1]([C,17]([2]([C]([1]([C,8])))))))))[1]([C]([2]([C]([1]([C,17]))))))") + assert_eq!(String::from_utf8(canonical_repr).unwrap(), "[C]([2]([C]([1]([C]([2]([C]([1]([C]([2]([C]([1]([C]([2]([C,1])))))[1]([C,8]([2]([C]([1]([C,1])))))))))[1]([C,17]([2]([C]([1]([C,8])))))))))[1]([C]([2]([C]([1]([C,17]))))))") } // Dummy Molecule for testing diff --git a/src/molecule.rs b/src/molecule.rs index 1a27a8e6..79c10f34 100644 --- a/src/molecule.rs +++ b/src/molecule.rs @@ -459,7 +459,7 @@ impl Molecule { pub fn matches(&self) -> impl Iterator { let mut isomorphic_map = HashMap::>::new(); for subgraph in self.enumerate_noninduced_subgraphs() { - let repr = canonize(&self, &subgraph); + let repr = String::from_utf8(canonize(&self, &subgraph).unwrap()).unwrap(); isomorphic_map .entry(repr) From f7349ce1eec0ea180b2ff798eb2ae90b4376162c Mon Sep 17 00:00:00 2001 From: Devendra Parkar Date: Thu, 7 Aug 2025 17:57:03 +0530 Subject: [PATCH 7/8] removed Strings from canonization and lexical-sort dependency --- Cargo.lock | 16 ------- Cargo.toml | 1 - src/canonize.rs | 113 +++++++++++++++++++++--------------------------- 3 files changed, 50 insertions(+), 80 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index de1204fa..6347195a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -61,12 +61,6 @@ dependencies = [ "windows-sys 0.59.0", ] -[[package]] -name = "any_ascii" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70033777eb8b5124a81a1889416543dddef2de240019b674c81285a2635a7e1e" - 
[[package]] name = "anyhow" version = "1.0.98" @@ -83,7 +77,6 @@ dependencies = [ "criterion", "csv", "graph-canon", - "lexical-sort", "petgraph", "pyo3", "rayon", @@ -558,15 +551,6 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" -[[package]] -name = "lexical-sort" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c09e4591611e231daf4d4c685a66cb0410cc1e502027a20ae55f2bb9e997207a" -dependencies = [ - "any_ascii", -] - [[package]] name = "libc" version = "0.2.174" diff --git a/Cargo.toml b/Cargo.toml index 8137174d..31feb2f5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,6 @@ graph-canon = { git = "https://github.com/AgentElement/graph-canon", version = " petgraph = "0.6.5" pyo3 = { version = "0.24.1", features = ["abi3-py38", "extension-module"]} rayon = "1.10.0" -lexical-sort = "0.3.1" [dev-dependencies] criterion = "0.3" diff --git a/src/canonize.rs b/src/canonize.rs index eeb5af03..88ba0be0 100644 --- a/src/canonize.rs +++ b/src/canonize.rs @@ -1,6 +1,5 @@ use crate::molecule::{AtomOrBond, CGraph, Molecule}; use bit_set::BitSet; -use lexical_sort::{lexical_cmp, lexical_only_alnum_cmp, natural_cmp, StringSort}; use petgraph::{ graph::{EdgeIndex, NodeIndex}, Direction::{Incoming, Outgoing}, @@ -12,7 +11,7 @@ use std::collections::{HashMap, HashSet, VecDeque}; struct DAGVert { atom_idx: NodeIndex, inv: u32, - order: String, + order: Vec, parents: Vec, level: u32, } @@ -24,7 +23,7 @@ impl DAGVert { inv: 0, parents, level, - order: String::new(), + order: vec![], } } } @@ -33,12 +32,12 @@ impl DAGVert { struct MolAtomNode { color: u32, inv: u32, - order: String, + order: Vec, num_parents: u32, } impl MolAtomNode { - pub fn new(color: u32, inv: u32, order: String, num_parents: u32) -> Self { + pub fn new(color: u32, inv: u32, order: Vec, num_parents: u32) -> Self { MolAtomNode { color, inv, @@ -83,7 
+82,7 @@ pub fn canonize(molecule: &Molecule, subgraph: &BitSet) -> Option> { ); } - let mut max_string = String::new(); + let mut max_string = Vec::new(); for root in mol_graph.node_indices() { // for each node in the molecule graph create a signature /* @@ -173,21 +172,21 @@ pub fn canonize(molecule: &Molecule, subgraph: &BitSet) -> Option> { */ let mut extended_molg_atom_map: Vec = Vec::with_capacity(mol_graph.node_count()); - let mut order_str_set: HashSet = HashSet::new(); + let mut order_str_set: HashSet> = HashSet::new(); // Each atom does not have just one vertex in dag!!! for atom_node in mol_graph.node_indices() { // find unique parents for an atom's associated vertices in dag let atom_assoc_vert_list = &mol_g_dag_vertex_map[atom_node.index()]; - let mut parents = HashSet::new(); + let mut parents = BitSet::new(); for vert_id in atom_assoc_vert_list { for parent in &dag[*vert_id].parents { - parents.insert(parent); + parents.insert(parent.index()); } } let parent_len = parents.len(); - let atom_str = mol_graph[atom_node].to_string(); - let atom_order_str = format!("{atom_str}{parent_len}"); + let mut atom_order_str = mol_graph[atom_node].to_string().into_bytes(); + atom_order_str.extend_from_slice(&parent_len.to_string().into_bytes()); order_str_set.insert(atom_order_str.clone()); extended_molg_atom_map.insert( atom_node.index(), @@ -197,9 +196,9 @@ pub fn canonize(molecule: &Molecule, subgraph: &BitSet) -> Option> { // lexico-sort let mut ordered_vec: Vec<_> = order_str_set.into_iter().collect(); - ordered_vec.string_sort_unstable(lexical_only_alnum_cmp); + ordered_vec.sort(); - let mut order_idx: HashMap = HashMap::new(); + let mut order_idx: HashMap, u32> = HashMap::new(); for (idx, order_str) in ordered_vec.iter().enumerate() { order_idx.insert(order_str.clone(), (idx as u32) + 1); @@ -213,22 +212,23 @@ pub fn canonize(molecule: &Molecule, subgraph: &BitSet) -> Option> { } // get the canonized string for current root atom - let canon_string = 
canonize_signature( + let canon_string: Vec = canonize_signature( &mol_graph, &mut dag, &mut extended_molg_atom_map, &dag_level_list, max_level, 1, - "".to_string(), + vec![], ); // lexico-compare strings to save the max one. - if lexical_cmp(&max_string, &canon_string).is_lt() { + if max_string < canon_string { max_string = canon_string } } - Some(max_string.as_bytes().to_vec()) + // Some(max_string.as_bytes().to_vec()) + Some(max_string) } fn canonize_signature( @@ -238,8 +238,8 @@ fn canonize_signature( dag_level_list: &[Vec], max_level: u32, color_c: u32, - s_max: String, -) -> String { + s_max: Vec, +) -> Vec { // 1. get the invariants for each atom invariant_atom( mol_graph, @@ -342,18 +342,19 @@ fn print_signature_string( mol_graph: &CGraph, extended_molg_atom_map: &[MolAtomNode], edges: &mut Vec<(NodeIndex, NodeIndex)>, -) -> String { - let mut print_sign = String::new(); - print_sign.push('['); +) -> Vec { + let mut print_sign: Vec = vec![]; + print_sign.push(b'['); let atom_idx = dag[vertex].atom_idx; let atom = &mol_graph[dag[vertex].atom_idx]; - print_sign.push_str(&atom.to_string()); + // print_sign.push_str(&atom.to_string()); + print_sign.extend_from_slice(atom.to_string().as_bytes()); let atom_color = extended_molg_atom_map[atom_idx.index()].color; if atom_color != 0 { - print_sign.push(','); - print_sign.push_str(&atom_color.to_string()); + print_sign.push(b','); + print_sign.extend_from_slice(&atom_color.to_string().into_bytes()); } - print_sign.push(']'); + print_sign.push(b']'); let mut child_vec = dag .neighbors_directed(vertex, Outgoing) @@ -364,7 +365,7 @@ fn print_signature_string( // sort children in descending order of inv child_vec.sort_by(|vert_a, vert_b| dag[*vert_b].inv.cmp(&dag[*vert_a].inv)); - let mut sub_print_sign = String::new(); + let mut sub_print_sign = vec![]; for child in child_vec { if let Some(_edge) = edges @@ -374,7 +375,7 @@ fn print_signature_string( } else { // if the edge is not already seen then add it to seen and 
generate signature-string for the child edges.push((vertex, child)); - sub_print_sign.push_str(&print_signature_string( + sub_print_sign.extend_from_slice(&print_signature_string( child, dag, mol_graph, @@ -384,9 +385,9 @@ fn print_signature_string( } } if !sub_print_sign.is_empty() { - print_sign.push('('); - print_sign.push_str(&sub_print_sign); - print_sign.push(')'); + print_sign.push(b'('); + print_sign.extend_from_slice(&sub_print_sign); + print_sign.push(b')'); } print_sign } @@ -449,25 +450,21 @@ fn invariant_atom( [(max_level - dag[vert].level) as usize] = dag[vert].inv; } - let mut order_to_atom: HashMap> = HashMap::new(); + let mut order_to_atom: HashMap, Vec> = HashMap::new(); // turn vectors into strings for sorting for atom in mol_graph.node_indices() { - let order_str = order_map_vert_atom[atom.index()] - .clone() - .into_iter() - .map(|i| i.to_string()) - .collect::(); + let order_str = &order_map_vert_atom[atom.index()]; + order_to_atom - .entry(order_str) + .entry(order_str.to_vec()) .and_modify(|atom_list| atom_list.push(atom)) .or_insert([atom].to_vec()); } // lexico-sort the vectors-strings let mut atom_ordered_vec: Vec<_> = order_to_atom.keys().collect(); - atom_ordered_vec.string_sort_unstable(lexical_only_alnum_cmp); - // atom_ordered_vec.string_sort_unstable(natural_cmp); + atom_ordered_vec.sort(); // descend sort atom_ordered_vec.reverse(); @@ -515,52 +512,42 @@ fn invariant_dag_vert( let mut curr_lvl_range = if bottom { max_level } else { 0 }; loop { // for each vertex generate a invariant-string based on assoc. 
atom color and atom invariant + directed neighbors - let mut order_str_set: HashSet = HashSet::new(); + let mut order_str_set: HashSet> = HashSet::new(); for vert in &dag_level_list[curr_lvl_range as usize] { let atom_idx_for_vert = dag[*vert].atom_idx; let atom_node = &extended_molg_atom_map[atom_idx_for_vert.index()]; let (atom_color, atom_inv) = (atom_node.color, atom_node.inv); let vert_inv = dag[*vert].inv; - let mut vert_order; + let mut vert_order: Vec = vec![]; let mut child_inv_set: Vec = Vec::new(); - vert_order = if initial { - format!("{atom_color}{atom_inv}") - } else { - format!("{atom_color}{vert_inv}") - }; + vert_order.push(atom_color); + if initial { vert_order.push(atom_inv); } else { vert_order.push(vert_inv); } + if bottom { - // vert_order = format!("{}{}", atom_color, atom_inv); for vert_neigh in dag.neighbors_directed(*vert, Outgoing) { child_inv_set.push(dag[vert_neigh].inv); } } else { - // vert_order = format!("{}{}", atom_color, vert_inv); for vert_neigh in dag.neighbors_directed(*vert, Incoming) { child_inv_set.push(dag[vert_neigh].inv); } } - while child_inv_set.len() < 10 { - child_inv_set.push(0); - } - + // first sort (desc) among the children/parent child_inv_set.sort(); child_inv_set.reverse(); - child_inv_set - .iter() - .for_each(|val| vert_order.push_str(&format!("{}", *val))); - - let vec_string = format!("{vert_order:0>20}"); - dag[*vert].order = vec_string.clone(); - order_str_set.insert(vec_string); + vert_order.append(&mut child_inv_set); + + dag[*vert].order = vert_order.clone(); + order_str_set.insert(vert_order); } // lexico-sort the invariant-strings in descending order - let mut ordered_vec: Vec = order_str_set.into_iter().collect(); - ordered_vec.string_sort_unstable(natural_cmp); + let mut ordered_vec: Vec> = order_str_set.into_iter().collect(); + ordered_vec.sort(); ordered_vec.reverse(); - let mut order_idx: HashMap = HashMap::new(); + let mut order_idx: HashMap, u32> = HashMap::new(); for (idx, order_str) in 
ordered_vec.iter().enumerate() { order_idx.insert(order_str.clone(), idx as u32); From 2e5d97069c27ab6976eee44538366c0b035f5435 Mon Sep 17 00:00:00 2001 From: Devendra Parkar Date: Fri, 8 Aug 2025 18:10:05 +0530 Subject: [PATCH 8/8] Added basic documentation and comments for canonization --- src/canonize.rs | 177 +++++++++++++++++++++++++++--------------------- 1 file changed, 101 insertions(+), 76 deletions(-) diff --git a/src/canonize.rs b/src/canonize.rs index 88ba0be0..58cf0c2f 100644 --- a/src/canonize.rs +++ b/src/canonize.rs @@ -7,6 +7,8 @@ use petgraph::{ }; use std::collections::{HashMap, HashSet, VecDeque}; +// Struct for the vertex of the rooted-DAG, stores associated node's index +// invariant no. and auxiliary information needed while processing DAG. #[derive(Debug, Clone)] struct DAGVert { atom_idx: NodeIndex, @@ -28,6 +30,8 @@ impl DAGVert { } } +// Struct for the node of the molecule subgraph to store added information required +// by the Faulon et al. (2004) algorithm. #[derive(Debug, Clone)] struct MolAtomNode { color: u32, @@ -47,11 +51,17 @@ impl MolAtomNode { } } +/// Our implementation of [Faulon et al. (2004)](https://doi.org/10.1021/ci0341823). +/// Returns a canonical byte array representation of a molecule's subgraph +/// such that two isomorphic subgraphs will have same representation. pub fn canonize(molecule: &Molecule, subgraph: &BitSet) -> Option> { let mgraph = molecule.graph(); let mut mol_graph = CGraph::new_undirected(); let mut vtx_map = vec![NodeIndex::default(); mgraph.node_count()]; + // The Faulon et al. (2004) algorithm does not consider bond types while + // generating the canonization representation. 
Convert bonds to nodes + // and construct a new molecule subgraph for subgraph_bond_idx in subgraph { let bond_idx = EdgeIndex::new(subgraph_bond_idx); let bond = mgraph.edge_weight(bond_idx).unwrap(); @@ -82,15 +92,21 @@ pub fn canonize(molecule: &Molecule, subgraph: &BitSet) -> Option> { ); } + // Maintain lexicographically largest representation for the molecule subgraph let mut max_string = Vec::new(); + + // Construct a representation of a molecule subgraph for a starting node + // and save the lexicographically largest representation by iterating over + // all the nodes in the molecule subgraph for root in mol_graph.node_indices() { - // for each node in the molecule graph create a signature - /* - 1. create a dag from each start node - */ + // Step 1: Create a rooted Directed Acyclic Graph (DAG) of the + // molecule subgraph with the current node as the root let mut dag = Graph::::new(); + // Maintain a (molecule node, level) to DAG vertex map let mut dag_vertex_map: HashMap<(NodeIndex, u32), NodeIndex> = HashMap::new(); + // Maintain a map from each molecule node to its DAG vertices let mut mol_g_dag_vertex_map: Vec> = vec![vec![]; mol_graph.node_count()]; + // Maintain a level-by-level list of DAG vertices let mut dag_level_list: Vec> = vec![vec![]; mol_graph.node_count()]; let mut max_level: u32 = 0; @@ -107,57 +123,51 @@ pub fn canonize(molecule: &Molecule, subgraph: &BitSet) -> Option> { dag_level_list[0].push(root_vertex_id); mol_g_dag_vertex_map[root.index()].push(root_vertex_id); + // A Breadth-First order traversal to process the molecule subgraph + // level by level and using references instead of duplicating the + // previously seen subtrees to construct a rooted-DAG loop { let (curr, level) = visited.pop_front().unwrap(); for neigh in mol_graph.neighbors(curr) { - // let mut add_node_to_dag = false; - - //check if curr -> neigh or neigh -> curr already exists + // skip further processing if the edge to the neighbor is seen in one of + // the previous levels if let Some(seen_at_level) = 
seen_edges_cache.get(&(curr, neigh)) { - // edge already exists at a level above if *seen_at_level < (level + 1) { continue; } } - // if add_node_to_dag { - - //check if a atom has already been processed during this current level's processing + // Process only the edge to the neighbor if the neighbor was previously processed at + // at the current level if let Some(present_node_idx) = dag_vertex_map.get(&(neigh, (level + 1))) { seen_edges_cache.insert((curr, neigh), level + 1); seen_edges_cache.insert((neigh, curr), level + 1); - //get parent node's NodeIndex + if let Some(parent_node_idx) = dag_vertex_map.get(&(curr, level)) { dag.add_edge(*parent_node_idx, *present_node_idx, ""); - // add as parent in the DAGvert dag[*present_node_idx].parents.push(*parent_node_idx); } - //skip rest of the processing for the atom continue; } - // haven't seen the atom before so add it to dag + // Process the both newly seen neighbor node and the edge to it max_level = level + 1; seen_edges_cache.insert((curr, neigh), level + 1); seen_edges_cache.insert((neigh, curr), level + 1); let child_node_idx = dag.add_node(DAGVert::new(neigh, [].to_vec(), level + 1)); dag_vertex_map.insert((neigh, level + 1), child_node_idx); - // Overriding the map!!! neigh can be seen before in previous layer mol_g_dag_vertex_map[neigh.index()].push(child_node_idx); - // Insert into a level by level hashmap of dag nodes dag_level_list[(level + 1) as usize].push(child_node_idx); visited.push_back((neigh, level + 1)); - //get parent node's NodeIndex + if let Some(parent_node_idx) = dag_vertex_map.get(&(curr, level)) { dag.add_edge(*parent_node_idx, child_node_idx, ""); - // add as parent in the DAGvert dag[child_node_idx].parents.push(*parent_node_idx); } - // } } if visited.is_empty() { @@ -166,17 +176,17 @@ pub fn canonize(molecule: &Molecule, subgraph: &BitSet) -> Option> { } } - /* - 2.1. Initialize the molecule graph with color = 0 and invariant no. for each atom from (atom_type,#parents in dag) - 2.2. 
Do lexicographical ordering of the (atom_type, #parents in dag) - */ + // Step 2: + // First, initialize the molecule subgraph with color set to 0. + // Next, set the invariant no. for each node to the order index + // after lexicographical sorting based on the value (node_type, #parents in DAG) + // associated with each node let mut extended_molg_atom_map: Vec = Vec::with_capacity(mol_graph.node_count()); let mut order_str_set: HashSet> = HashSet::new(); - // Each atom does not have just one vertex in dag!!! + // set the value (node_type, #parents in DAG) for each node for atom_node in mol_graph.node_indices() { - // find unique parents for an atom's associated vertices in dag let atom_assoc_vert_list = &mol_g_dag_vertex_map[atom_node.index()]; let mut parents = BitSet::new(); for vert_id in atom_assoc_vert_list { @@ -185,6 +195,7 @@ pub fn canonize(molecule: &Molecule, subgraph: &BitSet) -> Option> { } } let parent_len = parents.len(); + let mut atom_order_str = mol_graph[atom_node].to_string().into_bytes(); atom_order_str.extend_from_slice(&parent_len.to_string().into_bytes()); order_str_set.insert(atom_order_str.clone()); @@ -194,7 +205,7 @@ pub fn canonize(molecule: &Molecule, subgraph: &BitSet) -> Option> { ); } - // lexico-sort + // lexicographical sorting based on the value (node_type, #parents in DAG) let mut ordered_vec: Vec<_> = order_str_set.into_iter().collect(); ordered_vec.sort(); @@ -204,14 +215,14 @@ pub fn canonize(molecule: &Molecule, subgraph: &BitSet) -> Option> { order_idx.insert(order_str.clone(), (idx as u32) + 1); } - // update the molecule graph invariant based on order idx of lexico-sort of (atom_type,#parents in dag) + // set the invariant no. 
for each node to the order index for atom_node in mol_graph.node_indices() { extended_molg_atom_map[atom_node.index()].inv = *order_idx .get(&extended_molg_atom_map[atom_node.index()].order) .unwrap(); } - // get the canonized string for current root atom + // get the canonized representation for current root atom let canon_string: Vec = canonize_signature( &mol_graph, &mut dag, @@ -222,15 +233,15 @@ pub fn canonize(molecule: &Molecule, subgraph: &BitSet) -> Option> { vec![], ); - // lexico-compare strings to save the max one. + // lexicographical compare the representations to save the larger one if max_string < canon_string { max_string = canon_string } } - // Some(max_string.as_bytes().to_vec()) Some(max_string) } +// Generate a canonical representation for the generated rooted-DAG. fn canonize_signature( mol_graph: &CGraph, dag: &mut Graph, @@ -240,7 +251,7 @@ fn canonize_signature( color_c: u32, s_max: Vec, ) -> Vec { - // 1. get the invariants for each atom + // Step 1: Calculate the invariants no. for each node in the molecule subgraph invariant_atom( mol_graph, dag, @@ -249,15 +260,16 @@ fn canonize_signature( max_level, ); - // 2. generate orbits based on atom's invariant values + // Step 2: Generate orbits based on nodes invariant no. + // A single orbit is created for each invariant value. Assign a + // node to an orbit if it has 2 or more parents in the DAG let mut orbits: HashMap> = HashMap::new(); for atom in mol_graph.node_indices() { - // let extended_atom = extended_molg_atom_map.get(&atom).unwrap(); let extended_atom = &extended_molg_atom_map[atom.index()]; let atom_inv = extended_atom.inv; let parent_len = extended_atom.num_parents; - // only add atoms which have 2 or more parents in dag + if parent_len >= 2 { orbits .entry(atom_inv) @@ -266,7 +278,6 @@ fn canonize_signature( } } - // 3. 
max length of any orbit let mut max_orbit_len = 0; orbits.values().for_each(|orbit| { if orbit.len() > max_orbit_len { @@ -274,13 +285,16 @@ fn canonize_signature( } }); + // Find if any orbit exists with 2 or more nodes + // then break the tie between these nodes by generating + // canonized representation but with different node colors if max_orbit_len >= 2 { - // find the orbits with max len of atoms + // First, find the orbits with max len of atoms let max_orbits = orbits .keys() .filter(|orbit| orbits.get(orbit).unwrap().len() == max_orbit_len) .collect::>(); - // if multiple then use orbit with min value + // If multiple then use orbit with min value let min_orbit = (if max_orbits.len() > 1 { max_orbits.iter().min() } else { @@ -289,7 +303,10 @@ fn canonize_signature( .unwrap(); let mut local_smax = s_max.clone(); - // recurse further for each of the atom in such a orbit and generate a canonized signature by diff. the atoms in same orbit + + // recurse further for each atom in such an orbit and generate a canonized representation + // by setting a different color for the atom. Use this new canonized representation if it is + // larger than previously calculated representation. for atom in orbits.get(min_orbit).unwrap() { extended_molg_atom_map[atom.index()].color = color_c; local_smax = canonize_signature( @@ -305,18 +322,21 @@ fn canonize_signature( } local_smax } else { - // no need to recurse further and print the signature-string + // Generate the signature representation. Use this new canonized representation + // if it is larger than previously calculated representation.
+ + // first update any node without a color to be same as its invariant value for atom in mol_graph.node_indices() { let extended_atom = &extended_molg_atom_map[atom.index()]; let atom_inv = extended_atom.inv; let atom_color = extended_atom.color; let parent_len = extended_atom.num_parents; - // first update any atom without a color to be same as its invariant value + if (atom_color == 0) && (parent_len >= 2) { extended_molg_atom_map[atom.index()].color = atom_inv; } } - // start from root node of the dag + let root_node = dag .node_indices() .find(|vert| dag.neighbors_directed(*vert, Incoming).count() == 0) @@ -336,6 +356,8 @@ fn canonize_signature( } } +// Constructs the signature representation for a vertex in DAG using the +// Depth-First order traversal on DAG. Called recursively. fn print_signature_string( vertex: NodeIndex, dag: &Graph, @@ -347,7 +369,6 @@ fn print_signature_string( print_sign.push(b'['); let atom_idx = dag[vertex].atom_idx; let atom = &mol_graph[dag[vertex].atom_idx]; - // print_sign.push_str(&atom.to_string()); print_sign.extend_from_slice(atom.to_string().as_bytes()); let atom_color = extended_molg_atom_map[atom_idx.index()].color; if atom_color != 0 { @@ -362,7 +383,7 @@ fn print_signature_string( if child_vec.is_empty() { print_sign } else { - // sort children in descending order of inv + // sort children in descending order of invariant no. child_vec.sort_by(|vert_a, vert_b| dag[*vert_b].inv.cmp(&dag[*vert_a].inv)); let mut sub_print_sign = vec![]; @@ -373,7 +394,8 @@ fn print_signature_string( .find(|egde| (egde.0 == vertex) && (egde.1 == child)) { } else { - // if the edge is not already seen then add it to seen and generate signature-string for the child + // if the edge is not already seen then mark it seen and generate + // signature representation for the child edges.push((vertex, child)); sub_print_sign.extend_from_slice(&print_signature_string( child, @@ -393,9 +415,10 @@ fn print_signature_string( } } -/* -3. 
Generate Invariant for Atoms - */ +// Calculate the invariant no. for the nodes in the molecule subgraph +// based on the invariant no. of vertices of the DAG. The process makes +// repeated passes to calculate invariant no. until it stabilizes ie. +// no. of unique invariant values don't change fn invariant_atom( mol_graph: &CGraph, dag: &mut Graph, @@ -406,7 +429,7 @@ fn invariant_atom( let mut count = 0; let mut initial = true; loop { - // Unique invariant values + // Calculate unique invariant values at the start of the pass let start_inv_atoms = HashSet::::from_iter( mol_graph .node_indices() @@ -414,11 +437,7 @@ fn invariant_atom( ) .len(); - /* - 3.1 Generate Invariants for dag vertex - */ - - // first bottom-up + // Step 1: Calculate the vertex invariants bottom up invariant_dag_vert( dag, extended_molg_atom_map, @@ -430,7 +449,7 @@ fn invariant_atom( initial = false; - // then top-down + // Step 2: Calculate the vertex invariants top down invariant_dag_vert( dag, extended_molg_atom_map, @@ -440,11 +459,14 @@ fn invariant_atom( initial, ); - // Create a vector for each atom in molecule graph based on associated vertex in dag + // Step 3: set the invariant no. for each node to the order index + // after sorting based on the invariant no. of vertices associated + // with each node + + // Create a vector to store invariant no. 
of the associated vertices let mut order_map_vert_atom: Vec> = vec![vec![0; (max_level + 1).try_into().unwrap()]; mol_graph.node_count()]; - //for reverse sorting use: max_level - dag[vert].level as per paper for vert in dag.node_indices() { order_map_vert_atom[dag[vert].atom_idx.index()] [(max_level - dag[vert].level) as usize] = dag[vert].inv; @@ -452,30 +474,28 @@ fn invariant_atom( let mut order_to_atom: HashMap, Vec> = HashMap::new(); - // turn vectors into strings for sorting for atom in mol_graph.node_indices() { let order_str = &order_map_vert_atom[atom.index()]; - + order_to_atom .entry(order_str.to_vec()) .and_modify(|atom_list| atom_list.push(atom)) .or_insert([atom].to_vec()); } - // lexico-sort the vectors-strings + // lexicographicaly sort the vectors in descending order let mut atom_ordered_vec: Vec<_> = order_to_atom.keys().collect(); atom_ordered_vec.sort(); - // descend sort atom_ordered_vec.reverse(); - // assign the invariant of atom as the order of vectors-strings + // assign the invariant of atom as the order index of sorted vectors for (idx, order) in atom_ordered_vec.iter().enumerate() { for atom in order_to_atom.get(*order).unwrap() { - // extended_molg_atom_map.entry(*atom).and_modify(|atom_node| atom_node.inv = (idx as u32)+1); extended_molg_atom_map[atom.index()].inv = (idx as u32) + 1; } } + // Calculate unique invariant values at the end of the pass let end_inv_atoms = HashSet::::from_iter( mol_graph .node_indices() @@ -483,12 +503,12 @@ fn invariant_atom( ) .len(); - // compare the no. of invariants of all the atoms with the one's they started from + // Stop the process if the invariant values stabilize if start_inv_atoms == end_inv_atoms { break; } - // Naive way of stopping + // Hard stopping the process if count > mol_graph.node_count() { println!("breaking out because reached upper limit!"); break; @@ -497,9 +517,10 @@ fn invariant_atom( } } -/* -3. Generate Invariant for Vertices - */ +// Calculate the invariant no. 
for the vertices of DAG based on +// associated node's color and invariant no. +// The invariant calculation can proceed bottom-up or top-down +// specified by bottom flag fn invariant_dag_vert( dag: &mut Graph, extended_molg_atom_map: &[MolAtomNode], @@ -508,10 +529,10 @@ fn invariant_dag_vert( bottom: bool, initial: bool, ) { - // top-down or bottom-up calculation of invariants for each vertex in dag let mut curr_lvl_range = if bottom { max_level } else { 0 }; + // for each vertex generate a ordering vector based on associated node's color, + // invariant no. and vertex's directed neighbor's invariant values loop { - // for each vertex generate a invariant-string based on assoc. atom color and atom invariant + directed neighbors let mut order_str_set: HashSet> = HashSet::new(); for vert in &dag_level_list[curr_lvl_range as usize] { let atom_idx_for_vert = dag[*vert].atom_idx; @@ -521,8 +542,12 @@ fn invariant_dag_vert( let mut vert_order: Vec = vec![]; let mut child_inv_set: Vec = Vec::new(); vert_order.push(atom_color); - if initial { vert_order.push(atom_inv); } else { vert_order.push(vert_inv); } - + if initial { + vert_order.push(atom_inv); + } else { + vert_order.push(vert_inv); + } + if bottom { for vert_neigh in dag.neighbors_directed(*vert, Outgoing) { child_inv_set.push(dag[vert_neigh].inv); @@ -533,16 +558,16 @@ fn invariant_dag_vert( } } - // first sort (desc) among the children/parent + // sort the invariant values of the directed neighbors child_inv_set.sort(); child_inv_set.reverse(); vert_order.append(&mut child_inv_set); - + dag[*vert].order = vert_order.clone(); order_str_set.insert(vert_order); } - // lexico-sort the invariant-strings in descending order + // lexicographicaly sort the vectors in descending order let mut ordered_vec: Vec> = order_str_set.into_iter().collect(); ordered_vec.sort(); ordered_vec.reverse(); @@ -553,7 +578,7 @@ fn invariant_dag_vert( order_idx.insert(order_str.clone(), idx as u32); } - // assign the invariant of vertex 
as the order of invariant-strings + // assign the invariant of the vertex as the order index of sorted vectors for vert in &dag_level_list[curr_lvl_range as usize] { dag[*vert].inv = (*order_idx.get(&dag[*vert].order).unwrap()) + 1; }