From a59b293cfee00944ceefba30009e4b8e1b855a25 Mon Sep 17 00:00:00 2001 From: Devendra Parkar Date: Tue, 17 Dec 2024 19:41:01 -0700 Subject: [PATCH 01/13] DAG data-structure for canonization code --- src/canonize.rs | 102 ++++++++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 4 ++ src/molecule.rs | 4 ++ 3 files changed, 110 insertions(+) create mode 100644 src/canonize.rs diff --git a/src/canonize.rs b/src/canonize.rs new file mode 100644 index 00000000..6c33cb8c --- /dev/null +++ b/src/canonize.rs @@ -0,0 +1,102 @@ +use std::collections::{HashMap, VecDeque}; + +use crate::molecule::Molecule; +use petgraph::{dot::{Config, Dot}, graph::NodeIndex, visit::DfsPostOrder, Directed, Graph}; + +// Compute the assembly index of a molecule +pub fn canonize(m: &Molecule) -> u32 { + let mol_graph = m.get_graph(); + for root in mol_graph.node_indices() { + // for each node in the molecule graph create a signature + + /* + 1. create a DAG from each start node + */ + let mut DAG = Graph::::new(); + + let mut seen_edges_cache: HashMap<(NodeIndex,NodeIndex), u32> = HashMap::new(); + + let mut DAG_vertex_map: HashMap = HashMap::new(); + + let mut visited: VecDeque<(NodeIndex,u32)> = VecDeque::new(); + visited.push_back((root, 0)); + + seen_edges_cache.insert((root, root), 0); + DAG_vertex_map.insert(format!("{:?}-{}", root, 0), DAG.add_node(format!("{:?}-{}", root, 0))); + + + loop { + + let (curr, level) = visited.pop_front().unwrap(); + + println!("Current:{:?}", curr); + + for neigh in mol_graph.neighbors(curr) { + println!("Neighbor:{:?}", neigh); + + let mut add_node_to_dag = false; + + //check if curr -> neigh or neigh -> curr already exists + match seen_edges_cache.get(&(curr, neigh)) { + Some(seen_at_level) => { + // edge already exists at a level above + println!("Already Seen: {:?} <--> {:?} at level {}", curr, neigh, *seen_at_level); + println!("Current Level: {}", level); + if *seen_at_level < (level+1) { + continue; + } + else { + //edge at the same level + add_node_to_dag = true; + } + }, + None => { + //No edge found + add_node_to_dag = true; + } + } + + if add_node_to_dag { + + //check if a atom has already been processed during this current level's processing ? + match DAG_vertex_map.get(&format!("{:?}-{}", neigh, (level+1))) { + Some(present_node_idx) => { + seen_edges_cache.insert((curr, neigh), level+1); + seen_edges_cache.insert((neigh, curr), level+1); + //get parent node's NodeIndex + match DAG_vertex_map.get(&format!("{:?}-{}", curr, level)) { + Some(parent_node_idx) => { + DAG.add_edge(*parent_node_idx, *present_node_idx, ""); + } + None => {} + } + //skip rest of the processing for the atom + continue; + } + None => {} + } + + seen_edges_cache.insert((curr, neigh), level+1); + seen_edges_cache.insert((neigh, curr), level+1); + let child_node_idx = DAG.add_node(format!("{:?}-{}", neigh, level+1)); + DAG_vertex_map.insert(format!("{:?}-{}", neigh, level+1), child_node_idx); + visited.push_back((neigh, level+1)); + //get parent node's NodeIndex + match DAG_vertex_map.get(&format!("{:?}-{}", curr, level)) { + Some(parent_node_idx) => { + DAG.add_edge(*parent_node_idx, child_node_idx, ""); + } + None => {} + } + } + } + + if visited.len() == 0 { + break; + } + } + println!("{:?}", Dot::with_config(&DAG, &[Config::EdgeNoLabel])); + break; + } + 0 +} diff --git a/src/main.rs b/src/main.rs index c10d4af2..a7227ab6 100644 --- a/src/main.rs +++ b/src/main.rs @@ -8,6 +8,9 @@ mod molecule; // Data IO mod loader; +// Canonizer +mod canonize; + // The hard bit: compute assembly index mod assembly; @@ -23,6 +26,7 @@ fn main() -> std::io::Result<()> { if let Some(path) = cli.path { let molecule = loader::parse(&path)?; + canonize::canonize(&molecule); let index = assembly::index(&molecule); println!("{}", index); } diff --git a/src/molecule.rs b/src/molecule.rs index 1677c4ea..3f4b6da0 100644 --- a/src/molecule.rs +++ b/src/molecule.rs @@ -99,6 +99,10 @@ impl Molecule { Self { graph: g } } + pub fn get_graph(&self) -> &MGraph { + &self.graph + } + pub fn single_bond() -> Self { let mut g = Graph::default(); let u = g.add_node(Atom { From f40ad5666747555cb55f7ccb1c40107c9c4a1ca8 Mon Sep 17 00:00:00 2001 From: Devendra Parkar Date: Wed, 18 Dec 2024 17:27:05 -0700 Subject: [PATCH 02/13] canonization base algorithm added --- Cargo.toml | 1 + src/canonize.rs | 242 +++++++++++++++++++++++++++++++++++++++++++++--- src/molecule.rs | 8 ++ 3 files changed, 239 insertions(+), 12 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 3ed075a2..4d141142 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,3 +6,4 @@ edition = "2021" [dependencies] clap = { version = "4.5.23", features = ["derive"] } petgraph = "0.6.5" +lexical-sort = "0.3.1" \ No newline at end of file diff --git a/src/canonize.rs b/src/canonize.rs index 6c33cb8c..1fc3ca70 100644 --- a/src/canonize.rs +++ b/src/canonize.rs @@ -1,29 +1,74 @@ -use std::collections::{HashMap, VecDeque}; +use std::collections::{btree_set::Range, HashMap, HashSet, VecDeque}; +use lexical_sort::{StringSort, lexical_only_alnum_cmp}; use crate::molecule::Molecule; -use petgraph::{dot::{Config, Dot}, graph::NodeIndex, visit::DfsPostOrder, Directed, Graph}; +use petgraph::{adj::Neighbors, dot::{Config, Dot}, graph::NodeIndex, visit::DfsPostOrder, Directed, Direction::{self, Incoming, Outgoing}, Graph}; + +#[derive(Debug, Clone)] +struct DAGVert { + atom_idx: NodeIndex, + inv: u32, + order: String, + parents: Vec, + level: u32 +} + +impl DAGVert { + pub fn new(atom_idx: NodeIndex, parents: Vec, level: u32) -> Self { + DAGVert { + atom_idx, + inv: 0, + parents, + level, + order: String::new() + } + } +} + +#[derive(Debug, Clone)] +struct MolAtomNode { + color: u32, + inv: u32, + order: String +} + +impl MolAtomNode { + pub fn new(color: u32, inv: u32, order: String) -> Self { + MolAtomNode {color, inv, order} + } +} // Compute the assembly index of a molecule pub fn canonize(m: &Molecule) -> u32 { - let mol_graph = m.get_graph(); + let mol_graph = m.get_graph().clone(); for root in mol_graph.node_indices() { // for each node in the molecule graph create a signature - /* 1. create a DAG from each start node */ - let mut DAG = Graph::::new(); - - let mut seen_edges_cache: HashMap<(NodeIndex,NodeIndex), u32> = HashMap::new(); + let mut DAG = Graph::::new(); let mut DAG_vertex_map: HashMap = HashMap::new(); + let mut mol_g_dag_vertex_map: HashMap = HashMap::new(); + + let mut dag_level_list: HashMap> = HashMap::new(); + + let mut max_level: u32 = 0; + + { + let mut seen_edges_cache: HashMap<(NodeIndex,NodeIndex), u32> = HashMap::new(); + let mut visited: VecDeque<(NodeIndex,u32)> = VecDeque::new(); + visited.push_back((root, 0)); seen_edges_cache.insert((root, root), 0); - DAG_vertex_map.insert(format!("{:?}-{}", root, 0), DAG.add_node(format!("{:?}-{}", root, 0))); - + // DAG_vertex_map.insert(format!("{:?}-{}", root, 0), DAG.add_node(format!("{:?}-{}", root, 0))); + let root_vertex_id = DAG.add_node(DAGVert::new(root, [].to_vec(), 0)); + DAG_vertex_map.insert(format!("{:?}-{}", root, 0), root_vertex_id); + dag_level_list.insert(0, [root_vertex_id].to_vec()); + mol_g_dag_vertex_map.insert(root, root_vertex_id); loop { @@ -67,6 +112,8 @@ pub fn canonize(m: &Molecule) -> u32 { match DAG_vertex_map.get(&format!("{:?}-{}", curr, level)) { Some(parent_node_idx) => { DAG.add_edge(*parent_node_idx, *present_node_idx, ""); + // add as parent in the DAGvert + (&mut DAG[*present_node_idx]).parents.push(*parent_node_idx); } None => {} } @@ -75,16 +122,26 @@ pub fn canonize(m: &Molecule) -> u32 { } None => {} } - + + // haven't seen the atom before so add it to DAG + max_level = level + 1; seen_edges_cache.insert((curr, neigh), level+1); seen_edges_cache.insert((neigh, curr), level+1); - let child_node_idx = DAG.add_node(format!("{:?}-{}", neigh, level+1)); + // let child_node_idx = DAG.add_node(format!("{:?}-{}", neigh, level+1)); + let child_node_idx = DAG.add_node(DAGVert::new(neigh, [].to_vec(), level+1)); DAG_vertex_map.insert(format!("{:?}-{}", neigh, level+1), child_node_idx); + mol_g_dag_vertex_map.insert(neigh, child_node_idx); + + // Insert into a level by level hashmap of dag nodes + dag_level_list.entry((level+1)).and_modify(|level_vert_list| level_vert_list.push(child_node_idx)).or_insert([child_node_idx].to_vec()); + visited.push_back((neigh, level+1)); //get parent node's NodeIndex match DAG_vertex_map.get(&format!("{:?}-{}", curr, level)) { Some(parent_node_idx) => { DAG.add_edge(*parent_node_idx, child_node_idx, ""); + // add as parent in the DAGvert + (&mut DAG[child_node_idx]).parents.push(*parent_node_idx); } None => {} } @@ -95,8 +152,169 @@ pub fn canonize(m: &Molecule) -> u32 { break; } } + println!("DAG before invariants"); println!("{:?}", Dot::with_config(&DAG, &[Config::EdgeNoLabel])); - break; + } + + // println!("DAG Lvl by lvl list"); + // println!("{:?}", dag_level_list); + // println!("Max lvl: {}",max_level); + /* + 2.1. Initialize the molecule graph with color = 0 and invariant no. for each atom from (atom_type,#parents in DAG) + 2.2. Do lexicographical ordering of the (atom_type, #parents in DAG) + */ + let mut extended_molg_atom_map: HashMap = HashMap::new(); + let mut order_str_set: HashSet = HashSet::new(); + for atom_node in mol_graph.node_indices() { + let dag_vert = mol_g_dag_vertex_map.get(&atom_node).unwrap(); + let parent_len = DAG[*dag_vert].parents.len(); + let atom_str = mol_graph[atom_node].get_element(); + let atom_order_str = format!("{}{}", atom_str, parent_len); + order_str_set.insert(atom_order_str.clone()); + extended_molg_atom_map.insert(atom_node, MolAtomNode::new(0, 0, atom_order_str)); + } + + // sort + let mut ordered_vec: Vec<_> = order_str_set.into_iter().collect(); + ordered_vec.string_sort_unstable(lexical_only_alnum_cmp); + + // println!("{:?}", ordered_vec); + + let mut order_idx:HashMap = HashMap::new(); + + for (idx, order_str) in ordered_vec.iter().enumerate() { + order_idx.insert(order_str.clone(), idx as u32); + } + + for atom_node in mol_graph.node_indices() { + extended_molg_atom_map.entry(atom_node).and_modify(|ext_molg_atom| ext_molg_atom.inv = *order_idx.get(&ext_molg_atom.order).unwrap()); + } + + // println!("Extended Atom Entry"); + // println!("{:?}", extended_molg_atom_map); + + /* + 3. Generate Invariant for Atoms + */ + + loop { + // let start_invariants_atoms; + + let start_inv_atoms = mol_graph.node_indices() + .into_iter() + .map(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv.to_string()) + .collect::(); + + println!("Begin Atom Invariants: {}", start_inv_atoms); + + /* + 3.1 Generate Invariants for DAG vertex + */ + invariant_dag(&mut DAG, &extended_molg_atom_map, &dag_level_list, max_level, true); + + // println!("DAG after bottom invariants"); + // println!("{:?}", Dot::with_config(&DAG, &[Config::EdgeNoLabel])); + + invariant_dag(&mut DAG, &extended_molg_atom_map, &dag_level_list, max_level, false); + + // println!("DAG after top invariants"); + // println!("{:?}", Dot::with_config(&DAG, &[Config::EdgeNoLabel])); + + let mut order_map_vert_atom: HashMap> = HashMap::new(); + for atom in mol_graph.node_indices() { + order_map_vert_atom.insert(atom, vec![0;(max_level+1).try_into().unwrap()]); + } + + for vert in DAG.node_indices() { + order_map_vert_atom.entry(DAG[vert].atom_idx).and_modify(|order_vec| order_vec[DAG[vert].level as usize] = DAG[vert].inv); + } + + let mut order_to_atom: HashMap> = HashMap::new(); + + for atom in mol_graph.node_indices() { + let order_str = order_map_vert_atom.get(&atom).unwrap().into_iter().map(|i| i.to_string()).collect::(); + order_to_atom.entry(order_str).and_modify(|atom_list| atom_list.push(atom)).or_insert([atom].to_vec()); + } + + // sort in descending order + let mut atom_ordered_vec: Vec<_> = order_to_atom.keys().into_iter().collect(); + atom_ordered_vec.string_sort_unstable(lexical_only_alnum_cmp); + atom_ordered_vec.reverse(); + + for (idx, order) in atom_ordered_vec.iter().enumerate() { + for atom in order_to_atom.get(*order).unwrap() { + extended_molg_atom_map.entry(*atom).and_modify(|atom_node| atom_node.inv = idx as u32); + } + } + + let end_inv_atoms = mol_graph.node_indices() + .into_iter() + .map(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv.to_string()) + .collect::(); + + println!("End Atom Invariants: {}", end_inv_atoms); + + if start_inv_atoms == end_inv_atoms {break} + } + break; } 0 } + +fn invariant_dag( + DAG: &mut Graph::, + extended_molg_atom_map: &HashMap, + dag_level_list: &HashMap>, + max_level: u32, + bottom: bool) { + // let mut vert_order_map:HashMap = HashMap::new(); + let mut curr_lvl_range = if bottom {max_level} else {0}; + loop { + let mut order_str_set: HashSet = HashSet::new(); + for vert in dag_level_list.get(&curr_lvl_range).unwrap() { + let atom_idx_for_vert = DAG[*vert].atom_idx; + let atom_node = extended_molg_atom_map.get(&atom_idx_for_vert).unwrap(); + let (atom_color, atom_inv) = (atom_node.color, atom_node.inv); + let mut vert_order = format!("{}{}", atom_color, atom_inv); + // let vert_neighbor_list + if bottom { + for vert_neigh in DAG.neighbors_directed(*vert, Outgoing) { + vert_order.push_str(&format!("{}", DAG[vert_neigh].inv)); + } + } + else { + for vert_neigh in DAG.neighbors_directed(*vert, Incoming) { + vert_order.push_str(&format!("{}", DAG[vert_neigh].inv)); + } + } + + DAG[*vert].order = vert_order.clone(); + order_str_set.insert(vert_order); + } + + let mut ordered_vec: Vec<_> = order_str_set.into_iter().collect(); + ordered_vec.string_sort_unstable(lexical_only_alnum_cmp); + ordered_vec.reverse(); + + println!("{:?}", ordered_vec); + + let mut order_idx:HashMap = HashMap::new(); + + for (idx, order_str) in ordered_vec.iter().enumerate() { + order_idx.insert(order_str.clone(), idx as u32); + } + + for vert in dag_level_list.get(&curr_lvl_range).unwrap() { + DAG[*vert].inv = (*order_idx.get(&DAG[*vert].order).unwrap())+1; + } + + if bottom { + if curr_lvl_range == 0 {break}; + curr_lvl_range -= 1; + } + else { + if curr_lvl_range == max_level {break}; + curr_lvl_range += 1; + } + } +} diff --git a/src/molecule.rs b/src/molecule.rs index 3f4b6da0..57594940 100644 --- a/src/molecule.rs +++ b/src/molecule.rs @@ -41,6 +41,14 @@ impl Atom { capacity: 0, } } + pub fn get_element(&self) -> String { + (match self.element { + Element::Oxygen => "O", + Element::Nitrogen => "N", + Element::Carbon => "C", + Element::Hydrogen => "H", + }).to_string() + } } impl Molecule { From 725bc5111ecf6f43f30682a7ff0347c83e2254ad Mon Sep 17 00:00:00 2001 From: Devendra Parkar Date: Thu, 19 Dec 2024 15:32:32 -0700 Subject: [PATCH 03/13] Added the signature generation code for canonization --- src/canonize.rs | 275 ++++++++++++++++++++++++++++++++++++------------ src/main.rs | 2 +- 2 files changed, 206 insertions(+), 71 deletions(-) diff --git a/src/canonize.rs b/src/canonize.rs index 1fc3ca70..e5c8cd30 100644 --- a/src/canonize.rs +++ b/src/canonize.rs @@ -1,8 +1,8 @@ use std::collections::{btree_set::Range, HashMap, HashSet, VecDeque}; -use lexical_sort::{StringSort, lexical_only_alnum_cmp}; +use lexical_sort::{lexical_cmp, lexical_only_alnum_cmp, StringSort}; -use crate::molecule::Molecule; -use petgraph::{adj::Neighbors, dot::{Config, Dot}, graph::NodeIndex, visit::DfsPostOrder, Directed, Direction::{self, Incoming, Outgoing}, Graph}; +use crate::molecule::{Atom, Bond, Molecule}; +use petgraph::{adj::Neighbors, dot::{Config, Dot}, graph::NodeIndex, visit::DfsPostOrder, Directed, Direction::{self, Incoming, Outgoing}, Graph, Undirected}; #[derive(Debug, Clone)] struct DAGVert { @@ -39,8 +39,9 @@ impl MolAtomNode { } // Compute the assembly index of a molecule -pub fn canonize(m: &Molecule) -> u32 { +pub fn canonize(m: &Molecule) -> String { let mol_graph = m.get_graph().clone(); + let mut max_string = String::new(); for root in mol_graph.node_indices() { // for each node in the molecule graph create a signature /* @@ -74,10 +75,10 @@ pub fn canonize(m: &Molecule) -> u32 { let (curr, level) = visited.pop_front().unwrap(); - println!("Current:{:?}", curr); + // println!("Current:{:?}", curr); for neigh in mol_graph.neighbors(curr) { - println!("Neighbor:{:?}", neigh); + // println!("Neighbor:{:?}", neigh); let mut add_node_to_dag = false; @@ -85,8 +86,8 @@ pub fn canonize(m: &Molecule) -> u32 { match seen_edges_cache.get(&(curr, neigh)) { Some(seen_at_level) => { // edge already exists at a level above - println!("Already Seen: {:?} <--> {:?} at level {}", curr, neigh, *seen_at_level); - println!("Current Level: {}", level); + // println!("Already Seen: {:?} <--> {:?} at level {}", curr, neigh, *seen_at_level); + // println!("Current Level: {}", level); if *seen_at_level < (level+1) { continue; } @@ -152,9 +153,9 @@ pub fn canonize(m: &Molecule) -> u32 { break; } } - println!("DAG before invariants"); - println!("{:?}", Dot::with_config(&DAG, &[Config::EdgeNoLabel])); - } + // println!("DAG before invariants"); + // println!("{:?}", Dot::with_config(&DAG, &[Config::EdgeNoLabel])); + } // println!("DAG Lvl by lvl list"); // println!("{:?}", dag_level_list); @@ -174,109 +175,243 @@ pub fn canonize(m: &Molecule) -> u32 { extended_molg_atom_map.insert(atom_node, MolAtomNode::new(0, 0, atom_order_str)); } - // sort + // lexico-sort let mut ordered_vec: Vec<_> = order_str_set.into_iter().collect(); ordered_vec.string_sort_unstable(lexical_only_alnum_cmp); - // println!("{:?}", ordered_vec); - let mut order_idx:HashMap = HashMap::new(); for (idx, order_str) in ordered_vec.iter().enumerate() { order_idx.insert(order_str.clone(), idx as u32); } + // update the molecule graph invariant based on order idx of lexico-sort of (atom_type,#parents in DAG) for atom_node in mol_graph.node_indices() { extended_molg_atom_map.entry(atom_node).and_modify(|ext_molg_atom| ext_molg_atom.inv = *order_idx.get(&ext_molg_atom.order).unwrap()); } - // println!("Extended Atom Entry"); - // println!("{:?}", extended_molg_atom_map); + // get the canonized string for current root atom + let canon_string = canonize_signature(&mol_graph, &mut DAG, &mut extended_molg_atom_map, &dag_level_list, &mol_g_dag_vertex_map, max_level, 1, "".to_string()); - /* - 3. Generate Invariant for Atoms - */ + // lexico-compare strings to save the max one. + if lexical_cmp(&max_string, &canon_string).is_lt() { + max_string = canon_string + } + + // break; + } + return max_string; +} - loop { - // let start_invariants_atoms; - - let start_inv_atoms = mol_graph.node_indices() - .into_iter() - .map(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv.to_string()) - .collect::(); +fn canonize_signature( + mol_graph: &Graph, + mut DAG: &mut Graph::, + mut extended_molg_atom_map: &mut HashMap, + dag_level_list: &HashMap>, + mol_g_dag_vertex_map: &HashMap, + max_level: u32, + color_c: u32, + s_max: String, +) -> String { + // 1. get the invariants for each atom + invariant_atom(&mol_graph, &mut DAG, &mut extended_molg_atom_map, &dag_level_list, max_level); + + // 2. generate orbits based on atom's invariant values + let mut orbits: HashMap> = HashMap::new(); + + for atom in mol_graph.node_indices() { + let atom_inv = extended_molg_atom_map.get(&atom).unwrap().inv; + let parent_len = DAG[*mol_g_dag_vertex_map.get(&atom).unwrap()].parents.len() as u32; + // only add atoms which have 2 or more parents in DAG + if parent_len >= 2 { + orbits.entry(atom_inv).and_modify(|atom_list| atom_list.push(atom)).or_insert([atom].to_vec()); + } + } + + // 3. max length of any orbit + let mut max_orbit_len = 0; + orbits.values().for_each(|orbit| if orbit.len() > max_orbit_len {max_orbit_len = orbit.len()}); + + if max_orbit_len >= 2 { + // find the orbits with max len of atoms + let max_orbits = orbits.keys().filter(|orbit| orbits.get(&orbit).unwrap().len() == max_orbit_len).collect::>(); + // if multiple then use orbit with min value + let min_orbit = (if (&max_orbits.len()).clone() > 1 {max_orbits.iter().min()} else {max_orbits.first()}).unwrap(); + + let mut local_smax = String::new(); + // recurse further for each of the atom in such a orbit and generate a canonized signature by diff. the atoms in same orbit + for atom in orbits.get(&min_orbit).unwrap() { + extended_molg_atom_map.entry(*atom).and_modify(|atom_node| atom_node.color = color_c as u32); + local_smax = canonize_signature(&mol_graph, &mut DAG, &mut extended_molg_atom_map, &dag_level_list, &mol_g_dag_vertex_map, max_level, color_c+1, s_max.clone()); + extended_molg_atom_map.entry(*atom).and_modify(|atom_node| atom_node.color = 0); + } + return local_smax; + } + else { + // not need to recurse further and print the signature-string + for atom in mol_graph.node_indices() { + let atom_inv = extended_molg_atom_map.get(&atom).unwrap().inv; + let atom_color = extended_molg_atom_map.get(&atom).unwrap().color; + let parent_len = DAG[*mol_g_dag_vertex_map.get(&atom).unwrap()].parents.len() as u32; + // first update any atom without a color to be same as its invariant value + if (atom_color == 0) && (parent_len >= 2) { + extended_molg_atom_map.entry(atom).and_modify(|atom_node| atom_node.color = atom_inv); + } + } + // start from root node of the DAG + let root_node = DAG.node_indices().find(|vert| DAG.neighbors_directed(*vert, Incoming).count() == 0).unwrap(); + let local_smax = print_signature_string(root_node, &DAG, &mol_graph, &extended_molg_atom_map, &mut vec![]); + if local_smax.len() > s_max.len() { + return local_smax; + } + else { + return s_max; + } + } +} - println!("Begin Atom Invariants: {}", start_inv_atoms); +fn print_signature_string( + vertex: NodeIndex, + DAG: &Graph::, + mol_graph: &Graph, + extended_molg_atom_map: &HashMap, + mut edges: &mut Vec<(NodeIndex, NodeIndex)> +) -> String { + let mut print_sign = String::new(); + print_sign.push('['); + let atom_idx = DAG[vertex].atom_idx; + let atom = &mol_graph[DAG[vertex].atom_idx]; + print_sign.push_str(&atom.get_element()); + let atom_color = extended_molg_atom_map.get(&atom_idx).unwrap().color; + if atom_color != 0 { + print_sign.push(','); + print_sign.push_str(&atom_color.to_string()); + } + print_sign.push(']'); + + let dag_childs = &DAG.neighbors_directed(vertex, Outgoing); + if dag_childs.clone().count() == 0 { return print_sign; } + else { + print_sign.push('('); + + // sort children in descending order of inv + let mut child_vec = dag_childs.clone().collect::>(); + child_vec.sort_by(|vert_a, vert_b| DAG[*vert_b].inv.cmp(&DAG[*vert_a].inv)); + for child in child_vec { + if let Some(_edge) = edges.iter().find(|egde| (egde.0 == vertex) && (egde.1 == child)) {} + else { + // if the edge is not already seen then add it to seen and generate signature-string for the child + edges.push((vertex, child)); + print_sign.push_str(&print_signature_string(child, &DAG, &mol_graph, &extended_molg_atom_map, edges)); + } + } + print_sign.push(')'); + return print_sign; + } +} - /* - 3.1 Generate Invariants for DAG vertex - */ - invariant_dag(&mut DAG, &extended_molg_atom_map, &dag_level_list, max_level, true); +/* + 3. Generate Invariant for Atoms + */ +fn invariant_atom( + mol_graph: &Graph, + mut DAG: &mut Graph::, + extended_molg_atom_map: &mut HashMap, + dag_level_list: &HashMap>, + max_level: u32, +) { + let mut count = 0; + loop { + let start_inv_atoms = mol_graph.node_indices() + .into_iter() + .map(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv.to_string()) + .collect::(); - // println!("DAG after bottom invariants"); - // println!("{:?}", Dot::with_config(&DAG, &[Config::EdgeNoLabel])); + println!("Begin Atom Invariants: {}", start_inv_atoms); - invariant_dag(&mut DAG, &extended_molg_atom_map, &dag_level_list, max_level, false); + /* + 3.1 Generate Invariants for DAG vertex + */ - // println!("DAG after top invariants"); - // println!("{:?}", Dot::with_config(&DAG, &[Config::EdgeNoLabel])); + // first bottom-up + invariant_dag_vert(&mut DAG, &extended_molg_atom_map, &dag_level_list, max_level, true); - let mut order_map_vert_atom: HashMap> = HashMap::new(); - for atom in mol_graph.node_indices() { - order_map_vert_atom.insert(atom, vec![0;(max_level+1).try_into().unwrap()]); - } + // println!("DAG after bottom invariants"); + // println!("{:?}", Dot::with_config(&DAG, &[Config::EdgeNoLabel])); - for vert in DAG.node_indices() { - order_map_vert_atom.entry(DAG[vert].atom_idx).and_modify(|order_vec| order_vec[DAG[vert].level as usize] = DAG[vert].inv); - } + // then top-down + invariant_dag_vert(&mut DAG, &extended_molg_atom_map, &dag_level_list, max_level, false); - let mut order_to_atom: HashMap> = HashMap::new(); + // println!("DAG after top invariants"); + // println!("{:?}", Dot::with_config(&DAG, &[Config::EdgeNoLabel])); - for atom in mol_graph.node_indices() { - let order_str = order_map_vert_atom.get(&atom).unwrap().into_iter().map(|i| i.to_string()).collect::(); - order_to_atom.entry(order_str).and_modify(|atom_list| atom_list.push(atom)).or_insert([atom].to_vec()); - } + // Create a vector for each atom in molecule graph based on associated vertex in + let mut order_map_vert_atom: HashMap> = HashMap::new(); + for atom in mol_graph.node_indices() { + order_map_vert_atom.insert(atom, vec![0;(max_level+1).try_into().unwrap()]); + } - // sort in descending order - let mut atom_ordered_vec: Vec<_> = order_to_atom.keys().into_iter().collect(); - atom_ordered_vec.string_sort_unstable(lexical_only_alnum_cmp); - atom_ordered_vec.reverse(); + for vert in DAG.node_indices() { + order_map_vert_atom.entry(DAG[vert].atom_idx).and_modify(|order_vec| order_vec[DAG[vert].level as usize] = DAG[vert].inv); + } - for (idx, order) in atom_ordered_vec.iter().enumerate() { - for atom in order_to_atom.get(*order).unwrap() { - extended_molg_atom_map.entry(*atom).and_modify(|atom_node| atom_node.inv = idx as u32); - } + let mut order_to_atom: HashMap> = HashMap::new(); + + // turn vectors into strings for sorting + for atom in mol_graph.node_indices() { + let order_str = order_map_vert_atom.get(&atom).unwrap().into_iter().map(|i| i.to_string()).collect::(); + order_to_atom.entry(order_str).and_modify(|atom_list| atom_list.push(atom)).or_insert([atom].to_vec()); + } + + // lexico-sort the vectors-strings in descending order + let mut atom_ordered_vec: Vec<_> = order_to_atom.keys().into_iter().collect(); + atom_ordered_vec.string_sort_unstable(lexical_only_alnum_cmp); + atom_ordered_vec.reverse(); + + // assign the invariant of atom as the order of vectors-strings + for (idx, order) in atom_ordered_vec.iter().enumerate() { + for atom in order_to_atom.get(*order).unwrap() { + extended_molg_atom_map.entry(*atom).and_modify(|atom_node| atom_node.inv = idx as u32); } + } - let end_inv_atoms = mol_graph.node_indices() - .into_iter() - .map(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv.to_string()) - .collect::(); + let end_inv_atoms = mol_graph.node_indices() + .into_iter() + .map(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv.to_string()) + .collect::(); - println!("End Atom Invariants: {}", end_inv_atoms); + println!("End Atom Invariants: {}", end_inv_atoms); - if start_inv_atoms == end_inv_atoms {break} + // compare invariants of all the atoms with the one's they started from + if start_inv_atoms == end_inv_atoms {break;} + if count > 100 { + println!("Breaking because the invariants never settle!"); + break; } - break; + count +=1; } - 0 } -fn invariant_dag( +/* + 3. Generate Invariant for Vertices + */ +fn invariant_dag_vert( DAG: &mut Graph::, extended_molg_atom_map: &HashMap, dag_level_list: &HashMap>, max_level: u32, bottom: bool) { - // let mut vert_order_map:HashMap = HashMap::new(); + // top-down or bottom-up calculation of invariants for each vertex in DAG let mut curr_lvl_range = if bottom {max_level} else {0}; loop { + // for each vertex generate a invariant-string based on assoc. atom color and atom invariant + directed neighbors let mut order_str_set: HashSet = HashSet::new(); for vert in dag_level_list.get(&curr_lvl_range).unwrap() { let atom_idx_for_vert = DAG[*vert].atom_idx; let atom_node = extended_molg_atom_map.get(&atom_idx_for_vert).unwrap(); let (atom_color, atom_inv) = (atom_node.color, atom_node.inv); let mut vert_order = format!("{}{}", atom_color, atom_inv); - // let vert_neighbor_list + if bottom { for vert_neigh in DAG.neighbors_directed(*vert, Outgoing) { vert_order.push_str(&format!("{}", DAG[vert_neigh].inv)); @@ -292,18 +427,18 @@ fn invariant_dag( order_str_set.insert(vert_order); } + // lexico-sort the invariant-strings in descending order let mut ordered_vec: Vec<_> = order_str_set.into_iter().collect(); ordered_vec.string_sort_unstable(lexical_only_alnum_cmp); ordered_vec.reverse(); - println!("{:?}", ordered_vec); - let mut order_idx:HashMap = HashMap::new(); for (idx, order_str) in ordered_vec.iter().enumerate() { order_idx.insert(order_str.clone(), idx as u32); } - + + // assign the invariant of vertex as the order of invariant-strings for vert in dag_level_list.get(&curr_lvl_range).unwrap() { DAG[*vert].inv = (*order_idx.get(&DAG[*vert].order).unwrap())+1; } diff --git a/src/main.rs b/src/main.rs index a7227ab6..fab99e20 100644 --- a/src/main.rs +++ b/src/main.rs @@ -26,7 +26,7 @@ fn main() -> std::io::Result<()> { if let Some(path) = cli.path { let molecule = loader::parse(&path)?; - canonize::canonize(&molecule); + println!("Final Molecule Signature: {}", canonize::canonize(&molecule)); let index = assembly::index(&molecule); println!("{}", index); } From 5072b1629af8487deea63930add7ec1995136718 Mon Sep 17 00:00:00 2001 From: Devendra Parkar Date: Thu, 19 Dec 2024 17:37:59 -0700 Subject: [PATCH 04/13] added better stopping cond. for atom invariants --- src/canonize.rs | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/src/canonize.rs b/src/canonize.rs index e5c8cd30..4a0f7a22 100644 --- a/src/canonize.rs +++ b/src/canonize.rs @@ -197,8 +197,6 @@ pub fn canonize(m: &Molecule) -> String { if lexical_cmp(&max_string, &canon_string).is_lt() { max_string = canon_string } - - // break; } return max_string; } @@ -289,14 +287,14 @@ fn print_signature_string( } print_sign.push(']'); - let dag_childs = &DAG.neighbors_directed(vertex, Outgoing); - if dag_childs.clone().count() == 0 { return print_sign; } + let mut child_vec = DAG.neighbors_directed(vertex, Outgoing).collect::>(); + if child_vec.len() == 0 { return print_sign; } else { - print_sign.push('('); - // sort children in descending order of inv - let mut child_vec = dag_childs.clone().collect::>(); + // let child_vec = dag_childs; child_vec.sort_by(|vert_a, vert_b| DAG[*vert_b].inv.cmp(&DAG[*vert_a].inv)); + + print_sign.push('('); for child in child_vec { if let Some(_edge) = edges.iter().find(|egde| (egde.0 == vertex) && (egde.1 == child)) {} else { @@ -322,12 +320,10 @@ fn invariant_atom( ) { let mut count = 0; loop { - let start_inv_atoms = mol_graph.node_indices() - .into_iter() - .map(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv.to_string()) - .collect::(); - - println!("Begin Atom Invariants: {}", start_inv_atoms); + let start_inv_atoms = HashSet::::from_iter(mol_graph.node_indices() + .into_iter() + .map(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv)).len(); + // println!("Begin Atom Invariants: {}", start_inv_atoms); /* 3.1 Generate Invariants for DAG vertex @@ -375,17 +371,20 @@ fn invariant_atom( } } - let end_inv_atoms = mol_graph.node_indices() - .into_iter() - .map(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv.to_string()) - .collect::(); - println!("End Atom Invariants: {}", end_inv_atoms); + let end_inv_atoms = HashSet::::from_iter(mol_graph.node_indices() + .into_iter() + .map(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv)).len(); + - // compare invariants of all the atoms with the one's they started from + // println!("End Atom Invariants: {}", end_inv_atoms); + + // compare the no. of invariants of all the atoms with the one's they started from if start_inv_atoms == end_inv_atoms {break;} - if count > 100 { - println!("Breaking because the invariants never settle!"); + + // Naive way of stopping + if count > mol_graph.node_count() { + println!("breaking out because reached upper limit!"); break; } count +=1; From af5534faaab688fded2319fd644acd36c9017f89 Mon Sep 17 00:00:00 2001 From: Devendra Parkar Date: Fri, 20 Dec 2024 11:10:15 -0700 Subject: [PATCH 05/13] lock file for lexical-sort --- Cargo.lock | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index ab7c3afb..dfa454e9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 [[package]] name = "anstream" @@ -51,6 +51,12 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "any_ascii" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70033777eb8b5124a81a1889416543dddef2de240019b674c81285a2635a7e1e" + [[package]] name = "clap" version = "4.5.23" @@ -108,6 +114,7 @@ name = "fastassembly" version = "0.1.0" dependencies = [ "clap", + "lexical-sort", "petgraph", ] @@ -145,6 +152,15 @@ version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +[[package]] +name = "lexical-sort" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c09e4591611e231daf4d4c685a66cb0410cc1e502027a20ae55f2bb9e997207a" +dependencies = [ + "any_ascii", +] + [[package]] name = "petgraph" version = "0.6.5" From f96676364b48a781b0faec8945861a6418a38056 Mon Sep 17 00:00:00 2001 From: Devendra Parkar Date: Sat, 21 Dec 2024 14:40:10 -0700 Subject: [PATCH 06/13] fixed a few sorting mismatch in invariant calculation --- src/canonize.rs | 100 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 79 insertions(+), 21 deletions(-) diff --git a/src/canonize.rs b/src/canonize.rs index 4a0f7a22..98590dae 100644 --- a/src/canonize.rs +++ b/src/canonize.rs @@ -1,7 +1,7 @@ use std::collections::{btree_set::Range, HashMap, HashSet, VecDeque}; -use lexical_sort::{lexical_cmp, lexical_only_alnum_cmp, StringSort}; +use lexical_sort::{cmp, lexical_cmp, lexical_only_alnum_cmp, natural_cmp, StringSort}; -use crate::molecule::{Atom, Bond, Molecule}; +use crate::molecule::{Atom, Bond, Molecule, Element}; use petgraph::{adj::Neighbors, dot::{Config, Dot}, graph::NodeIndex, visit::DfsPostOrder, Directed, Direction::{self, Incoming, Outgoing}, Graph, Undirected}; #[derive(Debug, Clone)] @@ -40,9 +40,52 @@ impl MolAtomNode { // Compute the assembly index of a molecule pub fn canonize(m: &Molecule) -> String { - let mol_graph = m.get_graph().clone(); + let mol_graph = m.get_graph(); + + /* + // Dummy Molecule for testing + + let mut mol_graph: Graph = Graph::::new_undirected(); + + let mut vec_nodes: Vec = Vec::new(); + vec_nodes.push(NodeIndex::new(99999)); + for i in 0..16 { + vec_nodes.push(mol_graph.add_node(Atom::new(Element::Carbon))); + } + mol_graph.add_edge(vec_nodes[1],vec_nodes[2],Bond::Single); + mol_graph.add_edge(vec_nodes[2],vec_nodes[3],Bond::Single); + mol_graph.add_edge(vec_nodes[3],vec_nodes[4],Bond::Single); + mol_graph.add_edge(vec_nodes[4],vec_nodes[1],Bond::Single); + + mol_graph.add_edge(vec_nodes[1],vec_nodes[5],Bond::Single); + mol_graph.add_edge(vec_nodes[2],vec_nodes[7],Bond::Single); + mol_graph.add_edge(vec_nodes[3],vec_nodes[9],Bond::Single); + mol_graph.add_edge(vec_nodes[4],vec_nodes[11],Bond::Single); + + mol_graph.add_edge(vec_nodes[13],vec_nodes[14],Bond::Single); + mol_graph.add_edge(vec_nodes[14],vec_nodes[15],Bond::Single); + mol_graph.add_edge(vec_nodes[15],vec_nodes[16],Bond::Single); + mol_graph.add_edge(vec_nodes[16],vec_nodes[13],Bond::Single); + + mol_graph.add_edge(vec_nodes[13],vec_nodes[6],Bond::Single); + mol_graph.add_edge(vec_nodes[14],vec_nodes[8],Bond::Single); + mol_graph.add_edge(vec_nodes[15],vec_nodes[10],Bond::Single); + mol_graph.add_edge(vec_nodes[16],vec_nodes[12],Bond::Single); + + mol_graph.add_edge(vec_nodes[12],vec_nodes[5],Bond::Single); + mol_graph.add_edge(vec_nodes[6],vec_nodes[7],Bond::Single); + mol_graph.add_edge(vec_nodes[8],vec_nodes[9],Bond::Single); + mol_graph.add_edge(vec_nodes[10],vec_nodes[11],Bond::Single); + + mol_graph.add_edge(vec_nodes[12],vec_nodes[11],Bond::Single); + mol_graph.add_edge(vec_nodes[6],vec_nodes[5],Bond::Single); + mol_graph.add_edge(vec_nodes[8],vec_nodes[7],Bond::Single); + mol_graph.add_edge(vec_nodes[10],vec_nodes[9],Bond::Single); + */ + let mut max_string = String::new(); for root in mol_graph.node_indices() { + // let root = vec_nodes[1]; // for each node in the molecule graph create a signature /* 1. create a DAG from each start node @@ -182,7 +225,7 @@ pub fn canonize(m: &Molecule) -> String { let mut order_idx:HashMap = HashMap::new(); for (idx, order_str) in ordered_vec.iter().enumerate() { - order_idx.insert(order_str.clone(), idx as u32); + order_idx.insert(order_str.clone(), (idx as u32)+1); } // update the molecule graph invariant based on order idx of lexico-sort of (atom_type,#parents in DAG) @@ -320,9 +363,14 @@ fn invariant_atom( ) { let mut count = 0; loop { - let start_inv_atoms = HashSet::::from_iter(mol_graph.node_indices() + // let start_inv_atoms = HashSet::::from_iter(mol_graph.node_indices() + // .into_iter() + // .map(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv)).len(); + + // Better stopping condition ?? + let start_inv_atoms = mol_graph.node_indices() .into_iter() - .map(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv)).len(); + .filter(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv == 0).count(); // println!("Begin Atom Invariants: {}", start_inv_atoms); /* @@ -333,13 +381,13 @@ fn invariant_atom( invariant_dag_vert(&mut DAG, &extended_molg_atom_map, &dag_level_list, max_level, true); // println!("DAG after bottom invariants"); - // println!("{:?}", Dot::with_config(&DAG, &[Config::EdgeNoLabel])); + // println!("{:?}", Dot::with_config(&*DAG, &[Config::EdgeNoLabel])); // then top-down invariant_dag_vert(&mut DAG, &extended_molg_atom_map, &dag_level_list, max_level, false); // println!("DAG after top invariants"); - // println!("{:?}", Dot::with_config(&DAG, &[Config::EdgeNoLabel])); + // println!("{:?}", Dot::with_config(&*DAG, &[Config::EdgeNoLabel])); // Create a vector for each atom in molecule graph based on associated vertex in let mut order_map_vert_atom: HashMap> = HashMap::new(); @@ -359,22 +407,26 @@ fn invariant_atom( order_to_atom.entry(order_str).and_modify(|atom_list| atom_list.push(atom)).or_insert([atom].to_vec()); } - // lexico-sort the vectors-strings in descending order + // lexico-sort the vectors-strings let mut atom_ordered_vec: Vec<_> = order_to_atom.keys().into_iter().collect(); atom_ordered_vec.string_sort_unstable(lexical_only_alnum_cmp); - atom_ordered_vec.reverse(); // assign the invariant of atom as the order of vectors-strings for (idx, order) in atom_ordered_vec.iter().enumerate() { for atom in order_to_atom.get(*order).unwrap() { - extended_molg_atom_map.entry(*atom).and_modify(|atom_node| atom_node.inv = idx as u32); + extended_molg_atom_map.entry(*atom).and_modify(|atom_node| atom_node.inv = (idx as u32)+1); } } - let end_inv_atoms = HashSet::::from_iter(mol_graph.node_indices() + // let end_inv_atoms = HashSet::::from_iter(mol_graph.node_indices() + // .into_iter() + // .map(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv)).len(); + + // Better stopping condition ?? + let end_inv_atoms = mol_graph.node_indices() .into_iter() - .map(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv)).len(); + .filter(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv == 0).count(); // println!("End Atom Invariants: {}", end_inv_atoms); @@ -410,25 +462,31 @@ fn invariant_dag_vert( let atom_node = extended_molg_atom_map.get(&atom_idx_for_vert).unwrap(); let (atom_color, atom_inv) = (atom_node.color, atom_node.inv); let mut vert_order = format!("{}{}", atom_color, atom_inv); - + let mut child_inv_set: Vec = Vec::new(); + if bottom { for vert_neigh in DAG.neighbors_directed(*vert, Outgoing) { - vert_order.push_str(&format!("{}", DAG[vert_neigh].inv)); + child_inv_set.push(DAG[vert_neigh].inv); } } else { for vert_neigh in DAG.neighbors_directed(*vert, Incoming) { - vert_order.push_str(&format!("{}", DAG[vert_neigh].inv)); + child_inv_set.push(DAG[vert_neigh].inv); } } - - DAG[*vert].order = vert_order.clone(); - order_str_set.insert(vert_order); + + child_inv_set.sort(); + child_inv_set.reverse(); + child_inv_set.iter().for_each(|val| vert_order.push_str(&format!("{}", *val))); + + let vec_string = format!("{:0>20}", vert_order); + DAG[*vert].order = vec_string.clone(); + order_str_set.insert(vec_string); } // lexico-sort the invariant-strings in descending order - let mut ordered_vec: Vec<_> = order_str_set.into_iter().collect(); - ordered_vec.string_sort_unstable(lexical_only_alnum_cmp); + let mut ordered_vec: Vec = order_str_set.into_iter().collect(); + ordered_vec.string_sort_unstable(natural_cmp); ordered_vec.reverse(); let mut order_idx:HashMap = HashMap::new(); From 19b7ef34304b778d4c1aee8f0891980093326c93 Mon Sep 17 00:00:00 2001 From: Devendra Parkar Date: Tue, 17 Dec 2024 19:41:01 -0700 Subject: [PATCH 07/13] DAG data-structure for canonization code --- src/canonize.rs | 102 ++++++++++++++++++++++++++++++++++++++++++++++++ src/molecule.rs | 4 ++ 2 files changed, 106 insertions(+) create mode 100644 src/canonize.rs diff --git a/src/canonize.rs b/src/canonize.rs new file mode 100644 index 00000000..6c33cb8c --- /dev/null +++ b/src/canonize.rs @@ -0,0 +1,102 @@ +use std::collections::{HashMap, VecDeque}; + +use crate::molecule::Molecule; +use petgraph::{dot::{Config, Dot}, graph::NodeIndex, visit::DfsPostOrder, Directed, Graph}; + +// Compute the assembly index of a molecule +pub fn canonize(m: &Molecule) -> u32 { + let mol_graph = m.get_graph(); + for root in mol_graph.node_indices() { + // for each node in the molecule graph create a signature + + /* + 1. create a DAG from each start node + */ + let mut DAG = Graph::::new(); + + let mut seen_edges_cache: HashMap<(NodeIndex,NodeIndex), u32> = HashMap::new(); + + let mut DAG_vertex_map: HashMap = HashMap::new(); + + let mut visited: VecDeque<(NodeIndex,u32)> = VecDeque::new(); + visited.push_back((root, 0)); + + seen_edges_cache.insert((root, root), 0); + DAG_vertex_map.insert(format!("{:?}-{}", root, 0), DAG.add_node(format!("{:?}-{}", root, 0))); + + + loop { + + let (curr, level) = visited.pop_front().unwrap(); + + println!("Current:{:?}", curr); + + for neigh in mol_graph.neighbors(curr) { + println!("Neighbor:{:?}", neigh); + + let mut add_node_to_dag = false; + + //check if curr -> neigh or neigh -> curr already exists + match seen_edges_cache.get(&(curr, neigh)) { + Some(seen_at_level) => { + // edge already exists at a level above + println!("Already Seen: {:?} <--> {:?} at level {}", curr, neigh, *seen_at_level); + println!("Current Level: {}", level); + if *seen_at_level < (level+1) { + continue; + } + else { + //edge at the same level + add_node_to_dag = true; + } + }, + None => { + //No edge found + add_node_to_dag = true; + } + } + + if add_node_to_dag { + + //check if a atom has already been processed during this current level's processing ? + match DAG_vertex_map.get(&format!("{:?}-{}", neigh, (level+1))) { + Some(present_node_idx) => { + seen_edges_cache.insert((curr, neigh), level+1); + seen_edges_cache.insert((neigh, curr), level+1); + //get parent node's NodeIndex + match DAG_vertex_map.get(&format!("{:?}-{}", curr, level)) { + Some(parent_node_idx) => { + DAG.add_edge(*parent_node_idx, *present_node_idx, ""); + } + None => {} + } + //skip rest of the processing for the atom + continue; + } + None => {} + } + + seen_edges_cache.insert((curr, neigh), level+1); + seen_edges_cache.insert((neigh, curr), level+1); + let child_node_idx = DAG.add_node(format!("{:?}-{}", neigh, level+1)); + DAG_vertex_map.insert(format!("{:?}-{}", neigh, level+1), child_node_idx); + visited.push_back((neigh, level+1)); + //get parent node's NodeIndex + match DAG_vertex_map.get(&format!("{:?}-{}", curr, level)) { + Some(parent_node_idx) => { + DAG.add_edge(*parent_node_idx, child_node_idx, ""); + } + None => {} + } + } + } + + if visited.len() == 0 { + break; + } + } + println!("{:?}", Dot::with_config(&DAG, &[Config::EdgeNoLabel])); + break; + } + 0 +} diff --git a/src/molecule.rs b/src/molecule.rs index 4370dd1a..0230a9ec 100644 --- a/src/molecule.rs +++ b/src/molecule.rs @@ -226,6 +226,10 @@ impl Molecule { Self { graph: g } } + pub fn get_graph(&self) -> &MGraph { + &self.graph + } + pub fn single_bond() -> Self { let mut g = Graph::default(); let u = g.add_node(Atom { From 0b133b15b21feb637efd6824df216328a91d8845 Mon Sep 17 00:00:00 2001 From: Devendra Parkar Date: Mon, 3 Feb 2025 12:01:08 -0700 Subject: [PATCH 08/13] canonization based implementation --- Cargo.lock | 18 +++- Cargo.toml | 1 + src/canonize.rs | 242 +++++++++++++++++++++++++++++++++++++++++++++--- src/molecule.rs | 8 ++ 4 files changed, 256 insertions(+), 13 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 98afd53e..865465a5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 [[package]] name = "aho-corasick" @@ -61,6 +61,12 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "any_ascii" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70033777eb8b5124a81a1889416543dddef2de240019b674c81285a2635a7e1e" + [[package]] name = "atty" version = "0.2.14" @@ -276,6 +282,7 @@ dependencies = [ "clap 4.5.27", "criterion", "csv", + "lexical-sort", "petgraph", ] @@ -359,6 +366,15 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +[[package]] +name = "lexical-sort" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c09e4591611e231daf4d4c685a66cb0410cc1e502027a20ae55f2bb9e997207a" +dependencies = [ + "any_ascii", +] + [[package]] name = "libc" version = "0.2.169" diff --git a/Cargo.toml b/Cargo.toml index 21d7c3ee..3bc85b82 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,6 +8,7 @@ bit-set = "0.8.0" clap = { version = "4.5.23", features = ["derive"] } csv = "1.3.1" petgraph = "0.6.5" +lexical-sort = "0.3.1" [dev-dependencies] criterion = "0.3" diff --git a/src/canonize.rs b/src/canonize.rs index 6c33cb8c..1fc3ca70 100644 --- a/src/canonize.rs +++ b/src/canonize.rs @@ -1,29 +1,74 @@ -use std::collections::{HashMap, VecDeque}; +use std::collections::{btree_set::Range, HashMap, HashSet, VecDeque}; +use lexical_sort::{StringSort, lexical_only_alnum_cmp}; use crate::molecule::Molecule; -use petgraph::{dot::{Config, Dot}, graph::NodeIndex, visit::DfsPostOrder, Directed, Graph}; +use petgraph::{adj::Neighbors, dot::{Config, Dot}, graph::NodeIndex, visit::DfsPostOrder, Directed, Direction::{self, Incoming, Outgoing}, Graph}; + +#[derive(Debug, Clone)] +struct DAGVert { + atom_idx: NodeIndex, + inv: u32, + order: String, + parents: Vec, + level: u32 +} + +impl DAGVert { + pub fn new(atom_idx: NodeIndex, parents: Vec, level: u32) -> Self { + DAGVert { + atom_idx, + inv: 0, + parents, + level, + order: String::new() + } + } +} + +#[derive(Debug, Clone)] +struct MolAtomNode { + color: u32, + inv: u32, + order: String +} + +impl MolAtomNode { + pub fn new(color: u32, inv: u32, order: String) -> Self { + MolAtomNode {color, inv, order} + } +} // Compute the assembly index of a molecule pub fn canonize(m: &Molecule) -> u32 { - let mol_graph = m.get_graph(); + let mol_graph = m.get_graph().clone(); for root in mol_graph.node_indices() { // for each node in the molecule graph create a signature - /* 1. create a DAG from each start node */ - let mut DAG = Graph::::new(); - - let mut seen_edges_cache: HashMap<(NodeIndex,NodeIndex), u32> = HashMap::new(); + let mut DAG = Graph::::new(); let mut DAG_vertex_map: HashMap = HashMap::new(); + let mut mol_g_dag_vertex_map: HashMap = HashMap::new(); + + let mut dag_level_list: HashMap> = HashMap::new(); + + let mut max_level: u32 = 0; + + { + let mut seen_edges_cache: HashMap<(NodeIndex,NodeIndex), u32> = HashMap::new(); + let mut visited: VecDeque<(NodeIndex,u32)> = VecDeque::new(); + visited.push_back((root, 0)); seen_edges_cache.insert((root, root), 0); - DAG_vertex_map.insert(format!("{:?}-{}", root, 0), DAG.add_node(format!("{:?}-{}", root, 0))); - + // DAG_vertex_map.insert(format!("{:?}-{}", root, 0), DAG.add_node(format!("{:?}-{}", root, 0))); + let root_vertex_id = DAG.add_node(DAGVert::new(root, [].to_vec(), 0)); + DAG_vertex_map.insert(format!("{:?}-{}", root, 0), root_vertex_id); + dag_level_list.insert(0, [root_vertex_id].to_vec()); + mol_g_dag_vertex_map.insert(root, root_vertex_id); loop { @@ -67,6 +112,8 @@ pub fn canonize(m: &Molecule) -> u32 { match DAG_vertex_map.get(&format!("{:?}-{}", curr, level)) { Some(parent_node_idx) => { DAG.add_edge(*parent_node_idx, *present_node_idx, ""); + // add as parent in the DAGvert + (&mut DAG[*present_node_idx]).parents.push(*parent_node_idx); } None => {} } @@ -75,16 +122,26 @@ pub fn canonize(m: &Molecule) -> u32 { } None => {} } - + + // haven't seen the atom before so add it to DAG + max_level = level + 1; seen_edges_cache.insert((curr, neigh), level+1); seen_edges_cache.insert((neigh, curr), level+1); - let child_node_idx = DAG.add_node(format!("{:?}-{}", neigh, level+1)); + // let child_node_idx = DAG.add_node(format!("{:?}-{}", neigh, level+1)); + let child_node_idx = DAG.add_node(DAGVert::new(neigh, [].to_vec(), level+1)); DAG_vertex_map.insert(format!("{:?}-{}", neigh, level+1), child_node_idx); + mol_g_dag_vertex_map.insert(neigh, child_node_idx); + + // Insert into a level by level hashmap of dag nodes + dag_level_list.entry((level+1)).and_modify(|level_vert_list| level_vert_list.push(child_node_idx)).or_insert([child_node_idx].to_vec()); + visited.push_back((neigh, level+1)); //get parent node's NodeIndex match DAG_vertex_map.get(&format!("{:?}-{}", curr, level)) { Some(parent_node_idx) => { DAG.add_edge(*parent_node_idx, child_node_idx, ""); + // add as parent in the DAGvert + (&mut DAG[child_node_idx]).parents.push(*parent_node_idx); } None => {} } @@ -95,8 +152,169 @@ pub fn canonize(m: &Molecule) -> u32 { break; } } + println!("DAG before invariants"); println!("{:?}", Dot::with_config(&DAG, &[Config::EdgeNoLabel])); - break; + } + + // println!("DAG Lvl by lvl list"); + // println!("{:?}", dag_level_list); + // println!("Max lvl: {}",max_level); + /* + 2.1. Initialize the molecule graph with color = 0 and invariant no. for each atom from (atom_type,#parents in DAG) + 2.2. Do lexicographical ordering of the (atom_type, #parents in DAG) + */ + let mut extended_molg_atom_map: HashMap = HashMap::new(); + let mut order_str_set: HashSet = HashSet::new(); + for atom_node in mol_graph.node_indices() { + let dag_vert = mol_g_dag_vertex_map.get(&atom_node).unwrap(); + let parent_len = DAG[*dag_vert].parents.len(); + let atom_str = mol_graph[atom_node].get_element(); + let atom_order_str = format!("{}{}", atom_str, parent_len); + order_str_set.insert(atom_order_str.clone()); + extended_molg_atom_map.insert(atom_node, MolAtomNode::new(0, 0, atom_order_str)); + } + + // sort + let mut ordered_vec: Vec<_> = order_str_set.into_iter().collect(); + ordered_vec.string_sort_unstable(lexical_only_alnum_cmp); + + // println!("{:?}", ordered_vec); + + let mut order_idx:HashMap = HashMap::new(); + + for (idx, order_str) in ordered_vec.iter().enumerate() { + order_idx.insert(order_str.clone(), idx as u32); + } + + for atom_node in mol_graph.node_indices() { + extended_molg_atom_map.entry(atom_node).and_modify(|ext_molg_atom| ext_molg_atom.inv = *order_idx.get(&ext_molg_atom.order).unwrap()); + } + + // println!("Extended Atom Entry"); + // println!("{:?}", extended_molg_atom_map); + + /* + 3. Generate Invariant for Atoms + */ + + loop { + // let start_invariants_atoms; + + let start_inv_atoms = mol_graph.node_indices() + .into_iter() + .map(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv.to_string()) + .collect::(); + + println!("Begin Atom Invariants: {}", start_inv_atoms); + + /* + 3.1 Generate Invariants for DAG vertex + */ + invariant_dag(&mut DAG, &extended_molg_atom_map, &dag_level_list, max_level, true); + + // println!("DAG after bottom invariants"); + // println!("{:?}", Dot::with_config(&DAG, &[Config::EdgeNoLabel])); + + invariant_dag(&mut DAG, &extended_molg_atom_map, &dag_level_list, max_level, false); + + // println!("DAG after top invariants"); + // println!("{:?}", Dot::with_config(&DAG, &[Config::EdgeNoLabel])); + + let mut order_map_vert_atom: HashMap> = HashMap::new(); + for atom in mol_graph.node_indices() { + order_map_vert_atom.insert(atom, vec![0;(max_level+1).try_into().unwrap()]); + } + + for vert in DAG.node_indices() { + order_map_vert_atom.entry(DAG[vert].atom_idx).and_modify(|order_vec| order_vec[DAG[vert].level as usize] = DAG[vert].inv); + } + + let mut order_to_atom: HashMap> = HashMap::new(); + + for atom in mol_graph.node_indices() { + let order_str = order_map_vert_atom.get(&atom).unwrap().into_iter().map(|i| i.to_string()).collect::(); + order_to_atom.entry(order_str).and_modify(|atom_list| atom_list.push(atom)).or_insert([atom].to_vec()); + } + + // sort in descending order + let mut atom_ordered_vec: Vec<_> = order_to_atom.keys().into_iter().collect(); + atom_ordered_vec.string_sort_unstable(lexical_only_alnum_cmp); + atom_ordered_vec.reverse(); + + for (idx, order) in atom_ordered_vec.iter().enumerate() { + for atom in order_to_atom.get(*order).unwrap() { + extended_molg_atom_map.entry(*atom).and_modify(|atom_node| atom_node.inv = idx as u32); + } + } + + let end_inv_atoms = mol_graph.node_indices() + .into_iter() + .map(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv.to_string()) + .collect::(); + + println!("End Atom Invariants: {}", end_inv_atoms); + + if start_inv_atoms == end_inv_atoms {break} + } + break; } 0 } + +fn invariant_dag( + DAG: &mut Graph::, + extended_molg_atom_map: &HashMap, + dag_level_list: &HashMap>, + max_level: u32, + bottom: bool) { + // let mut vert_order_map:HashMap = HashMap::new(); + let mut curr_lvl_range = if bottom {max_level} else {0}; + loop { + let mut order_str_set: HashSet = HashSet::new(); + for vert in dag_level_list.get(&curr_lvl_range).unwrap() { + let atom_idx_for_vert = DAG[*vert].atom_idx; + let atom_node = extended_molg_atom_map.get(&atom_idx_for_vert).unwrap(); + let (atom_color, atom_inv) = (atom_node.color, atom_node.inv); + let mut vert_order = format!("{}{}", atom_color, atom_inv); + // let vert_neighbor_list + if bottom { + for vert_neigh in DAG.neighbors_directed(*vert, Outgoing) { + vert_order.push_str(&format!("{}", DAG[vert_neigh].inv)); + } + } + else { + for vert_neigh in DAG.neighbors_directed(*vert, Incoming) { + vert_order.push_str(&format!("{}", DAG[vert_neigh].inv)); + } + } + + DAG[*vert].order = vert_order.clone(); + order_str_set.insert(vert_order); + } + + let mut ordered_vec: Vec<_> = order_str_set.into_iter().collect(); + ordered_vec.string_sort_unstable(lexical_only_alnum_cmp); + ordered_vec.reverse(); + + println!("{:?}", ordered_vec); + + let mut order_idx:HashMap = HashMap::new(); + + for (idx, order_str) in ordered_vec.iter().enumerate() { + order_idx.insert(order_str.clone(), idx as u32); + } + + for vert in dag_level_list.get(&curr_lvl_range).unwrap() { + DAG[*vert].inv = (*order_idx.get(&DAG[*vert].order).unwrap())+1; + } + + if bottom { + if curr_lvl_range == 0 {break}; + curr_lvl_range -= 1; + } + else { + if curr_lvl_range == max_level {break}; + curr_lvl_range += 1; + } + } +} diff --git a/src/molecule.rs b/src/molecule.rs index 0230a9ec..92d869a4 100644 --- a/src/molecule.rs +++ b/src/molecule.rs @@ -47,6 +47,14 @@ impl Atom { capacity: 0, } } + pub fn get_element(&self) -> String { + (match self.element { + Element::Oxygen => "O", + Element::Nitrogen => "N", + Element::Carbon => "C", + Element::Hydrogen => "H", + }).to_string() + } } impl Molecule { From 9a45d36e8dea5c32803e89e2b86e3fba959d6de8 Mon Sep 17 00:00:00 2001 From: Devendra Parkar Date: Thu, 19 Dec 2024 15:32:32 -0700 Subject: [PATCH 09/13] Added the signature generation code for canonization --- src/canonize.rs | 275 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 205 insertions(+), 70 deletions(-) diff --git a/src/canonize.rs b/src/canonize.rs index 1fc3ca70..e5c8cd30 100644 --- a/src/canonize.rs +++ b/src/canonize.rs @@ -1,8 +1,8 @@ use std::collections::{btree_set::Range, HashMap, HashSet, VecDeque}; -use lexical_sort::{StringSort, lexical_only_alnum_cmp}; +use lexical_sort::{lexical_cmp, lexical_only_alnum_cmp, StringSort}; -use crate::molecule::Molecule; -use petgraph::{adj::Neighbors, dot::{Config, Dot}, graph::NodeIndex, visit::DfsPostOrder, Directed, Direction::{self, Incoming, Outgoing}, Graph}; +use crate::molecule::{Atom, Bond, Molecule}; +use petgraph::{adj::Neighbors, dot::{Config, Dot}, graph::NodeIndex, visit::DfsPostOrder, Directed, Direction::{self, Incoming, Outgoing}, Graph, Undirected}; #[derive(Debug, Clone)] struct DAGVert { @@ -39,8 +39,9 @@ impl MolAtomNode { } // Compute the assembly index of a molecule -pub fn canonize(m: &Molecule) -> u32 { +pub fn canonize(m: &Molecule) -> String { let mol_graph = m.get_graph().clone(); + let mut max_string = String::new(); for root in mol_graph.node_indices() { // for each node in the molecule graph create a signature /* @@ -74,10 +75,10 @@ pub fn canonize(m: &Molecule) -> u32 { let (curr, level) = visited.pop_front().unwrap(); - println!("Current:{:?}", curr); + // println!("Current:{:?}", curr); for neigh in mol_graph.neighbors(curr) { - println!("Neighbor:{:?}", neigh); + // println!("Neighbor:{:?}", neigh); let mut add_node_to_dag = false; @@ -85,8 +86,8 @@ pub fn canonize(m: &Molecule) -> u32 { match seen_edges_cache.get(&(curr, neigh)) { Some(seen_at_level) => { // edge already exists at a level above - println!("Already Seen: {:?} <--> {:?} at level {}", curr, neigh, *seen_at_level); - println!("Current Level: {}", level); + // println!("Already Seen: {:?} <--> {:?} at level {}", curr, neigh, *seen_at_level); + // println!("Current Level: {}", level); if *seen_at_level < (level+1) { continue; } @@ -152,9 +153,9 @@ pub fn canonize(m: &Molecule) -> u32 { break; } } - println!("DAG before invariants"); - println!("{:?}", Dot::with_config(&DAG, &[Config::EdgeNoLabel])); - } + // println!("DAG before invariants"); + // println!("{:?}", Dot::with_config(&DAG, &[Config::EdgeNoLabel])); + } // println!("DAG Lvl by lvl list"); // println!("{:?}", dag_level_list); @@ -174,109 +175,243 @@ pub fn canonize(m: &Molecule) -> u32 { extended_molg_atom_map.insert(atom_node, MolAtomNode::new(0, 0, atom_order_str)); } - // sort + // lexico-sort let mut ordered_vec: Vec<_> = order_str_set.into_iter().collect(); ordered_vec.string_sort_unstable(lexical_only_alnum_cmp); - // println!("{:?}", ordered_vec); - let mut order_idx:HashMap = HashMap::new(); for (idx, order_str) in ordered_vec.iter().enumerate() { order_idx.insert(order_str.clone(), idx as u32); } + // update the molecule graph invariant based on order idx of lexico-sort of (atom_type,#parents in DAG) for atom_node in mol_graph.node_indices() { extended_molg_atom_map.entry(atom_node).and_modify(|ext_molg_atom| ext_molg_atom.inv = *order_idx.get(&ext_molg_atom.order).unwrap()); } - // println!("Extended Atom Entry"); - // println!("{:?}", extended_molg_atom_map); + // get the canonized string for current root atom + let canon_string = canonize_signature(&mol_graph, &mut DAG, &mut extended_molg_atom_map, &dag_level_list, &mol_g_dag_vertex_map, max_level, 1, "".to_string()); - /* - 3. Generate Invariant for Atoms - */ + // lexico-compare strings to save the max one. + if lexical_cmp(&max_string, &canon_string).is_lt() { + max_string = canon_string + } + + // break; + } + return max_string; +} - loop { - // let start_invariants_atoms; - - let start_inv_atoms = mol_graph.node_indices() - .into_iter() - .map(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv.to_string()) - .collect::(); +fn canonize_signature( + mol_graph: &Graph, + mut DAG: &mut Graph::, + mut extended_molg_atom_map: &mut HashMap, + dag_level_list: &HashMap>, + mol_g_dag_vertex_map: &HashMap, + max_level: u32, + color_c: u32, + s_max: String, +) -> String { + // 1. get the invariants for each atom + invariant_atom(&mol_graph, &mut DAG, &mut extended_molg_atom_map, &dag_level_list, max_level); + + // 2. generate orbits based on atom's invariant values + let mut orbits: HashMap> = HashMap::new(); + + for atom in mol_graph.node_indices() { + let atom_inv = extended_molg_atom_map.get(&atom).unwrap().inv; + let parent_len = DAG[*mol_g_dag_vertex_map.get(&atom).unwrap()].parents.len() as u32; + // only add atoms which have 2 or more parents in DAG + if parent_len >= 2 { + orbits.entry(atom_inv).and_modify(|atom_list| atom_list.push(atom)).or_insert([atom].to_vec()); + } + } + + // 3. max length of any orbit + let mut max_orbit_len = 0; + orbits.values().for_each(|orbit| if orbit.len() > max_orbit_len {max_orbit_len = orbit.len()}); + + if max_orbit_len >= 2 { + // find the orbits with max len of atoms + let max_orbits = orbits.keys().filter(|orbit| orbits.get(&orbit).unwrap().len() == max_orbit_len).collect::>(); + // if multiple then use orbit with min value + let min_orbit = (if (&max_orbits.len()).clone() > 1 {max_orbits.iter().min()} else {max_orbits.first()}).unwrap(); + + let mut local_smax = String::new(); + // recurse further for each of the atom in such a orbit and generate a canonized signature by diff. the atoms in same orbit + for atom in orbits.get(&min_orbit).unwrap() { + extended_molg_atom_map.entry(*atom).and_modify(|atom_node| atom_node.color = color_c as u32); + local_smax = canonize_signature(&mol_graph, &mut DAG, &mut extended_molg_atom_map, &dag_level_list, &mol_g_dag_vertex_map, max_level, color_c+1, s_max.clone()); + extended_molg_atom_map.entry(*atom).and_modify(|atom_node| atom_node.color = 0); + } + return local_smax; + } + else { + // not need to recurse further and print the signature-string + for atom in mol_graph.node_indices() { + let atom_inv = extended_molg_atom_map.get(&atom).unwrap().inv; + let atom_color = extended_molg_atom_map.get(&atom).unwrap().color; + let parent_len = DAG[*mol_g_dag_vertex_map.get(&atom).unwrap()].parents.len() as u32; + // first update any atom without a color to be same as its invariant value + if (atom_color == 0) && (parent_len >= 2) { + extended_molg_atom_map.entry(atom).and_modify(|atom_node| atom_node.color = atom_inv); + } + } + // start from root node of the DAG + let root_node = DAG.node_indices().find(|vert| DAG.neighbors_directed(*vert, Incoming).count() == 0).unwrap(); + let local_smax = print_signature_string(root_node, &DAG, &mol_graph, &extended_molg_atom_map, &mut vec![]); + if local_smax.len() > s_max.len() { + return local_smax; + } + else { + return s_max; + } + } +} - println!("Begin Atom Invariants: {}", start_inv_atoms); +fn print_signature_string( + vertex: NodeIndex, + DAG: &Graph::, + mol_graph: &Graph, + extended_molg_atom_map: &HashMap, + mut edges: &mut Vec<(NodeIndex, NodeIndex)> +) -> String { + let mut print_sign = String::new(); + print_sign.push('['); + let atom_idx = DAG[vertex].atom_idx; + let atom = &mol_graph[DAG[vertex].atom_idx]; + print_sign.push_str(&atom.get_element()); + let atom_color = extended_molg_atom_map.get(&atom_idx).unwrap().color; + if atom_color != 0 { + print_sign.push(','); + print_sign.push_str(&atom_color.to_string()); + } + print_sign.push(']'); + + let dag_childs = &DAG.neighbors_directed(vertex, Outgoing); + if dag_childs.clone().count() == 0 { return print_sign; } + else { + print_sign.push('('); + + // sort children in descending order of inv + let mut child_vec = dag_childs.clone().collect::>(); + child_vec.sort_by(|vert_a, vert_b| DAG[*vert_b].inv.cmp(&DAG[*vert_a].inv)); + for child in child_vec { + if let Some(_edge) = edges.iter().find(|egde| (egde.0 == vertex) && (egde.1 == child)) {} + else { + // if the edge is not already seen then add it to seen and generate signature-string for the child + edges.push((vertex, child)); + print_sign.push_str(&print_signature_string(child, &DAG, &mol_graph, &extended_molg_atom_map, edges)); + } + } + print_sign.push(')'); + return print_sign; + } +} - /* - 3.1 Generate Invariants for DAG vertex - */ - invariant_dag(&mut DAG, &extended_molg_atom_map, &dag_level_list, max_level, true); +/* + 3. Generate Invariant for Atoms + */ +fn invariant_atom( + mol_graph: &Graph, + mut DAG: &mut Graph::, + extended_molg_atom_map: &mut HashMap, + dag_level_list: &HashMap>, + max_level: u32, +) { + let mut count = 0; + loop { + let start_inv_atoms = mol_graph.node_indices() + .into_iter() + .map(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv.to_string()) + .collect::(); - // println!("DAG after bottom invariants"); - // println!("{:?}", Dot::with_config(&DAG, &[Config::EdgeNoLabel])); + println!("Begin Atom Invariants: {}", start_inv_atoms); - invariant_dag(&mut DAG, &extended_molg_atom_map, &dag_level_list, max_level, false); + /* + 3.1 Generate Invariants for DAG vertex + */ - // println!("DAG after top invariants"); - // println!("{:?}", Dot::with_config(&DAG, &[Config::EdgeNoLabel])); + // first bottom-up + invariant_dag_vert(&mut DAG, &extended_molg_atom_map, &dag_level_list, max_level, true); - let mut order_map_vert_atom: HashMap> = HashMap::new(); - for atom in mol_graph.node_indices() { - order_map_vert_atom.insert(atom, vec![0;(max_level+1).try_into().unwrap()]); - } + // println!("DAG after bottom invariants"); + // println!("{:?}", Dot::with_config(&DAG, &[Config::EdgeNoLabel])); - for vert in DAG.node_indices() { - order_map_vert_atom.entry(DAG[vert].atom_idx).and_modify(|order_vec| order_vec[DAG[vert].level as usize] = DAG[vert].inv); - } + // then top-down + invariant_dag_vert(&mut DAG, &extended_molg_atom_map, &dag_level_list, max_level, false); - let mut order_to_atom: HashMap> = HashMap::new(); + // println!("DAG after top invariants"); + // println!("{:?}", Dot::with_config(&DAG, &[Config::EdgeNoLabel])); - for atom in mol_graph.node_indices() { - let order_str = order_map_vert_atom.get(&atom).unwrap().into_iter().map(|i| i.to_string()).collect::(); - order_to_atom.entry(order_str).and_modify(|atom_list| atom_list.push(atom)).or_insert([atom].to_vec()); - } + // Create a vector for each atom in molecule graph based on associated vertex in + let mut order_map_vert_atom: HashMap> = HashMap::new(); + for atom in mol_graph.node_indices() { + order_map_vert_atom.insert(atom, vec![0;(max_level+1).try_into().unwrap()]); + } - // sort in descending order - let mut atom_ordered_vec: Vec<_> = order_to_atom.keys().into_iter().collect(); - atom_ordered_vec.string_sort_unstable(lexical_only_alnum_cmp); - atom_ordered_vec.reverse(); + for vert in DAG.node_indices() { + order_map_vert_atom.entry(DAG[vert].atom_idx).and_modify(|order_vec| order_vec[DAG[vert].level as usize] = DAG[vert].inv); + } - for (idx, order) in atom_ordered_vec.iter().enumerate() { - for atom in order_to_atom.get(*order).unwrap() { - extended_molg_atom_map.entry(*atom).and_modify(|atom_node| atom_node.inv = idx as u32); - } + let mut order_to_atom: HashMap> = HashMap::new(); + + // turn vectors into strings for sorting + for atom in mol_graph.node_indices() { + let order_str = order_map_vert_atom.get(&atom).unwrap().into_iter().map(|i| i.to_string()).collect::(); + order_to_atom.entry(order_str).and_modify(|atom_list| atom_list.push(atom)).or_insert([atom].to_vec()); + } + + // lexico-sort the vectors-strings in descending order + let mut atom_ordered_vec: Vec<_> = order_to_atom.keys().into_iter().collect(); + atom_ordered_vec.string_sort_unstable(lexical_only_alnum_cmp); + atom_ordered_vec.reverse(); + + // assign the invariant of atom as the order of vectors-strings + for (idx, order) in atom_ordered_vec.iter().enumerate() { + for atom in order_to_atom.get(*order).unwrap() { + extended_molg_atom_map.entry(*atom).and_modify(|atom_node| atom_node.inv = idx as u32); } + } - let end_inv_atoms = mol_graph.node_indices() - .into_iter() - .map(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv.to_string()) - .collect::(); + let end_inv_atoms = mol_graph.node_indices() + .into_iter() + .map(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv.to_string()) + .collect::(); - println!("End Atom Invariants: {}", end_inv_atoms); + println!("End Atom Invariants: {}", end_inv_atoms); - if start_inv_atoms == end_inv_atoms {break} + // compare invariants of all the atoms with the one's they started from + if start_inv_atoms == end_inv_atoms {break;} + if count > 100 { + println!("Breaking because the invariants never settle!"); + break; } - break; + count +=1; } - 0 } -fn invariant_dag( +/* + 3. Generate Invariant for Vertices + */ +fn invariant_dag_vert( DAG: &mut Graph::, extended_molg_atom_map: &HashMap, dag_level_list: &HashMap>, max_level: u32, bottom: bool) { - // let mut vert_order_map:HashMap = HashMap::new(); + // top-down or bottom-up calculation of invariants for each vertex in DAG let mut curr_lvl_range = if bottom {max_level} else {0}; loop { + // for each vertex generate a invariant-string based on assoc. atom color and atom invariant + directed neighbors let mut order_str_set: HashSet = HashSet::new(); for vert in dag_level_list.get(&curr_lvl_range).unwrap() { let atom_idx_for_vert = DAG[*vert].atom_idx; let atom_node = extended_molg_atom_map.get(&atom_idx_for_vert).unwrap(); let (atom_color, atom_inv) = (atom_node.color, atom_node.inv); let mut vert_order = format!("{}{}", atom_color, atom_inv); - // let vert_neighbor_list + if bottom { for vert_neigh in DAG.neighbors_directed(*vert, Outgoing) { vert_order.push_str(&format!("{}", DAG[vert_neigh].inv)); @@ -292,18 +427,18 @@ fn invariant_dag( order_str_set.insert(vert_order); } + // lexico-sort the invariant-strings in descending order let mut ordered_vec: Vec<_> = order_str_set.into_iter().collect(); ordered_vec.string_sort_unstable(lexical_only_alnum_cmp); ordered_vec.reverse(); - println!("{:?}", ordered_vec); - let mut order_idx:HashMap = HashMap::new(); for (idx, order_str) in ordered_vec.iter().enumerate() { order_idx.insert(order_str.clone(), idx as u32); } - + + // assign the invariant of vertex as the order of invariant-strings for vert in dag_level_list.get(&curr_lvl_range).unwrap() { DAG[*vert].inv = (*order_idx.get(&DAG[*vert].order).unwrap())+1; } From 11b0c54cb8ed006423a647cb011b990600a7c120 Mon Sep 17 00:00:00 2001 From: Devendra Parkar Date: Thu, 19 Dec 2024 17:37:59 -0700 Subject: [PATCH 10/13] added better stopping cond. for atom invariants --- src/canonize.rs | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/src/canonize.rs b/src/canonize.rs index e5c8cd30..4a0f7a22 100644 --- a/src/canonize.rs +++ b/src/canonize.rs @@ -197,8 +197,6 @@ pub fn canonize(m: &Molecule) -> String { if lexical_cmp(&max_string, &canon_string).is_lt() { max_string = canon_string } - - // break; } return max_string; } @@ -289,14 +287,14 @@ fn print_signature_string( } print_sign.push(']'); - let dag_childs = &DAG.neighbors_directed(vertex, Outgoing); - if dag_childs.clone().count() == 0 { return print_sign; } + let mut child_vec = DAG.neighbors_directed(vertex, Outgoing).collect::>(); + if child_vec.len() == 0 { return print_sign; } else { - print_sign.push('('); - // sort children in descending order of inv - let mut child_vec = dag_childs.clone().collect::>(); + // let child_vec = dag_childs; child_vec.sort_by(|vert_a, vert_b| DAG[*vert_b].inv.cmp(&DAG[*vert_a].inv)); + + print_sign.push('('); for child in child_vec { if let Some(_edge) = edges.iter().find(|egde| (egde.0 == vertex) && (egde.1 == child)) {} else { @@ -322,12 +320,10 @@ fn invariant_atom( ) { let mut count = 0; loop { - let start_inv_atoms = mol_graph.node_indices() - .into_iter() - .map(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv.to_string()) - .collect::(); - - println!("Begin Atom Invariants: {}", start_inv_atoms); + let start_inv_atoms = HashSet::::from_iter(mol_graph.node_indices() + .into_iter() + .map(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv)).len(); + // println!("Begin Atom Invariants: {}", start_inv_atoms); /* 3.1 Generate Invariants for DAG vertex @@ -375,17 +371,20 @@ fn invariant_atom( } } - let end_inv_atoms = mol_graph.node_indices() - .into_iter() - .map(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv.to_string()) - .collect::(); - println!("End Atom Invariants: {}", end_inv_atoms); + let end_inv_atoms = HashSet::::from_iter(mol_graph.node_indices() + .into_iter() + .map(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv)).len(); + - // compare invariants of all the atoms with the one's they started from + // println!("End Atom Invariants: {}", end_inv_atoms); + + // compare the no. of invariants of all the atoms with the one's they started from if start_inv_atoms == end_inv_atoms {break;} - if count > 100 { - println!("Breaking because the invariants never settle!"); + + // Naive way of stopping + if count > mol_graph.node_count() { + println!("breaking out because reached upper limit!"); break; } count +=1; From 7538a5d8374fe07a51c1f8209e97f497e5650e3f Mon Sep 17 00:00:00 2001 From: Devendra Parkar Date: Sat, 21 Dec 2024 14:40:10 -0700 Subject: [PATCH 11/13] fixed a few sorting mismatch in invariant calculation --- src/canonize.rs | 100 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 79 insertions(+), 21 deletions(-) diff --git a/src/canonize.rs b/src/canonize.rs index 4a0f7a22..98590dae 100644 --- a/src/canonize.rs +++ b/src/canonize.rs @@ -1,7 +1,7 @@ use std::collections::{btree_set::Range, HashMap, HashSet, VecDeque}; -use lexical_sort::{lexical_cmp, lexical_only_alnum_cmp, StringSort}; +use lexical_sort::{cmp, lexical_cmp, lexical_only_alnum_cmp, natural_cmp, StringSort}; -use crate::molecule::{Atom, Bond, Molecule}; +use crate::molecule::{Atom, Bond, Molecule, Element}; use petgraph::{adj::Neighbors, dot::{Config, Dot}, graph::NodeIndex, visit::DfsPostOrder, Directed, Direction::{self, Incoming, Outgoing}, Graph, Undirected}; #[derive(Debug, Clone)] @@ -40,9 +40,52 @@ impl MolAtomNode { // Compute the assembly index of a molecule pub fn canonize(m: &Molecule) -> String { - let mol_graph = m.get_graph().clone(); + let mol_graph = m.get_graph(); + + /* + // Dummy Molecule for testing + + let mut mol_graph: Graph = Graph::::new_undirected(); + + let mut vec_nodes: Vec = Vec::new(); + vec_nodes.push(NodeIndex::new(99999)); + for i in 0..16 { + vec_nodes.push(mol_graph.add_node(Atom::new(Element::Carbon))); + } + mol_graph.add_edge(vec_nodes[1],vec_nodes[2],Bond::Single); + mol_graph.add_edge(vec_nodes[2],vec_nodes[3],Bond::Single); + mol_graph.add_edge(vec_nodes[3],vec_nodes[4],Bond::Single); + mol_graph.add_edge(vec_nodes[4],vec_nodes[1],Bond::Single); + + mol_graph.add_edge(vec_nodes[1],vec_nodes[5],Bond::Single); + mol_graph.add_edge(vec_nodes[2],vec_nodes[7],Bond::Single); + mol_graph.add_edge(vec_nodes[3],vec_nodes[9],Bond::Single); + mol_graph.add_edge(vec_nodes[4],vec_nodes[11],Bond::Single); + + mol_graph.add_edge(vec_nodes[13],vec_nodes[14],Bond::Single); + mol_graph.add_edge(vec_nodes[14],vec_nodes[15],Bond::Single); + mol_graph.add_edge(vec_nodes[15],vec_nodes[16],Bond::Single); + mol_graph.add_edge(vec_nodes[16],vec_nodes[13],Bond::Single); + + mol_graph.add_edge(vec_nodes[13],vec_nodes[6],Bond::Single); + mol_graph.add_edge(vec_nodes[14],vec_nodes[8],Bond::Single); + mol_graph.add_edge(vec_nodes[15],vec_nodes[10],Bond::Single); + mol_graph.add_edge(vec_nodes[16],vec_nodes[12],Bond::Single); + + mol_graph.add_edge(vec_nodes[12],vec_nodes[5],Bond::Single); + mol_graph.add_edge(vec_nodes[6],vec_nodes[7],Bond::Single); + mol_graph.add_edge(vec_nodes[8],vec_nodes[9],Bond::Single); + mol_graph.add_edge(vec_nodes[10],vec_nodes[11],Bond::Single); + + mol_graph.add_edge(vec_nodes[12],vec_nodes[11],Bond::Single); + mol_graph.add_edge(vec_nodes[6],vec_nodes[5],Bond::Single); + mol_graph.add_edge(vec_nodes[8],vec_nodes[7],Bond::Single); + mol_graph.add_edge(vec_nodes[10],vec_nodes[9],Bond::Single); + */ + let mut max_string = String::new(); for root in mol_graph.node_indices() { + // let root = vec_nodes[1]; // for each node in the molecule graph create a signature /* 1. create a DAG from each start node @@ -182,7 +225,7 @@ pub fn canonize(m: &Molecule) -> String { let mut order_idx:HashMap = HashMap::new(); for (idx, order_str) in ordered_vec.iter().enumerate() { - order_idx.insert(order_str.clone(), idx as u32); + order_idx.insert(order_str.clone(), (idx as u32)+1); } // update the molecule graph invariant based on order idx of lexico-sort of (atom_type,#parents in DAG) @@ -320,9 +363,14 @@ fn invariant_atom( ) { let mut count = 0; loop { - let start_inv_atoms = HashSet::::from_iter(mol_graph.node_indices() + // let start_inv_atoms = HashSet::::from_iter(mol_graph.node_indices() + // .into_iter() + // .map(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv)).len(); + + // Better stopping condition ?? + let start_inv_atoms = mol_graph.node_indices() .into_iter() - .map(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv)).len(); + .filter(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv == 0).count(); // println!("Begin Atom Invariants: {}", start_inv_atoms); /* @@ -333,13 +381,13 @@ fn invariant_atom( invariant_dag_vert(&mut DAG, &extended_molg_atom_map, &dag_level_list, max_level, true); // println!("DAG after bottom invariants"); - // println!("{:?}", Dot::with_config(&DAG, &[Config::EdgeNoLabel])); + // println!("{:?}", Dot::with_config(&*DAG, &[Config::EdgeNoLabel])); // then top-down invariant_dag_vert(&mut DAG, &extended_molg_atom_map, &dag_level_list, max_level, false); // println!("DAG after top invariants"); - // println!("{:?}", Dot::with_config(&DAG, &[Config::EdgeNoLabel])); + // println!("{:?}", Dot::with_config(&*DAG, &[Config::EdgeNoLabel])); // Create a vector for each atom in molecule graph based on associated vertex in let mut order_map_vert_atom: HashMap> = HashMap::new(); @@ -359,22 +407,26 @@ fn invariant_atom( order_to_atom.entry(order_str).and_modify(|atom_list| atom_list.push(atom)).or_insert([atom].to_vec()); } - // lexico-sort the vectors-strings in descending order + // lexico-sort the vectors-strings let mut atom_ordered_vec: Vec<_> = order_to_atom.keys().into_iter().collect(); atom_ordered_vec.string_sort_unstable(lexical_only_alnum_cmp); - atom_ordered_vec.reverse(); // assign the invariant of atom as the order of vectors-strings for (idx, order) in atom_ordered_vec.iter().enumerate() { for atom in order_to_atom.get(*order).unwrap() { - extended_molg_atom_map.entry(*atom).and_modify(|atom_node| atom_node.inv = idx as u32); + extended_molg_atom_map.entry(*atom).and_modify(|atom_node| atom_node.inv = (idx as u32)+1); } } - let end_inv_atoms = HashSet::::from_iter(mol_graph.node_indices() + // let end_inv_atoms = HashSet::::from_iter(mol_graph.node_indices() + // .into_iter() + // .map(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv)).len(); + + // Better stopping condition ?? + let end_inv_atoms = mol_graph.node_indices() .into_iter() - .map(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv)).len(); + .filter(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv == 0).count(); // println!("End Atom Invariants: {}", end_inv_atoms); @@ -410,25 +462,31 @@ fn invariant_dag_vert( let atom_node = extended_molg_atom_map.get(&atom_idx_for_vert).unwrap(); let (atom_color, atom_inv) = (atom_node.color, atom_node.inv); let mut vert_order = format!("{}{}", atom_color, atom_inv); - + let mut child_inv_set: Vec = Vec::new(); + if bottom { for vert_neigh in DAG.neighbors_directed(*vert, Outgoing) { - vert_order.push_str(&format!("{}", DAG[vert_neigh].inv)); + child_inv_set.push(DAG[vert_neigh].inv); } } else { for vert_neigh in DAG.neighbors_directed(*vert, Incoming) { - vert_order.push_str(&format!("{}", DAG[vert_neigh].inv)); + child_inv_set.push(DAG[vert_neigh].inv); } } - - DAG[*vert].order = vert_order.clone(); - order_str_set.insert(vert_order); + + child_inv_set.sort(); + child_inv_set.reverse(); + child_inv_set.iter().for_each(|val| vert_order.push_str(&format!("{}", *val))); + + let vec_string = format!("{:0>20}", vert_order); + DAG[*vert].order = vec_string.clone(); + order_str_set.insert(vec_string); } // lexico-sort the invariant-strings in descending order - let mut ordered_vec: Vec<_> = order_str_set.into_iter().collect(); - ordered_vec.string_sort_unstable(lexical_only_alnum_cmp); + let mut ordered_vec: Vec = order_str_set.into_iter().collect(); + ordered_vec.string_sort_unstable(natural_cmp); ordered_vec.reverse(); let mut order_idx:HashMap = HashMap::new(); From e5ecd554bd35cc1f3648eb4a1ac2b130092ff631 Mon Sep 17 00:00:00 2001 From: Devendra Parkar Date: Tue, 4 Feb 2025 18:56:57 -0700 Subject: [PATCH 12/13] stable and consistent string generation --- src/canonize.rs | 144 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 103 insertions(+), 41 deletions(-) diff --git a/src/canonize.rs b/src/canonize.rs index 98590dae..d7eeaf3b 100644 --- a/src/canonize.rs +++ b/src/canonize.rs @@ -29,12 +29,13 @@ impl DAGVert { struct MolAtomNode { color: u32, inv: u32, - order: String + order: String, + num_parents: u32 } impl MolAtomNode { - pub fn new(color: u32, inv: u32, order: String) -> Self { - MolAtomNode {color, inv, order} + pub fn new(color: u32, inv: u32, order: String, num_parents: u32) -> Self { + MolAtomNode {color, inv, order, num_parents} } } @@ -42,7 +43,7 @@ impl MolAtomNode { pub fn canonize(m: &Molecule) -> String { let mol_graph = m.get_graph(); - /* + // /* // Dummy Molecule for testing let mut mol_graph: Graph = Graph::::new_undirected(); @@ -81,11 +82,11 @@ pub fn canonize(m: &Molecule) -> String { mol_graph.add_edge(vec_nodes[6],vec_nodes[5],Bond::Single); mol_graph.add_edge(vec_nodes[8],vec_nodes[7],Bond::Single); mol_graph.add_edge(vec_nodes[10],vec_nodes[9],Bond::Single); - */ + // */ let mut max_string = String::new(); for root in mol_graph.node_indices() { - // let root = vec_nodes[1]; + let root = vec_nodes[1]; // for each node in the molecule graph create a signature /* 1. create a DAG from each start node @@ -94,7 +95,7 @@ pub fn canonize(m: &Molecule) -> String { let mut DAG_vertex_map: HashMap = HashMap::new(); - let mut mol_g_dag_vertex_map: HashMap = HashMap::new(); + let mut mol_g_dag_vertex_map: HashMap> = HashMap::new(); let mut dag_level_list: HashMap> = HashMap::new(); @@ -112,7 +113,7 @@ pub fn canonize(m: &Molecule) -> String { let root_vertex_id = DAG.add_node(DAGVert::new(root, [].to_vec(), 0)); DAG_vertex_map.insert(format!("{:?}-{}", root, 0), root_vertex_id); dag_level_list.insert(0, [root_vertex_id].to_vec()); - mol_g_dag_vertex_map.insert(root, root_vertex_id); + mol_g_dag_vertex_map.insert(root, [root_vertex_id].to_vec()); loop { @@ -174,7 +175,11 @@ pub fn canonize(m: &Molecule) -> String { // let child_node_idx = DAG.add_node(format!("{:?}-{}", neigh, level+1)); let child_node_idx = DAG.add_node(DAGVert::new(neigh, [].to_vec(), level+1)); DAG_vertex_map.insert(format!("{:?}-{}", neigh, level+1), child_node_idx); - mol_g_dag_vertex_map.insert(neigh, child_node_idx); + + // Overriding the map!!! neigh can be seen before in previous layer + // mol_g_dag_vertex_map.insert(, child_node_idx); + mol_g_dag_vertex_map.entry(neigh).and_modify(|vert_assoc_list| vert_assoc_list.push(child_node_idx)).or_insert([child_node_idx].to_vec()); + // Insert into a level by level hashmap of dag nodes dag_level_list.entry((level+1)).and_modify(|level_vert_list| level_vert_list.push(child_node_idx)).or_insert([child_node_idx].to_vec()); @@ -209,13 +214,22 @@ pub fn canonize(m: &Molecule) -> String { */ let mut extended_molg_atom_map: HashMap = HashMap::new(); let mut order_str_set: HashSet = HashSet::new(); + + // Each atom does not have just one vertex in dag!!! for atom_node in mol_graph.node_indices() { - let dag_vert = mol_g_dag_vertex_map.get(&atom_node).unwrap(); - let parent_len = DAG[*dag_vert].parents.len(); + // find unique parents for an atom's associated vertices in DAG + let atom_assoc_vert_list = mol_g_dag_vertex_map.get(&atom_node).unwrap(); + let mut parents= HashSet::new(); + for vert_id in atom_assoc_vert_list { + for parent in &DAG[*vert_id].parents { + parents.insert(parent); + } + } + let parent_len = parents.len(); let atom_str = mol_graph[atom_node].get_element(); let atom_order_str = format!("{}{}", atom_str, parent_len); order_str_set.insert(atom_order_str.clone()); - extended_molg_atom_map.insert(atom_node, MolAtomNode::new(0, 0, atom_order_str)); + extended_molg_atom_map.insert(atom_node, MolAtomNode::new(0, 0, atom_order_str, parent_len as u32)); } // lexico-sort @@ -233,13 +247,20 @@ pub fn canonize(m: &Molecule) -> String { extended_molg_atom_map.entry(atom_node).and_modify(|ext_molg_atom| ext_molg_atom.inv = *order_idx.get(&ext_molg_atom.order).unwrap()); } + // println!("{:?}", extended_molg_atom_map); + // get the canonized string for current root atom let canon_string = canonize_signature(&mol_graph, &mut DAG, &mut extended_molg_atom_map, &dag_level_list, &mol_g_dag_vertex_map, max_level, 1, "".to_string()); + println!("{:?}", canon_string); + + break; // lexico-compare strings to save the max one. if lexical_cmp(&max_string, &canon_string).is_lt() { max_string = canon_string } + + break; } return max_string; } @@ -249,7 +270,7 @@ fn canonize_signature( mut DAG: &mut Graph::, mut extended_molg_atom_map: &mut HashMap, dag_level_list: &HashMap>, - mol_g_dag_vertex_map: &HashMap, + mol_g_dag_vertex_map: &HashMap>, max_level: u32, color_c: u32, s_max: String, @@ -257,12 +278,14 @@ fn canonize_signature( // 1. get the invariants for each atom invariant_atom(&mol_graph, &mut DAG, &mut extended_molg_atom_map, &dag_level_list, max_level); + // return "".to_string(); // 2. generate orbits based on atom's invariant values let mut orbits: HashMap> = HashMap::new(); for atom in mol_graph.node_indices() { - let atom_inv = extended_molg_atom_map.get(&atom).unwrap().inv; - let parent_len = DAG[*mol_g_dag_vertex_map.get(&atom).unwrap()].parents.len() as u32; + let extended_atom = extended_molg_atom_map.get(&atom).unwrap(); + let atom_inv = extended_atom.inv; + let parent_len = extended_atom.num_parents; // only add atoms which have 2 or more parents in DAG if parent_len >= 2 { orbits.entry(atom_inv).and_modify(|atom_list| atom_list.push(atom)).or_insert([atom].to_vec()); @@ -279,21 +302,25 @@ fn canonize_signature( // if multiple then use orbit with min value let min_orbit = (if (&max_orbits.len()).clone() > 1 {max_orbits.iter().min()} else {max_orbits.first()}).unwrap(); - let mut local_smax = String::new(); + // let mut local_smax = String::new(); + let mut local_smax = s_max.clone(); + + println!("Max Orbit: {:?}", orbits.get(&min_orbit).unwrap()); // recurse further for each of the atom in such a orbit and generate a canonized signature by diff. the atoms in same orbit for atom in orbits.get(&min_orbit).unwrap() { extended_molg_atom_map.entry(*atom).and_modify(|atom_node| atom_node.color = color_c as u32); - local_smax = canonize_signature(&mol_graph, &mut DAG, &mut extended_molg_atom_map, &dag_level_list, &mol_g_dag_vertex_map, max_level, color_c+1, s_max.clone()); + local_smax = canonize_signature(&mol_graph, &mut DAG, &mut extended_molg_atom_map, &dag_level_list, &mol_g_dag_vertex_map, max_level, color_c+1, local_smax); extended_molg_atom_map.entry(*atom).and_modify(|atom_node| atom_node.color = 0); } return local_smax; } else { - // not need to recurse further and print the signature-string + // no need to recurse further and print the signature-string for atom in mol_graph.node_indices() { - let atom_inv = extended_molg_atom_map.get(&atom).unwrap().inv; - let atom_color = extended_molg_atom_map.get(&atom).unwrap().color; - let parent_len = DAG[*mol_g_dag_vertex_map.get(&atom).unwrap()].parents.len() as u32; + let extended_atom = extended_molg_atom_map.get(&atom).unwrap(); + let atom_inv = extended_atom.inv; + let atom_color = extended_atom.color; + let parent_len = extended_atom.num_parents; // first update any atom without a color to be same as its invariant value if (atom_color == 0) && (parent_len >= 2) { extended_molg_atom_map.entry(atom).and_modify(|atom_node| atom_node.color = atom_inv); @@ -362,15 +389,17 @@ fn invariant_atom( max_level: u32, ) { let mut count = 0; + let mut initial = true; loop { - // let start_inv_atoms = HashSet::::from_iter(mol_graph.node_indices() - // .into_iter() - // .map(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv)).len(); + // Unique invariant values + let start_inv_atoms = HashSet::::from_iter(mol_graph.node_indices() + .into_iter() + .map(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv)).len(); // Better stopping condition ?? - let start_inv_atoms = mol_graph.node_indices() - .into_iter() - .filter(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv == 0).count(); + // let start_inv_atoms = mol_graph.node_indices() + // .into_iter() + // .filter(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv == 0).count(); // println!("Begin Atom Invariants: {}", start_inv_atoms); /* @@ -378,25 +407,31 @@ fn invariant_atom( */ // first bottom-up - invariant_dag_vert(&mut DAG, &extended_molg_atom_map, &dag_level_list, max_level, true); + invariant_dag_vert(&mut DAG, &extended_molg_atom_map, &dag_level_list, max_level, true, initial); // println!("DAG after bottom invariants"); // println!("{:?}", Dot::with_config(&*DAG, &[Config::EdgeNoLabel])); + initial = false; // then top-down - invariant_dag_vert(&mut DAG, &extended_molg_atom_map, &dag_level_list, max_level, false); + invariant_dag_vert(&mut DAG, &extended_molg_atom_map, &dag_level_list, max_level, false, initial); // println!("DAG after top invariants"); // println!("{:?}", Dot::with_config(&*DAG, &[Config::EdgeNoLabel])); - + // Create a vector for each atom in molecule graph based on associated vertex in let mut order_map_vert_atom: HashMap> = HashMap::new(); for atom in mol_graph.node_indices() { order_map_vert_atom.insert(atom, vec![0;(max_level+1).try_into().unwrap()]); } + // sort the vectors correctly + // for vert in DAG.node_indices() { + // order_map_vert_atom.entry(DAG[vert].atom_idx).and_modify(|order_vec| order_vec[DAG[vert].level as usize] = DAG[vert].inv); + // } + //for reverse sorting max_level - DAG[vert].level as per paper for vert in DAG.node_indices() { - order_map_vert_atom.entry(DAG[vert].atom_idx).and_modify(|order_vec| order_vec[DAG[vert].level as usize] = DAG[vert].inv); + order_map_vert_atom.entry(DAG[vert].atom_idx).and_modify(|order_vec| order_vec[(max_level - DAG[vert].level) as usize] = DAG[vert].inv); } let mut order_to_atom: HashMap> = HashMap::new(); @@ -410,6 +445,14 @@ fn invariant_atom( // lexico-sort the vectors-strings let mut atom_ordered_vec: Vec<_> = order_to_atom.keys().into_iter().collect(); atom_ordered_vec.string_sort_unstable(lexical_only_alnum_cmp); + // atom_ordered_vec.string_sort_unstable(natural_cmp); + // descend sort + atom_ordered_vec.reverse(); + + // println!("Order vector to Atom"); + // println!("{:?}",order_to_atom); + // println!("Order vector string to index"); + // println!("{:?}",atom_ordered_vec); // assign the invariant of atom as the order of vectors-strings for (idx, order) in atom_ordered_vec.iter().enumerate() { @@ -419,20 +462,31 @@ fn invariant_atom( } - // let end_inv_atoms = HashSet::::from_iter(mol_graph.node_indices() - // .into_iter() - // .map(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv)).len(); + let end_inv_atoms = HashSet::::from_iter(mol_graph.node_indices() + .into_iter() + .map(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv)).len(); // Better stopping condition ?? - let end_inv_atoms = mol_graph.node_indices() - .into_iter() - .filter(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv == 0).count(); + // let end_inv_atoms = mol_graph.node_indices() + // .into_iter() + // .filter(|atom_idx| extended_molg_atom_map.get(&atom_idx).unwrap().inv == 0).count(); // println!("End Atom Invariants: {}", end_inv_atoms); // compare the no. of invariants of all the atoms with the one's they started from - if start_inv_atoms == end_inv_atoms {break;} + if start_inv_atoms == end_inv_atoms { + println!("breaking out because no. of invariant values don't change"); + + println!("Atom Invariants"); + // println!("{:?}", Dot::with_config(&*DAG, &[Config::EdgeNoLabel])); + // println!("{:?}", extended_molg_atom_map); + + for (node_id, atom_node) in extended_molg_atom_map { + println!("{:?} Inv:{:?} Ord:{:?}", node_id, atom_node.inv, order_map_vert_atom.get(node_id).unwrap()) + } + break; + } // Naive way of stopping if count > mol_graph.node_count() { @@ -451,7 +505,8 @@ fn invariant_dag_vert( extended_molg_atom_map: &HashMap, dag_level_list: &HashMap>, max_level: u32, - bottom: bool) { + bottom: bool, + initial: bool) { // top-down or bottom-up calculation of invariants for each vertex in DAG let mut curr_lvl_range = if bottom {max_level} else {0}; loop { @@ -461,20 +516,27 @@ fn invariant_dag_vert( let atom_idx_for_vert = DAG[*vert].atom_idx; let atom_node = extended_molg_atom_map.get(&atom_idx_for_vert).unwrap(); let (atom_color, atom_inv) = (atom_node.color, atom_node.inv); - let mut vert_order = format!("{}{}", atom_color, atom_inv); + let vert_inv = DAG[*vert].inv; + let mut vert_order; let mut child_inv_set: Vec = Vec::new(); - + vert_order = if initial { format!("{}{}", atom_color, atom_inv) } else { format!("{}{}", atom_color, vert_inv) }; if bottom { + // vert_order = format!("{}{}", atom_color, atom_inv); for vert_neigh in DAG.neighbors_directed(*vert, Outgoing) { child_inv_set.push(DAG[vert_neigh].inv); } } else { + // vert_order = format!("{}{}", atom_color, vert_inv); for vert_neigh in DAG.neighbors_directed(*vert, Incoming) { child_inv_set.push(DAG[vert_neigh].inv); } } + while child_inv_set.len() < 10 { + child_inv_set.push(0); + } + child_inv_set.sort(); child_inv_set.reverse(); child_inv_set.iter().for_each(|val| vert_order.push_str(&format!("{}", *val))); From 90bbd8d1e11f6dc86c444a34c2547a4b8b8a1b3a Mon Sep 17 00:00:00 2001 From: Devendra Parkar Date: Wed, 5 Feb 2025 17:50:48 -0700 Subject: [PATCH 13/13] Augment Molecule for double and triple bonds --- src/canonize.rs | 70 ++++++++++++++++++++++++++++++++++--------------- src/molecule.rs | 10 +++++++ 2 files changed, 59 insertions(+), 21 deletions(-) diff --git a/src/canonize.rs b/src/canonize.rs index d7eeaf3b..a1f30c46 100644 --- a/src/canonize.rs +++ b/src/canonize.rs @@ -2,7 +2,7 @@ use std::collections::{btree_set::Range, HashMap, HashSet, VecDeque}; use lexical_sort::{cmp, lexical_cmp, lexical_only_alnum_cmp, natural_cmp, StringSort}; use crate::molecule::{Atom, Bond, Molecule, Element}; -use petgraph::{adj::Neighbors, dot::{Config, Dot}, graph::NodeIndex, visit::DfsPostOrder, Directed, Direction::{self, Incoming, Outgoing}, Graph, Undirected}; +use petgraph::{adj::Neighbors, dot::{Config, Dot}, graph::{EdgeIndex, NodeIndex}, visit::DfsPostOrder, Directed, Direction::{self, Incoming, Outgoing}, Graph, Undirected}; #[derive(Debug, Clone)] struct DAGVert { @@ -40,12 +40,37 @@ impl MolAtomNode { } // Compute the assembly index of a molecule -pub fn canonize(m: &Molecule) -> String { - let mol_graph = m.get_graph(); +pub fn canonize(molecule: &Molecule) -> String { + let mut m = molecule.clone(); + let mut mol_graph = m.get_mut_graph(); + + // Update the molecule so that canonization works to handle bond types + let bonds_iter = mol_graph.edge_indices().clone(); + for bond in bonds_iter { + let bond_type = mol_graph[bond]; + let (start_atom, end_atom) = mol_graph.edge_endpoints(bond).unwrap(); + match bond_type { + Bond::Single => {}, + Bond::Double => { // Handle Triple bonds as well + // Add an atom of type Double for the bond + let new_node_id = mol_graph.add_node(Atom::new(Element::Double)); + + mol_graph.add_edge(start_atom, new_node_id, Bond::Single); + mol_graph.add_edge(new_node_id, end_atom, Bond::Single); + mol_graph.remove_edge(bond); + }, + } + } - // /* - // Dummy Molecule for testing + // println!("Og Molecule:"); + // println!("{:?}", molecule); + + // println!("Upended Molecule:"); + // println!("{:?}", m); + // return "".to_string(); + // Dummy Molecule for testing + /* let mut mol_graph: Graph = Graph::::new_undirected(); let mut vec_nodes: Vec = Vec::new(); @@ -82,11 +107,11 @@ pub fn canonize(m: &Molecule) -> String { mol_graph.add_edge(vec_nodes[6],vec_nodes[5],Bond::Single); mol_graph.add_edge(vec_nodes[8],vec_nodes[7],Bond::Single); mol_graph.add_edge(vec_nodes[10],vec_nodes[9],Bond::Single); - // */ + */ let mut max_string = String::new(); for root in mol_graph.node_indices() { - let root = vec_nodes[1]; + // let root = vec_nodes[1]; // for each node in the molecule graph create a signature /* 1. create a DAG from each start node @@ -252,15 +277,15 @@ pub fn canonize(m: &Molecule) -> String { // get the canonized string for current root atom let canon_string = canonize_signature(&mol_graph, &mut DAG, &mut extended_molg_atom_map, &dag_level_list, &mol_g_dag_vertex_map, max_level, 1, "".to_string()); - println!("{:?}", canon_string); + // println!("{:?}", canon_string); - break; + // break; // lexico-compare strings to save the max one. if lexical_cmp(&max_string, &canon_string).is_lt() { max_string = canon_string } - break; + // break; } return max_string; } @@ -302,10 +327,9 @@ fn canonize_signature( // if multiple then use orbit with min value let min_orbit = (if (&max_orbits.len()).clone() > 1 {max_orbits.iter().min()} else {max_orbits.first()}).unwrap(); + // println!("Max Orbit: {:?}", orbits.get(&min_orbit).unwrap()); // let mut local_smax = String::new(); let mut local_smax = s_max.clone(); - - println!("Max Orbit: {:?}", orbits.get(&min_orbit).unwrap()); // recurse further for each of the atom in such a orbit and generate a canonized signature by diff. the atoms in same orbit for atom in orbits.get(&min_orbit).unwrap() { extended_molg_atom_map.entry(*atom).and_modify(|atom_node| atom_node.color = color_c as u32); @@ -364,16 +388,21 @@ fn print_signature_string( // let child_vec = dag_childs; child_vec.sort_by(|vert_a, vert_b| DAG[*vert_b].inv.cmp(&DAG[*vert_a].inv)); - print_sign.push('('); + let mut sub_print_sign = String::new(); + for child in child_vec { if let Some(_edge) = edges.iter().find(|egde| (egde.0 == vertex) && (egde.1 == child)) {} else { // if the edge is not already seen then add it to seen and generate signature-string for the child edges.push((vertex, child)); - print_sign.push_str(&print_signature_string(child, &DAG, &mol_graph, &extended_molg_atom_map, edges)); + sub_print_sign.push_str(&print_signature_string(child, &DAG, &mol_graph, &extended_molg_atom_map, edges)); } } - print_sign.push(')'); + if sub_print_sign.len() > 0 { + print_sign.push('('); + print_sign.push_str(&sub_print_sign); + print_sign.push(')'); + } return print_sign; } } @@ -476,15 +505,14 @@ fn invariant_atom( // compare the no. of invariants of all the atoms with the one's they started from if start_inv_atoms == end_inv_atoms { - println!("breaking out because no. of invariant values don't change"); - - println!("Atom Invariants"); + // println!("breaking out because no. of invariant values don't change"); + // println!("Atom Invariants"); // println!("{:?}", Dot::with_config(&*DAG, &[Config::EdgeNoLabel])); // println!("{:?}", extended_molg_atom_map); - for (node_id, atom_node) in extended_molg_atom_map { - println!("{:?} Inv:{:?} Ord:{:?}", node_id, atom_node.inv, order_map_vert_atom.get(node_id).unwrap()) - } + // for (node_id, atom_node) in extended_molg_atom_map { + // println!("{:?} Inv:{:?} Ord:{:?}", node_id, atom_node.inv, order_map_vert_atom.get(node_id).unwrap()) + // } break; } diff --git a/src/molecule.rs b/src/molecule.rs index 92d869a4..c81dd7dd 100644 --- a/src/molecule.rs +++ b/src/molecule.rs @@ -21,6 +21,10 @@ pub enum Element { Carbon, Nitrogen, Oxygen, + + // Adding New Elements to handle bond type expansion of molecule graph + Double, + Triple, } #[derive(Debug, Copy, Clone, PartialEq, Eq)] @@ -53,6 +57,8 @@ impl Atom { Element::Nitrogen => "N", Element::Carbon => "C", Element::Hydrogen => "H", + Element::Double => "2", + Element::Triple => "3", }).to_string() } } @@ -238,6 +244,10 @@ impl Molecule { &self.graph } + pub fn get_mut_graph(&mut self) -> &mut MGraph { + &mut self.graph + } + pub fn single_bond() -> Self { let mut g = Graph::default(); let u = g.add_node(Atom {