From d80d7341aee718851baf799cb2b54fa357a89af3 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Fri, 25 Jul 2025 15:34:59 -0700 Subject: [PATCH 01/51] Add clique reduction --- src/lib.rs | 1 + src/reductions.rs | 96 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+) create mode 100644 src/reductions.rs diff --git a/src/lib.rs b/src/lib.rs index a8f8c8c2..4310afda 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -52,6 +52,7 @@ pub mod canonize; pub mod enumerate; pub mod kernels; pub mod memoize; +pub mod reductions; mod vf3; // Python wrapper. diff --git a/src/reductions.rs b/src/reductions.rs new file mode 100644 index 00000000..5ed0da68 --- /dev/null +++ b/src/reductions.rs @@ -0,0 +1,96 @@ +use bit_set::BitSet; + +pub struct CompatGraph { + graph: Vec, + weights: Vec, + matches: Vec<(BitSet, BitSet)> +} + +impl CompatGraph { + pub fn new(init_matches: Vec<(BitSet, BitSet)>) -> Self { + let size = init_matches.len(); + + // Initialize weights and empty graph + let mut init_graph: Vec = Vec::with_capacity(size); + let mut init_weights: Vec = Vec::with_capacity(size); + for m in init_matches.iter() { + init_graph.push(BitSet::with_capacity(size)); + init_weights.push(m.0.len() - 1); + } + + // Populate graph + for (idx1, (h1, h2)) in init_matches.iter().enumerate() { + for (idx2, (h1p, h2p)) in init_matches[idx1 + 1..].iter().enumerate() { + let idx2 = idx2 + idx1 + 1; + + let forward_compatible = { + h2.is_disjoint(h1p) && + h2.is_disjoint(h2p) && + (h1.is_disjoint(h1p) || h1.is_superset(h1p)) && + (h1.is_disjoint(h2p) || h1.is_superset(h2p)) + }; + + if forward_compatible { + init_graph[idx1].insert(idx2); + init_graph[idx2].insert(idx1); + } + } + } + + Self { + graph: init_graph, + weights: init_weights, + matches: init_matches, + } + } + + pub fn weight(&self, v: usize) -> usize { + self.weights[v] + } + + pub fn matches(&self, v: usize) -> &(BitSet, BitSet) { + &self.matches[v] + } + + pub fn len(&self) -> usize { + self.graph.len() + } + + pub fn degree(&self, v: usize, subgraph: &BitSet) -> usize { + self.graph[v].intersection(subgraph).count() + } + + pub fn compatible_with(&self, v: usize) -> &BitSet { + &self.graph[v] + } + + pub fn neighbors(&self, v: usize, subgraph: &BitSet) -> BitSet { + let mut neighbors = self.graph[v].clone(); + neighbors.intersect_with(subgraph); + + neighbors + } + + pub fn forward_neighbors(&self, v: usize, subgraph: &BitSet) -> BitSet { + let mut neighbors = self.graph[v].clone(); + neighbors.intersect_with(subgraph); + let mut to_remove = vec![]; + for u in neighbors.iter() { + if u <= v { + to_remove.push(u); + } + if u > v { + break; + } + } + for u in to_remove { + neighbors.remove(u); + } + + neighbors + } + + pub fn are_adjacent(&self, v: usize, u: usize) -> bool { + self.graph[v].contains(u) + } +} \ No newline at end of file From 5ebfad0e4efc8a631f936c8ff759bd7fcf72605a Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Mon, 28 Jul 2025 14:08:12 -0700 Subject: [PATCH 02/51] Implement subgraph and new loop in index search --- src/assembly.rs | 39 ++++++++++++++++++++++++++++++--------- src/reductions.rs | 14 ++++++++++---- 2 files changed, 40 insertions(+), 13 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 79aa3f9a..75a4908a 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -38,6 +38,7 @@ use crate::{ memoize::{Cache, MemoizeMode}, molecule::Molecule, utils::connected_components_under_edges, + reductions::CompatGraph, }; /// Parallelization strategy for the recursive search phase. @@ -217,7 +218,9 @@ fn fragments(mol: &Molecule, state: &[BitSet], h1: &BitSet, h2: &BitSet) -> Opti #[allow(clippy::too_many_arguments)] pub fn recurse_index_search( mol: &Molecule, - matches: &[(BitSet, BitSet)], + matches: &Vec<(BitSet, BitSet)>, + graph: &CompatGraph, + subgraph: BitSet, removal_order: Vec, state: &[BitSet], state_index: usize, @@ -248,7 +251,8 @@ pub fn recurse_index_search( // Define a closure that handles recursing to a new assembly state based on // the given (enumerated) pair of non-overlapping isomorphic subgraphs. - let recurse_on_match = |i: usize, h1: &BitSet, h2: &BitSet| { + let recurse_on_match = |i: usize| { + let (h1, h2) = &matches[i]; if let Some(fragments) = fragments(mol, state, h1, h2) { // If using depth-one parallelism, all descendant states should be // computed serially. @@ -258,10 +262,20 @@ pub fn recurse_index_search( parallel_mode }; + // TODO: should create a method for updating subgraph + let mut sub_clone = subgraph.clone(); + let mut node = subgraph.iter().next().unwrap(); + while node <= i { + sub_clone.remove(node); + node += 1; + } + // Recurse using the remaining matches and updated fragments. let (child_index, child_states_searched) = recurse_index_search( mol, - &matches[i + 1..], + matches, + graph, + sub_clone, { let mut clone = removal_order.clone(); clone.push(i); @@ -286,15 +300,15 @@ pub fn recurse_index_search( // Use the iterator type corresponding to the specified parallelism mode. if parallel_mode == ParallelMode::None { - matches + subgraph .iter() - .enumerate() - .for_each(|(i, (h1, h2))| recurse_on_match(i, h1, h2)); + .for_each(|i| recurse_on_match(i)); } else { - matches + subgraph + .iter() + .collect::>() .par_iter() - .enumerate() - .for_each(|(i, (h1, h2))| recurse_on_match(i, h1, h2)); + .for_each(|i | recurse_on_match(*i)); } ( @@ -390,12 +404,19 @@ pub fn index_search( let mut init = BitSet::new(); init.extend(mol.graph().edge_indices().map(|ix| ix.index())); + let mut subgraph = BitSet::with_capacity(matches.len()); + for i in 0..matches.len() { + subgraph.insert(i); + } + // Search for the shortest assembly pathway recursively. let edge_count = mol.graph().edge_count(); let best_index = Arc::new(AtomicUsize::from(edge_count - 1)); let (index, states_searched) = recurse_index_search( mol, &matches, + &CompatGraph::new(), + subgraph, Vec::new(), &[init], edge_count - 1, diff --git a/src/reductions.rs b/src/reductions.rs index 5ed0da68..6bca30b8 100644 --- a/src/reductions.rs +++ b/src/reductions.rs @@ -2,12 +2,18 @@ use bit_set::BitSet; pub struct CompatGraph { graph: Vec, - weights: Vec, - matches: Vec<(BitSet, BitSet)> + nodes: Vec, } impl CompatGraph { - pub fn new(init_matches: Vec<(BitSet, BitSet)>) -> Self { + pub fn new () -> Self { + Self { + graph: Vec::new(), + nodes: Vec::new(), + } + } + + /*pub fn new(init_matches: Vec<(BitSet, BitSet)>) -> Self { let size = init_matches.len(); // Initialize weights and empty graph @@ -92,5 +98,5 @@ impl CompatGraph { pub fn are_adjacent(&self, v: usize, u: usize) -> bool { self.graph[v].contains(u) - } + }*/ } \ No newline at end of file From 6cb9fe18d6f8c9e0dc4ffa1e8ba89adcff3d5034 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Mon, 28 Jul 2025 15:10:12 -0700 Subject: [PATCH 03/51] Implement using CompatGraph to choose next match --- src/assembly.rs | 22 +++++++++++----------- src/reductions.rs | 20 ++++---------------- 2 files changed, 15 insertions(+), 27 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 75a4908a..67a8fcfd 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -251,8 +251,8 @@ pub fn recurse_index_search( // Define a closure that handles recursing to a new assembly state based on // the given (enumerated) pair of non-overlapping isomorphic subgraphs. - let recurse_on_match = |i: usize| { - let (h1, h2) = &matches[i]; + let recurse_on_match = |v: usize| { + let (h1, h2) = &matches[v]; if let Some(fragments) = fragments(mol, state, h1, h2) { // If using depth-one parallelism, all descendant states should be // computed serially. @@ -263,12 +263,12 @@ pub fn recurse_index_search( }; // TODO: should create a method for updating subgraph - let mut sub_clone = subgraph.clone(); - let mut node = subgraph.iter().next().unwrap(); - while node <= i { - sub_clone.remove(node); - node += 1; - } + let mut sub_clone = BitSet::with_capacity(matches.len()); + /*let mut node = subgraph.iter().next().unwrap(); + for j in (i+1)..matches.len() { + sub_clone.insert(j); + }*/ + sub_clone.intersect_with(&graph.forward_neighbors(v, &subgraph)); // Recurse using the remaining matches and updated fragments. let (child_index, child_states_searched) = recurse_index_search( @@ -302,13 +302,13 @@ pub fn recurse_index_search( if parallel_mode == ParallelMode::None { subgraph .iter() - .for_each(|i| recurse_on_match(i)); + .for_each(|v| recurse_on_match(v)); } else { subgraph .iter() .collect::>() .par_iter() - .for_each(|i | recurse_on_match(*i)); + .for_each(|v | recurse_on_match(*v)); } ( @@ -415,7 +415,7 @@ pub fn index_search( let (index, states_searched) = recurse_index_search( mol, &matches, - &CompatGraph::new(), + &CompatGraph::new(&matches), subgraph, Vec::new(), &[init], diff --git a/src/reductions.rs b/src/reductions.rs index 6bca30b8..09862c05 100644 --- a/src/reductions.rs +++ b/src/reductions.rs @@ -2,26 +2,16 @@ use bit_set::BitSet; pub struct CompatGraph { graph: Vec, - nodes: Vec, } impl CompatGraph { - pub fn new () -> Self { - Self { - graph: Vec::new(), - nodes: Vec::new(), - } - } - - /*pub fn new(init_matches: Vec<(BitSet, BitSet)>) -> Self { + pub fn new(init_matches: &Vec<(BitSet, BitSet)>) -> Self { let size = init_matches.len(); // Initialize weights and empty graph let mut init_graph: Vec = Vec::with_capacity(size); - let mut init_weights: Vec = Vec::with_capacity(size); for m in init_matches.iter() { init_graph.push(BitSet::with_capacity(size)); - init_weights.push(m.0.len() - 1); } // Populate graph @@ -45,18 +35,16 @@ impl CompatGraph { Self { graph: init_graph, - weights: init_weights, - matches: init_matches, } } - pub fn weight(&self, v: usize) -> usize { + /*pub fn weight(&self, v: usize) -> usize { self.weights[v] } pub fn matches(&self, v: usize) -> &(BitSet, BitSet) { &self.matches[v] - } + }*/ pub fn len(&self) -> usize { self.graph.len() @@ -98,5 +86,5 @@ impl CompatGraph { pub fn are_adjacent(&self, v: usize, u: usize) -> bool { self.graph[v].contains(u) - }*/ + } } \ No newline at end of file From 6ccb952d764dd157b4ae445342e74d90cdff0d13 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Mon, 28 Jul 2025 15:24:20 -0700 Subject: [PATCH 04/51] Add CLI argument for clique --- src/assembly.rs | 32 +++++++++++++++++++++++--------- src/main.rs | 4 ++++ src/python.rs | 2 ++ src/reductions.rs | 2 +- 4 files changed, 30 insertions(+), 10 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 67a8fcfd..fde929aa 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -28,7 +28,7 @@ use std::{ use bit_set::BitSet; use clap::ValueEnum; -use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator}; +use rayon::iter::{IntoParallelRefIterator, ParallelIterator}; use crate::{ bounds::{bound_exceeded, Bound}, @@ -219,7 +219,7 @@ fn fragments(mol: &Molecule, state: &[BitSet], h1: &BitSet, h2: &BitSet) -> Opti pub fn recurse_index_search( mol: &Molecule, matches: &Vec<(BitSet, BitSet)>, - graph: &CompatGraph, + graph: &Option, subgraph: BitSet, removal_order: Vec, state: &[BitSet], @@ -262,13 +262,16 @@ pub fn recurse_index_search( parallel_mode }; - // TODO: should create a method for updating subgraph + // Update subgraph let mut sub_clone = BitSet::with_capacity(matches.len()); - /*let mut node = subgraph.iter().next().unwrap(); - for j in (i+1)..matches.len() { - sub_clone.insert(j); - }*/ - sub_clone.intersect_with(&graph.forward_neighbors(v, &subgraph)); + if let Some(g) = graph { + sub_clone.intersect_with(&g.forward_neighbors(v, &subgraph)); + } + else { + for j in (v+1)..matches.len() { + sub_clone.insert(j); + } + } // Recurse using the remaining matches and updated fragments. let (child_index, child_states_searched) = recurse_index_search( @@ -388,6 +391,7 @@ pub fn index_search( memoize_mode: MemoizeMode, kernel_mode: KernelMode, bounds: &[Bound], + clique: bool, ) -> (u32, u32, usize) { // Catch not-yet-implemented modes. if kernel_mode != KernelMode::None { @@ -400,6 +404,15 @@ pub fn index_search( // Create memoization cache. let mut cache = Cache::new(memoize_mode, canonize_mode); + let graph = { + if clique { + Some(CompatGraph::new(&matches)) + } + else { + None + } + }; + // Initialize the first fragment as the entire graph. let mut init = BitSet::new(); init.extend(mol.graph().edge_indices().map(|ix| ix.index())); @@ -415,7 +428,7 @@ pub fn index_search( let (index, states_searched) = recurse_index_search( mol, &matches, - &CompatGraph::new(&matches), + &graph, subgraph, Vec::new(), &[init], @@ -458,6 +471,7 @@ pub fn index(mol: &Molecule) -> u32 { MemoizeMode::CanonIndex, KernelMode::None, &[Bound::Int, Bound::VecSimple, Bound::VecSmallFrags], + false, ) .0 } diff --git a/src/main.rs b/src/main.rs index 6df6f4e1..9fec220e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -55,6 +55,9 @@ struct Cli { /// Strategy for performing graph kernelization during the search phase. #[arg(long, value_enum, default_value_t = KernelMode::None)] kernel: KernelMode, + + #[arg(long)] + clique: bool, } #[derive(Args, Debug)] @@ -116,6 +119,7 @@ fn main() -> Result<()> { cli.memoize, cli.kernel, boundlist, + cli.clique, ); // Print final output, depending on --verbose. diff --git a/src/python.rs b/src/python.rs index df74f5f2..e224e0c0 100644 --- a/src/python.rs +++ b/src/python.rs @@ -470,6 +470,8 @@ pub fn _index_search( memoize_mode, kernel_mode, &boundlist, + // TODO: implement clique CLI param for python interface + false, )) } diff --git a/src/reductions.rs b/src/reductions.rs index 09862c05..93519e74 100644 --- a/src/reductions.rs +++ b/src/reductions.rs @@ -10,7 +10,7 @@ impl CompatGraph { // Initialize weights and empty graph let mut init_graph: Vec = Vec::with_capacity(size); - for m in init_matches.iter() { + for _ in 0..init_matches.len() { init_graph.push(BitSet::with_capacity(size)); } From f8f043774779735289a86e444bd82fc49e8dd2de Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Mon, 28 Jul 2025 15:29:34 -0700 Subject: [PATCH 05/51] Fix search with clique --- src/assembly.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/assembly.rs b/src/assembly.rs index fde929aa..e3743f23 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -263,11 +263,13 @@ pub fn recurse_index_search( }; // Update subgraph - let mut sub_clone = BitSet::with_capacity(matches.len()); + let mut sub_clone; if let Some(g) = graph { + sub_clone = subgraph.clone(); sub_clone.intersect_with(&g.forward_neighbors(v, &subgraph)); } else { + sub_clone = BitSet::with_capacity(matches.len()); for j in (v+1)..matches.len() { sub_clone.insert(j); } From 39e91db5b31fe30a634e34bef6eea4f996ff2869 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Mon, 28 Jul 2025 15:48:42 -0700 Subject: [PATCH 06/51] Add clique budget bound --- src/assembly.rs | 2 ++ src/bounds.rs | 73 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 74 insertions(+), 1 deletion(-) diff --git a/src/assembly.rs b/src/assembly.rs index e3743f23..1022fdb9 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -234,7 +234,9 @@ pub fn recurse_index_search( // enabled and this assembly state is preempted by the cached state, halt. if bound_exceeded( mol, + matches, state, + &subgraph, state_index, best_index.load(Relaxed), largest_remove, diff --git a/src/bounds.rs b/src/bounds.rs index 5cff5ec5..296ae4cf 100644 --- a/src/bounds.rs +++ b/src/bounds.rs @@ -14,7 +14,10 @@ use bit_set::BitSet; use clap::ValueEnum; -use crate::molecule::{Bond, Element, Molecule}; +use crate::{ + molecule::{Bond, Element, Molecule}, + reductions::CompatGraph, +}; /// Type of upper bound on the "savings" possible from an assembly state. #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)] @@ -66,7 +69,9 @@ struct EdgeType { /// Returns `true` iff any of the given bounds would prune this assembly state. pub fn bound_exceeded( mol: &Molecule, + matches: &Vec<(BitSet, BitSet)>, fragments: &[BitSet], + subgraph: &BitSet, state_index: usize, best_index: usize, largest_remove: usize, @@ -82,6 +87,7 @@ pub fn bound_exceeded( Bound::VecSmallFrags => { state_index - vec_small_frags_bound(fragments, largest_remove, mol) >= best_index } + Bound::CliqueBudget => best_index + clique_budget_bound(matches, subgraph, fragments) <= state_index, _ => { panic!("One of the chosen bounds is not implemented yet!") } @@ -239,3 +245,68 @@ fn vec_small_frags_bound(fragments: &[BitSet], m: usize, mol: &Molecule) -> usiz s - (z + size_two_types.len() + size_two_fragments.len()) - ((sl - z) as f32 / m as f32).ceil() as usize } + +pub fn clique_budget_bound(matches: &Vec<(BitSet, BitSet)>, subgraph: &BitSet, fragments: &[BitSet]) -> usize { + let total_bonds = fragments.iter().map(|x| x.len()).sum::(); + let mut bound = 0; + let sizes = { + let mut vec = vec![]; + let mut prev = 0; + for s in subgraph.iter().map(|v| matches[v].0.len()) { + if s != prev { + vec.push(s); + prev = s; + } + } + + vec + }; + + for i in sizes { + let mut bound_temp = 0; + let mut has_bonds = fragments.len(); + let mut num_bonds: Vec = fragments.iter().map(|x| x.len()).collect(); + let mut smallest_remove = i; + + for v in subgraph.iter() { + if has_bonds == 0 { + break; + } + if matches[v].0.len() > i { + continue; + } + + + let dup = &matches[v]; + let bond = dup.1.iter().next().unwrap(); + let mut j = 0; + while !fragments[j].contains(bond) { + j += 1; + } + + if num_bonds[j] > 0 { + let remove = std::cmp::min(dup.0.len(), num_bonds[j]); + bound_temp += 1; + num_bonds[j] -= remove; + smallest_remove = std::cmp::min(smallest_remove, remove); + + if num_bonds[j] == 0 { + has_bonds -= 1; + } + } + } + + let leftover = num_bonds.iter().sum::(); + let log = { + if leftover > 0 { + 0 + } + else { + (smallest_remove as f32).log2().ceil() as usize + } + }; + bound = std::cmp::max(bound, total_bonds - bound_temp - leftover - log); + } + + bound +} From a981ef41f86feecd9b8a364b4359ebf9f6ba4014 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Mon, 28 Jul 2025 16:01:22 -0700 Subject: [PATCH 07/51] Add default clique bounds. Only allow clique bounds with clique --- src/main.rs | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/src/main.rs b/src/main.rs index 9fec220e..c71f7ede 100644 --- a/src/main.rs +++ b/src/main.rs @@ -96,20 +96,29 @@ fn main() -> Result<()> { } // Handle bounding strategy CLI arguments. - let boundlist: &[Bound] = match cli.boundsgroup { - // By default, use a combination of the integer and vector bounds. - None => &[Bound::Int, Bound::VecSimple, Bound::VecSmallFrags], + let boundlist: &[Bound] = match (cli.boundsgroup, cli.clique) { + // If clique not enable, default to a combination of the integer and vector bounds. + (None, false) => &[Bound::Int, Bound::VecSimple, Bound::VecSmallFrags], + // If clique is enabled, default to combination of int, vec, and clique bounds. + (None, true) => &[Bound::Int, Bound::VecSimple, Bound::VecSmallFrags, Bound::CliqueBudget], // If --no-bounds is set, do not use any bounds. - Some(BoundsGroup { + (Some(BoundsGroup { no_bounds: true, .. - }) => &[], + }), _) => &[], // Otherwise, use the bounds that were specified. - Some(BoundsGroup { + (Some(BoundsGroup { no_bounds: false, bounds, - }) => &bounds.clone(), + }), _) => &bounds.clone(), }; + let clique_bounds = vec![Bound::CliqueBudget, Bound::CoverNoSort, Bound::CoverSort]; + for bound in clique_bounds { + if !cli.clique && boundlist.contains(&bound) { + panic!("Bound {:?} can not be used without clique enabled", bound); + } + } + // Call index calculation with all the various options. let (index, num_matches, states_searched) = index_search( &mol, From 2bac102808a4771678143e36e08a18daae460fb2 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Mon, 28 Jul 2025 16:13:18 -0700 Subject: [PATCH 08/51] Add cover bound --- src/assembly.rs | 1 + src/bounds.rs | 89 +++++++++++++++++++++++++++++++++++++++++++++++-- src/main.rs | 2 +- 3 files changed, 89 insertions(+), 3 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 1022fdb9..07b8f4e6 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -235,6 +235,7 @@ pub fn recurse_index_search( if bound_exceeded( mol, matches, + graph, state, &subgraph, state_index, diff --git a/src/bounds.rs b/src/bounds.rs index 296ae4cf..73307cc4 100644 --- a/src/bounds.rs +++ b/src/bounds.rs @@ -70,6 +70,7 @@ struct EdgeType { pub fn bound_exceeded( mol: &Molecule, matches: &Vec<(BitSet, BitSet)>, + graph: &Option, fragments: &[BitSet], subgraph: &BitSet, state_index: usize, @@ -88,8 +89,21 @@ pub fn bound_exceeded( state_index - vec_small_frags_bound(fragments, largest_remove, mol) >= best_index } Bound::CliqueBudget => best_index + clique_budget_bound(matches, subgraph, fragments) <= state_index, - _ => { - panic!("One of the chosen bounds is not implemented yet!") + Bound::CoverNoSort => { + if let Some(g) = graph { + best_index + cover_bound(matches, g, subgraph, false) <= state_index + } + else { + false + } + } + Bound::CoverSort => { + if let Some(g) = graph { + best_index + cover_bound(matches, g, subgraph, true) <= state_index + } + else { + false + } } }; if exceeds { @@ -310,3 +324,74 @@ pub fn clique_budget_bound(matches: &Vec<(BitSet, BitSet)>, subgraph: &BitSet, f bound } + +pub fn cover_bound(matches: &Vec<(BitSet, BitSet)>, graph: &CompatGraph, subgraph: &BitSet, sort: bool) -> usize { + // Sort vertices + if sort { + let mut vertices: Vec<(usize, usize)> = Vec::with_capacity(subgraph.len()); + for v in subgraph { + vertices.push((v, graph.degree(v, subgraph))); + } + vertices.sort_by(|a, b| b.1.cmp(&a.1)); + cover_bound_helper(matches, graph, subgraph, vertices.iter().map(|(v, _)| *v)) + } + else { + let vertices = (0..graph.len()).rev().filter(|v| subgraph.contains(*v)); + cover_bound_helper(matches, graph, subgraph, vertices) + } +} + +fn cover_bound_helper(matches: &Vec<(BitSet, BitSet)>, graph: &CompatGraph, subgraph: &BitSet, iter: impl Iterator) -> usize { + let mut colors: Vec>> = vec![None; graph.len()]; + let mut col_weights = vec![]; + let mut num_col = 0; + + for v in iter { + let mut v_col = Vec::new(); + let mut used = vec![0; num_col]; + + // Find colors used in neighborhood of v + for u in subgraph.intersection(graph.compatible_with(v)) { + let Some(u_col) = &colors[u] else { + continue; + }; + + for c in u_col { + used[*c] = 1; + } + } + + let mut total_weight = 0; + let v_val = matches[v].0.len() - 1; + // Find colors to give to v + for c in 0..num_col { + if used[c] == 1 { + continue; + } + + v_col.push(c); + total_weight += col_weights[c]; + + if total_weight >= v_val { + break; + } + } + + if total_weight == 0 { + v_col.push(num_col); + col_weights.push(v_val); + num_col += 1 + } + else if total_weight < v_val { + let mut k = num_col - 1; + while used[k] == 1 { + k -= 1 + } + col_weights[k] += v_val - total_weight + } + + colors[v] = Some(v_col); + }; + + col_weights.iter().sum() +} diff --git a/src/main.rs b/src/main.rs index c71f7ede..46881e12 100644 --- a/src/main.rs +++ b/src/main.rs @@ -100,7 +100,7 @@ fn main() -> Result<()> { // If clique not enable, default to a combination of the integer and vector bounds. (None, false) => &[Bound::Int, Bound::VecSimple, Bound::VecSmallFrags], // If clique is enabled, default to combination of int, vec, and clique bounds. - (None, true) => &[Bound::Int, Bound::VecSimple, Bound::VecSmallFrags, Bound::CliqueBudget], + (None, true) => &[Bound::Int, Bound::VecSimple, Bound::VecSmallFrags, Bound::CliqueBudget, Bound::CoverNoSort], // If --no-bounds is set, do not use any bounds. (Some(BoundsGroup { no_bounds: true, .. From fe7b47ac52a2b47234aa98e8c0be8301cab541bb Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Mon, 28 Jul 2025 16:35:23 -0700 Subject: [PATCH 09/51] Add deletion kernel. Not fully implemented yet --- src/kernels.rs | 40 ++++++++++++++++++++++++++++++++++++++++ src/reductions.rs | 8 -------- 2 files changed, 40 insertions(+), 8 deletions(-) diff --git a/src/kernels.rs b/src/kernels.rs index 5b1f1292..0e65e994 100644 --- a/src/kernels.rs +++ b/src/kernels.rs @@ -12,6 +12,8 @@ //! the equivalent problem of weighted independent set.) use clap::ValueEnum; +use bit_set::BitSet; +use crate::reductions::CompatGraph; /// Graph kernelization strategy when searching using the clique reduction. #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)] @@ -27,3 +29,41 @@ pub enum KernelMode { /// Apply kernels after every fragmentation step. Always, } + +pub fn deletion_kernel(matches: &Vec<(BitSet, BitSet)>, g: &CompatGraph, mut subgraph: BitSet) -> BitSet{ + let subgraph_copy = subgraph.clone(); + + for v in subgraph_copy.iter() { + let v_val = matches[v].0.len(); + let neighbors_v = g.neighbors(v, &subgraph); + + let Some(w1) = neighbors_v.iter().next() else { + continue; + }; + let Some(w2) = neighbors_v.iter().last() else { + continue; + }; + + let mut s = subgraph.clone(); + s.intersect_with(&g.compatible_with(w1)); + for u in s.intersection(&&g.compatible_with(w2)) { + if g.are_adjacent(v, u) || v == u { + continue; + } + + let u_val = matches[u].0.len(); + if v_val > u_val { + continue; + } + + let neighbors_u = g.neighbors(u, &subgraph); + + if neighbors_v.is_subset(&neighbors_u) { + subgraph.remove(v); + break; + } + } + } + + subgraph +} \ No newline at end of file diff --git a/src/reductions.rs b/src/reductions.rs index 93519e74..184b3df0 100644 --- a/src/reductions.rs +++ b/src/reductions.rs @@ -38,14 +38,6 @@ impl CompatGraph { } } - /*pub fn weight(&self, v: usize) -> usize { - self.weights[v] - } - - pub fn matches(&self, v: usize) -> &(BitSet, BitSet) { - &self.matches[v] - }*/ - pub fn len(&self) -> usize { self.graph.len() } From e94d09ced1b0a89396dcf6f7ab7c2a68fdc321ad Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Mon, 28 Jul 2025 16:57:05 -0700 Subject: [PATCH 10/51] Add support for kernel modes. Implement deletion kernel --- src/assembly.rs | 26 +++++++++++++++++++++----- src/main.rs | 3 +++ 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 07b8f4e6..c796757c 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -34,7 +34,7 @@ use crate::{ bounds::{bound_exceeded, Bound}, canonize::{canonize, CanonizeMode, Labeling}, enumerate::{enumerate_subgraphs, EnumerateMode}, - kernels::KernelMode, + kernels::{KernelMode, deletion_kernel}, memoize::{Cache, MemoizeMode}, molecule::Molecule, utils::connected_components_under_edges, @@ -220,7 +220,7 @@ pub fn recurse_index_search( mol: &Molecule, matches: &Vec<(BitSet, BitSet)>, graph: &Option, - subgraph: BitSet, + mut subgraph: BitSet, removal_order: Vec, state: &[BitSet], state_index: usize, @@ -229,6 +229,7 @@ pub fn recurse_index_search( bounds: &[Bound], cache: &mut Cache, parallel_mode: ParallelMode, + kernel_mode: KernelMode, ) -> (usize, usize) { // If any bounds would prune this assembly state or if memoization is // enabled and this assembly state is preempted by the cached state, halt. @@ -247,6 +248,12 @@ pub fn recurse_index_search( return (state_index, 1); } + // Apply kernels + if kernel_mode != KernelMode::None { + let g = graph.as_ref().unwrap(); + subgraph = deletion_kernel(matches, g, subgraph); + } + // Keep track of the best assembly index found in any of this assembly // state's children and the number of states searched, including this one. let best_child_index = AtomicUsize::from(state_index); @@ -265,6 +272,16 @@ pub fn recurse_index_search( parallel_mode }; + // If kernelizing once, do not kernelize again. If kernelizing at depth-one, + // kernelize once more at the beginning of each descendent. + let new_kernel = if kernel_mode == KernelMode::Once { + KernelMode::None + } else if kernel_mode == KernelMode::DepthOne { + KernelMode::Once + } else { + kernel_mode + }; + // Update subgraph let mut sub_clone; if let Some(g) = graph { @@ -296,6 +313,7 @@ pub fn recurse_index_search( bounds, &mut cache.clone(), new_parallel, + new_kernel, ); // Update the best assembly indices (across children states and @@ -399,9 +417,6 @@ pub fn index_search( clique: bool, ) -> (u32, u32, usize) { // Catch not-yet-implemented modes. - if kernel_mode != KernelMode::None { - panic!("The chosen --kernel mode is not implemented yet!") - } // Enumerate non-overlapping isomorphic subgraph pairs. let matches = matches(mol, enumerate_mode, canonize_mode); @@ -443,6 +458,7 @@ pub fn index_search( bounds, &mut cache, parallel_mode, + kernel_mode, ); (index as u32, matches.len() as u32, states_searched) diff --git a/src/main.rs b/src/main.rs index 46881e12..fa81b6aa 100644 --- a/src/main.rs +++ b/src/main.rs @@ -117,6 +117,9 @@ fn main() -> Result<()> { if !cli.clique && boundlist.contains(&bound) { panic!("Bound {:?} can not be used without clique enabled", bound); } + if !cli.clique && cli.kernel != KernelMode::None { + panic!("Cannot use kernels without clique enabled"); + } } // Call index calculation with all the various options. From 51c961dd1219dc66fc8fb8036ff5757f1e910083 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Mon, 28 Jul 2025 18:04:06 -0700 Subject: [PATCH 11/51] Add inclusion clone for serial search --- src/assembly.rs | 43 +++++++++++++++++++++++++++++++++++-------- src/kernels.rs | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 8 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index c796757c..b680561f 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -19,11 +19,10 @@ //! ``` use std::{ - collections::HashMap, - sync::{ + collections::HashMap, ops::ControlFlow, sync::{ atomic::{AtomicUsize, Ordering::Relaxed}, Arc, - }, + } }; use bit_set::BitSet; @@ -34,7 +33,7 @@ use crate::{ bounds::{bound_exceeded, Bound}, canonize::{canonize, CanonizeMode, Labeling}, enumerate::{enumerate_subgraphs, EnumerateMode}, - kernels::{KernelMode, deletion_kernel}, + kernels::{KernelMode, deletion_kernel, inclusion_kernel}, memoize::{Cache, MemoizeMode}, molecule::Molecule, utils::connected_components_under_edges, @@ -221,6 +220,7 @@ pub fn recurse_index_search( matches: &Vec<(BitSet, BitSet)>, graph: &Option, mut subgraph: BitSet, + must_include: &Option>, removal_order: Vec, state: &[BitSet], state_index: usize, @@ -249,9 +249,17 @@ pub fn recurse_index_search( } // Apply kernels + let mut must_include_clone = None; if kernel_mode != KernelMode::None { let g = graph.as_ref().unwrap(); + + // Deletion kernel subgraph = deletion_kernel(matches, g, subgraph); + + //Inclusion kernel + let mut include_temp = must_include.as_ref().unwrap().clone(); + include_temp.append(&mut inclusion_kernel(matches, g, &subgraph)); + must_include_clone = Some(include_temp); } // Keep track of the best assembly index found in any of this assembly @@ -301,6 +309,7 @@ pub fn recurse_index_search( matches, graph, sub_clone, + &must_include_clone, { let mut clone = removal_order.clone(); clone.push(i); @@ -326,15 +335,23 @@ pub fn recurse_index_search( // Use the iterator type corresponding to the specified parallelism mode. if parallel_mode == ParallelMode::None { - subgraph + let _ = subgraph .iter() - .for_each(|v| recurse_on_match(v)); + .try_for_each(|v| { + recurse_on_match(v); + if let Some(vec) = &must_include_clone { + if vec.contains(&v) { + return ControlFlow::Break(()); + } + } + ControlFlow::Continue(()) + }); } else { - subgraph + let _ = subgraph .iter() .collect::>() .par_iter() - .for_each(|v | recurse_on_match(*v)); + .for_each(|v| recurse_on_match(*v)); } ( @@ -442,6 +459,15 @@ pub fn index_search( subgraph.insert(i); } + let must_include = { + if kernel_mode != KernelMode::None { + Some(Vec::new()) + } + else { + None + } + }; + // Search for the shortest assembly pathway recursively. let edge_count = mol.graph().edge_count(); let best_index = Arc::new(AtomicUsize::from(edge_count - 1)); @@ -450,6 +476,7 @@ pub fn index_search( &matches, &graph, subgraph, + &must_include, Vec::new(), &[init], edge_count - 1, diff --git a/src/kernels.rs b/src/kernels.rs index 0e65e994..d083ef5e 100644 --- a/src/kernels.rs +++ b/src/kernels.rs @@ -66,4 +66,41 @@ pub fn deletion_kernel(matches: &Vec<(BitSet, BitSet)>, g: &CompatGraph, mut sub } subgraph +} + +pub fn inclusion_kernel(matches: &Vec<(BitSet, BitSet)>, g: &CompatGraph, subgraph: &BitSet) -> Vec { + let mut kernel = Vec::new(); + let tot = subgraph.iter().map(|v| matches[v].0.len() - 1).sum::(); + + 'outer: for v in subgraph { + let v_val = matches[v].0.len() - 1; + let neighbors_val = g.neighbors(v, subgraph).iter().map(|u| matches[u].0.len() - 1).sum::(); + if v_val >= tot - neighbors_val - v_val { + kernel.push(v); + continue; + } + + let mut neighbors: Vec = vec![]; + + for u in subgraph.difference(&g.compatible_with(v)) { + if u == v { + continue; + } + if matches[u].0.len() - 1 > v_val { + continue 'outer; + } + + for w in neighbors.iter() { + if g.are_adjacent(u, *w) { + continue 'outer; + } + } + + neighbors.push(u); + } + + kernel.push(v); + } + + kernel } \ No newline at end of file From 6aec6a9082e91c512f2141db75e38bb7e9da2bd7 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Tue, 29 Jul 2025 13:17:14 -0700 Subject: [PATCH 12/51] Change must_include_kernel return to usize --- src/assembly.rs | 33 ++++++--------------------------- src/kernels.rs | 9 +++++++-- 2 files changed, 13 insertions(+), 29 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index b680561f..5c9a0268 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -19,7 +19,7 @@ //! ``` use std::{ - collections::HashMap, ops::ControlFlow, sync::{ + collections::HashMap, sync::{ atomic::{AtomicUsize, Ordering::Relaxed}, Arc, } @@ -220,7 +220,6 @@ pub fn recurse_index_search( matches: &Vec<(BitSet, BitSet)>, graph: &Option, mut subgraph: BitSet, - must_include: &Option>, removal_order: Vec, state: &[BitSet], state_index: usize, @@ -249,7 +248,7 @@ pub fn recurse_index_search( } // Apply kernels - let mut must_include_clone = None; + let mut must_include = matches.len(); if kernel_mode != KernelMode::None { let g = graph.as_ref().unwrap(); @@ -257,9 +256,7 @@ pub fn recurse_index_search( subgraph = deletion_kernel(matches, g, subgraph); //Inclusion kernel - let mut include_temp = must_include.as_ref().unwrap().clone(); - include_temp.append(&mut inclusion_kernel(matches, g, &subgraph)); - must_include_clone = Some(include_temp); + must_include = inclusion_kernel(matches, g, &subgraph); } // Keep track of the best assembly index found in any of this assembly @@ -309,7 +306,6 @@ pub fn recurse_index_search( matches, graph, sub_clone, - &must_include_clone, { let mut clone = removal_order.clone(); clone.push(i); @@ -335,17 +331,10 @@ pub fn recurse_index_search( // Use the iterator type corresponding to the specified parallelism mode. if parallel_mode == ParallelMode::None { - let _ = subgraph + subgraph .iter() - .try_for_each(|v| { - recurse_on_match(v); - if let Some(vec) = &must_include_clone { - if vec.contains(&v) { - return ControlFlow::Break(()); - } - } - ControlFlow::Continue(()) - }); + .filter(|v| *v <= must_include) + .for_each(|v| recurse_on_match(v)); } else { let _ = subgraph .iter() @@ -459,15 +448,6 @@ pub fn index_search( subgraph.insert(i); } - let must_include = { - if kernel_mode != KernelMode::None { - Some(Vec::new()) - } - else { - None - } - }; - // Search for the shortest assembly pathway recursively. let edge_count = mol.graph().edge_count(); let best_index = Arc::new(AtomicUsize::from(edge_count - 1)); @@ -476,7 +456,6 @@ pub fn index_search( &matches, &graph, subgraph, - &must_include, Vec::new(), &[init], edge_count - 1, diff --git a/src/kernels.rs b/src/kernels.rs index d083ef5e..4b8d99d4 100644 --- a/src/kernels.rs +++ b/src/kernels.rs @@ -68,7 +68,7 @@ pub fn deletion_kernel(matches: &Vec<(BitSet, BitSet)>, g: &CompatGraph, mut sub subgraph } -pub fn inclusion_kernel(matches: &Vec<(BitSet, BitSet)>, g: &CompatGraph, subgraph: &BitSet) -> Vec { +pub fn inclusion_kernel(matches: &Vec<(BitSet, BitSet)>, g: &CompatGraph, subgraph: &BitSet) -> usize { let mut kernel = Vec::new(); let tot = subgraph.iter().map(|v| matches[v].0.len() - 1).sum::(); @@ -102,5 +102,10 @@ pub fn inclusion_kernel(matches: &Vec<(BitSet, BitSet)>, g: &CompatGraph, subgra kernel.push(v); } - kernel + if let Some(x) = kernel.iter().min() { + *x + } + else { + matches.len() + } } \ No newline at end of file From 82c6701752a7fc4c8208505f2a1ab3df3ff303cd Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Tue, 29 Jul 2025 13:29:58 -0700 Subject: [PATCH 13/51] Implement inclusion kernel for parallel modes --- src/assembly.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/assembly.rs b/src/assembly.rs index 5c9a0268..ee42483b 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -338,6 +338,7 @@ pub fn recurse_index_search( } else { let _ = subgraph .iter() + .filter(|v| *v <= must_include) .collect::>() .par_iter() .for_each(|v| recurse_on_match(*v)); From a4684138749ed8d66e5f231ab647323d6f96d427 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Tue, 29 Jul 2025 13:34:41 -0700 Subject: [PATCH 14/51] Optimize largest_remove computation --- src/assembly.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index ee42483b..c3be6a40 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -224,12 +224,20 @@ pub fn recurse_index_search( state: &[BitSet], state_index: usize, best_index: Arc, - largest_remove: usize, bounds: &[Bound], cache: &mut Cache, parallel_mode: ParallelMode, kernel_mode: KernelMode, ) -> (usize, usize) { + let largest_remove = { + if let Some(v) = subgraph.iter().next() { + matches[v].0.len() + } + else { + return (state_index, 1); + } + }; + // If any bounds would prune this assembly state or if memoization is // enabled and this assembly state is preempted by the cached state, halt. if bound_exceeded( @@ -314,7 +322,6 @@ pub fn recurse_index_search( &fragments, state_index - h1.len() + 1, best_index.clone(), - h1.len(), bounds, &mut cache.clone(), new_parallel, @@ -461,7 +468,6 @@ pub fn index_search( &[init], edge_count - 1, best_index, - edge_count, bounds, &mut cache, parallel_mode, From 0f45f5aa599d804e4620ca80ddfd6aa2f04bfea8 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Tue, 29 Jul 2025 14:50:07 -0700 Subject: [PATCH 15/51] Add some comments and documentation --- src/assembly.rs | 15 ++++++++++++--- src/kernels.rs | 10 +++++++++- src/reductions.rs | 26 ++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 4 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index c3be6a40..94c3e317 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -198,16 +198,17 @@ fn fragments(mol: &Molecule, state: &[BitSet], h1: &BitSet, h2: &BitSet) -> Opti /// Inputs: /// - `mol`: The molecule whose assembly index is being calculated. /// - `matches`: The remaining non-overlapping isomorphic subgraph pairs. +/// - `graph`: If clique is enabled, the graph of compatible isomorphic subgraph paris. +/// - `subgraph`: A bitset with length of matches. Has a 1 for every match to be searched in this state. /// - `removal_order`: TODO /// - `state`: The current assembly state, i.e., a list of fragments. /// - `state_index`: This assembly state's upper bound on the assembly index, /// i.e., edges(mol) - 1 - [edges(subgraphs removed) - #(subgraphs removed)]. /// - `best_index`: The smallest assembly index for all assembly states so far. -/// - `largest_remove`: An upper bound on the size of fragments that can be -/// removed from this or any descendant assembly state. /// - `bounds`: The list of bounding strategies to apply. /// - `cache`: TODO /// - `parallel_mode`: The parallelism mode for this state's match iteration. +/// - `kernel_mode`: The kernelization mode for this state. /// /// Returns, from this assembly state and any of its descendents: /// - `usize`: An updated upper bound on the assembly index. (Note: If this @@ -229,6 +230,8 @@ pub fn recurse_index_search( parallel_mode: ParallelMode, kernel_mode: KernelMode, ) -> (usize, usize) { + // An upper bound on the size of fragments that can be + // removed from this or any descendant assembly state. let largest_remove = { if let Some(v) = subgraph.iter().next() { matches[v].0.len() @@ -261,9 +264,11 @@ pub fn recurse_index_search( let g = graph.as_ref().unwrap(); // Deletion kernel + // Returns a subgraph without nodes that never occur in an optimal solution. subgraph = deletion_kernel(matches, g, subgraph); - //Inclusion kernel + // Inclusion kernel. + // must_include is the first match in the matches list that will be included in an optimal solution. must_include = inclusion_kernel(matches, g, &subgraph); } @@ -296,6 +301,8 @@ pub fn recurse_index_search( }; // Update subgraph + // If using CompatGraph, only consider the neighborhood of the node removed. + // Otherwise, only consider the matches that occur after the removed match in the list. let mut sub_clone; if let Some(g) = graph { sub_clone = subgraph.clone(); @@ -337,6 +344,8 @@ pub fn recurse_index_search( }; // Use the iterator type corresponding to the specified parallelism mode. + // Only search on nodes with v <= must_include since any searches after that would + // not use must_include. if parallel_mode == ParallelMode::None { subgraph .iter() diff --git a/src/kernels.rs b/src/kernels.rs index 4b8d99d4..0351bf5d 100644 --- a/src/kernels.rs +++ b/src/kernels.rs @@ -30,6 +30,8 @@ pub enum KernelMode { Always, } +/// Takes a subgraph as input and removes all nodes that will not be part of any maximum weight clique. +/// Uses the strategy of domination as described in Lamm et al. pub fn deletion_kernel(matches: &Vec<(BitSet, BitSet)>, g: &CompatGraph, mut subgraph: BitSet) -> BitSet{ let subgraph_copy = subgraph.clone(); @@ -68,20 +70,26 @@ pub fn deletion_kernel(matches: &Vec<(BitSet, BitSet)>, g: &CompatGraph, mut sub subgraph } +/// Takes a subgraph as input and returns the first vertex that will be included in some maximum weight clique. +/// Uses the strategies of neighborhood removal and isolated vertex removal from Lamm et al. pub fn inclusion_kernel(matches: &Vec<(BitSet, BitSet)>, g: &CompatGraph, subgraph: &BitSet) -> usize { let mut kernel = Vec::new(); let tot = subgraph.iter().map(|v| matches[v].0.len() - 1).sum::(); 'outer: for v in subgraph { let v_val = matches[v].0.len() - 1; + + // Neighborhood removal let neighbors_val = g.neighbors(v, subgraph).iter().map(|u| matches[u].0.len() - 1).sum::(); if v_val >= tot - neighbors_val - v_val { kernel.push(v); continue; } + // Isolated vertex removal. + // Only makes it through this loop if the non-neighborhood of v is independent and + // contains vertices with weight no higher than v. let mut neighbors: Vec = vec![]; - for u in subgraph.difference(&g.compatible_with(v)) { if u == v { continue; diff --git a/src/reductions.rs b/src/reductions.rs index 184b3df0..fc0e82ce 100644 --- a/src/reductions.rs +++ b/src/reductions.rs @@ -1,10 +1,25 @@ +//! Reduces the optimal molecular assembly index problem into maximum weighted clique. +//! +//! Removing a duplicatable subgraph pair (h1, h2) gives a savings of |h1| - 1. +//! The problem of finding the optimal molecular assembly index can thus be thought of as finding +//! a set of duplicatable subgraph removals that are all compatible with each other (i.e. can be removed +//! at the same time) with the largest savings possible. Thus, this problem is close to that of finding +//! a maximum clique in a graph with duplicatable subgraph pairs for vertices, vertex weights |h1| - 1, and +//! edges representing compatibility. However, the order of duplicatable subgraph removals can affect +//! compatibility, and so edges must be directed. By ordering the vertices in a nice way, we can ignore +//! edge directionality. + use bit_set::BitSet; +/// Graph representation of the compatibility of duplicatable subgraph pairs. pub struct CompatGraph { + /// The graph is implemented as an adjacency matrix. The ith element of graph + /// has a 1 at position j if {i, j} is an edge. graph: Vec, } impl CompatGraph { + /// Constructs a compatibility graph given a set of matches. pub fn new(init_matches: &Vec<(BitSet, BitSet)>) -> Self { let size = init_matches.len(); @@ -38,18 +53,25 @@ impl CompatGraph { } } + /// Returns the number of vertices in the graph. pub fn len(&self) -> usize { self.graph.len() } + /// Returns the degree vertex v in the subgraph pub fn degree(&self, v: usize, subgraph: &BitSet) -> usize { self.graph[v].intersection(subgraph).count() } + /// Returns the set of matches compatible with v. I.e. the set neighbors + /// of v. pub fn compatible_with(&self, v: usize) -> &BitSet { &self.graph[v] } + /// Returns the neighbors of v in the given subgraph. + /// The returned BitSet is a clone, and thus can be safely modified by the calling function + /// without changing the graph. pub fn neighbors(&self, v: usize, subgraph: &BitSet) -> BitSet { let mut neighbors = self.graph[v].clone(); neighbors.intersect_with(subgraph); @@ -57,6 +79,9 @@ impl CompatGraph { neighbors } + /// Returns the neighbors of v that occur after v in the matches list. + /// The returned BitSet is a clone, and thus can be safely modified by the calling function + /// without changing the graph. pub fn forward_neighbors(&self, v: usize, subgraph: &BitSet) -> BitSet { let mut neighbors = self.graph[v].clone(); neighbors.intersect_with(subgraph); @@ -76,6 +101,7 @@ impl CompatGraph { neighbors } + /// Returns true if vertices v and u are adjacent. pub fn are_adjacent(&self, v: usize, u: usize) -> bool { self.graph[v].contains(u) } From 69082d05f4d1268d2e09287650f343848ef45ead Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Tue, 29 Jul 2025 14:52:51 -0700 Subject: [PATCH 16/51] Optimize inclusion_kernel return --- src/kernels.rs | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/src/kernels.rs b/src/kernels.rs index 0351bf5d..f4574df0 100644 --- a/src/kernels.rs +++ b/src/kernels.rs @@ -73,7 +73,6 @@ pub fn deletion_kernel(matches: &Vec<(BitSet, BitSet)>, g: &CompatGraph, mut sub /// Takes a subgraph as input and returns the first vertex that will be included in some maximum weight clique. /// Uses the strategies of neighborhood removal and isolated vertex removal from Lamm et al. pub fn inclusion_kernel(matches: &Vec<(BitSet, BitSet)>, g: &CompatGraph, subgraph: &BitSet) -> usize { - let mut kernel = Vec::new(); let tot = subgraph.iter().map(|v| matches[v].0.len() - 1).sum::(); 'outer: for v in subgraph { @@ -82,8 +81,7 @@ pub fn inclusion_kernel(matches: &Vec<(BitSet, BitSet)>, g: &CompatGraph, subgra // Neighborhood removal let neighbors_val = g.neighbors(v, subgraph).iter().map(|u| matches[u].0.len() - 1).sum::(); if v_val >= tot - neighbors_val - v_val { - kernel.push(v); - continue; + return v; } // Isolated vertex removal. @@ -107,13 +105,9 @@ pub fn inclusion_kernel(matches: &Vec<(BitSet, BitSet)>, g: &CompatGraph, subgra neighbors.push(u); } - kernel.push(v); + return v; } - if let Some(x) = kernel.iter().min() { - *x - } - else { - matches.len() - } + matches.len() + } \ No newline at end of file From 30612c07076d43605d8c41424b76f23865cb48e1 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Mon, 4 Aug 2025 13:05:00 -0700 Subject: [PATCH 17/51] Fix removal_order --- src/assembly.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/assembly.rs b/src/assembly.rs index 94c3e317..bd69cfc7 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -323,7 +323,7 @@ pub fn recurse_index_search( sub_clone, { let mut clone = removal_order.clone(); - clone.push(i); + clone.push(v); clone }, &fragments, From 26cea2fd03bb7559e77ad6dd7b50eaa4be156277 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Thu, 21 Aug 2025 15:36:22 -0700 Subject: [PATCH 18/51] Add new int bound --- src/assembly.rs | 45 ++++++++++++++++++++++++++++--- src/bounds.rs | 72 ++++++++++++++++++++++++++++++++++++++++++++++++- src/main.rs | 2 +- 3 files changed, 114 insertions(+), 5 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index bd69cfc7..374dcc3b 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -193,6 +193,19 @@ fn fragments(mol: &Molecule, state: &[BitSet], h1: &BitSet, h2: &BitSet) -> Opti Some(fragments) } +fn is_usable(state: &[BitSet], h1: &BitSet, h2: &BitSet, masks: &mut Vec>) -> bool { + let f1 = state.iter().enumerate().find(|(_, c)| h1.is_subset(c)); + let f2 = state.iter().enumerate().find(|(_, c)| h2.is_subset(c)); + + if let (Some((i1, _)), Some((i2, _))) = (f1, f2) { + masks[h1.len() - 2][i1].union_with(h1); + masks[h1.len() - 2][i2].union_with(h2); + true + } else { + false + } +} + /// Recursive helper for [`index_search`], only public for benchmarking. /// /// Inputs: @@ -230,6 +243,9 @@ pub fn recurse_index_search( parallel_mode: ParallelMode, kernel_mode: KernelMode, ) -> (usize, usize) { + //println!("{:?}", removal_order); + //println!("{:?}", state); + // An upper bound on the size of fragments that can be // removed from this or any descendant assembly state. let largest_remove = { @@ -241,19 +257,40 @@ pub fn recurse_index_search( } }; + let mut masks: Vec> = vec![vec![BitSet::with_capacity(mol.graph().edge_count()); state.len()]; largest_remove - 1]; + for v in subgraph.iter() { + let m = &matches[v]; + is_usable(state, &m.0, &m.1, &mut masks); + } + /*while masks.iter().last().unwrap().len() == 0 { + masks.pop(); + }*/ + /*for (i, m) in masks[0].iter().enumerate() { + state[i] = m.clone(); + }*/ + /*let mut state = Vec::new(); + for m in masks[0].iter() { + if m.len() != 0 { + state.extend(connected_components_under_edges(mol.graph(), m)); + } + } + state.retain(|i| i.len() > 1);*/ + + // If any bounds would prune this assembly state or if memoization is // enabled and this assembly state is preempted by the cached state, halt. if bound_exceeded( mol, matches, graph, - state, + &state, &subgraph, state_index, best_index.load(Relaxed), largest_remove, bounds, - ) || cache.memoize_state(mol, state, state_index, &removal_order) + &masks, + ) || cache.memoize_state(mol, &state, state_index, &removal_order) { return (state_index, 1); } @@ -281,7 +318,7 @@ pub fn recurse_index_search( // the given (enumerated) pair of non-overlapping isomorphic subgraphs. let recurse_on_match = |v: usize| { let (h1, h2) = &matches[v]; - if let Some(fragments) = fragments(mol, state, h1, h2) { + if let Some(fragments) = fragments(mol, &state, h1, h2) { // If using depth-one parallelism, all descendant states should be // computed serially. let new_parallel = if parallel_mode == ParallelMode::DepthOne { @@ -483,6 +520,8 @@ pub fn index_search( kernel_mode, ); + println!("{}", cache.count()); + (index as u32, matches.len() as u32, states_searched) } diff --git a/src/bounds.rs b/src/bounds.rs index 73307cc4..d4e5aa39 100644 --- a/src/bounds.rs +++ b/src/bounds.rs @@ -77,11 +77,13 @@ pub fn bound_exceeded( best_index: usize, largest_remove: usize, bounds: &[Bound], + masks: &Vec>, ) -> bool { for bound_type in bounds { let exceeds = match bound_type { Bound::Log => state_index - log_bound(fragments) >= best_index, - Bound::Int => state_index - int_bound(fragments, largest_remove) >= best_index, + //Bound::Int => state_index - int_bound_seet(masks, largest_remove) >= best_index, + Bound::Int => best_index + int_bound_new(masks) <= state_index, Bound::VecSimple => { state_index - vec_simple_bound(fragments, largest_remove, mol) >= best_index } @@ -149,6 +151,74 @@ fn int_bound(fragments: &[BitSet], m: usize) -> usize { max_s } +fn int_bound_new(masks: &Vec>) -> usize { + let mut max = 0; + for (i, list) in masks.iter().enumerate() { + let mut val = 0; + let i = i + 2; + for (j, frag) in list.iter().enumerate() { + let mf = masks[0][j].len(); + let mi = frag.len(); + let x = mf + (mi % i) - mi; + val += mf - (mi / i) - x / (i-1) - (x % (i-1) != 0) as usize; + } + val -= (i as f32).log2().ceil() as usize; + max = max.max(val); + } + + max +} + +fn int_bound_seet(masks: &Vec>, largest_remove: usize) -> usize { + let mut dup_bonds_2 = 0; + let mut dup_bonds_total; + let mut max; + let mut size_lists = vec![Vec::new(); masks.len()]; + + for j in 0..masks.len() + { + size_lists[j] = vec![0; masks[j].len()]; + for i in 0..masks[0].len() + { + size_lists[j][i] = masks[j][i].len(); + } + } + + for i in 0..size_lists[0].len() { + dup_bonds_2 += size_lists[0][i] / 2; + } + dup_bonds_2 -= 1; + max = dup_bonds_2; + + for j in 3..largest_remove+1 + { + let size_list = &size_lists[j - 2]; + let size_list2 = &size_lists[0]; + let mut adjusted_size_list = vec![0; size_list.len()]; + let mut adjusted_size_list2 = vec![0; size_list.len()]; + for i in 0..size_list.len() + { + adjusted_size_list[i] = size_list[i] - size_list[i] % j; + adjusted_size_list2[i] = size_list2[i] - adjusted_size_list[i]; + } + dup_bonds_total = 0; + for i in 0..size_list.len() + { + dup_bonds_total += adjusted_size_list[i] - adjusted_size_list[i] / j; + dup_bonds_total += adjusted_size_list2[i] - adjusted_size_list2[i] / (j - 1); + if adjusted_size_list2[i] % (j - 1) != 0 { + dup_bonds_total -= 1; + } + } + dup_bonds_total -= (j as f32).log2().ceil() as usize; + max = max.max(dup_bonds_total); + } + + max +} + + + /// TODO // Count number of unique edges in a fragment // Helper function for vector bounds diff --git a/src/main.rs b/src/main.rs index fa81b6aa..8b30222a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -100,7 +100,7 @@ fn main() -> Result<()> { // If clique not enable, default to a combination of the integer and vector bounds. (None, false) => &[Bound::Int, Bound::VecSimple, Bound::VecSmallFrags], // If clique is enabled, default to combination of int, vec, and clique bounds. - (None, true) => &[Bound::Int, Bound::VecSimple, Bound::VecSmallFrags, Bound::CliqueBudget, Bound::CoverNoSort], + (None, true) => &[Bound::Int, Bound::VecSimple, Bound::VecSmallFrags, /*Bound::CliqueBudget,*/ Bound::CoverNoSort], // If --no-bounds is set, do not use any bounds. (Some(BoundsGroup { no_bounds: true, .. From f0c38bccf79b2546de561aa5d8c61e41c23bf461 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Thu, 21 Aug 2025 15:36:36 -0700 Subject: [PATCH 19/51] Add cache counter --- src/memoize.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/memoize.rs b/src/memoize.rs index 6ae2060e..c2029d5b 100644 --- a/src/memoize.rs +++ b/src/memoize.rs @@ -6,6 +6,8 @@ use bit_set::BitSet; use clap::ValueEnum; use dashmap::DashMap; +use std::sync::atomic::AtomicUsize; + use crate::{ canonize::{canonize, CanonizeMode, Labeling}, molecule::Molecule, @@ -47,6 +49,8 @@ pub struct Cache { /// A parallel-aware map from fragments to their canonical labelings; only /// used with [`MemoizeMode::CanonIndex`]. fragment_labels: Arc>, + + count: Arc, } impl Cache { @@ -57,6 +61,7 @@ impl Cache { canonize_mode, cache: Arc::new(DashMap::)>::new()), fragment_labels: Arc::new(DashMap::::new()), + count: Arc::new(AtomicUsize::from(0)), } } @@ -95,7 +100,7 @@ impl Cache { /// preempted by the cached assembly state. /// See https://github.com/DaymudeLab/assembly-theory/pull/95 for details. pub fn memoize_state( - &self, + &mut self, mol: &Molecule, state: &[BitSet], state_index: usize, @@ -121,9 +126,14 @@ impl Cache { .value() .clone(); if cached_index <= state_index && cached_order < *removal_order { + self.count.fetch_add(1, std::sync::atomic::Ordering::Relaxed); return true; } } false } + + pub fn count(&mut self) -> usize { + self.count.load(std::sync::atomic::Ordering::Relaxed) + } } From 4f87f7a6852e637f50dd572e33eff97fc06d7722 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Thu, 21 Aug 2025 16:04:42 -0700 Subject: [PATCH 20/51] Faster matches --- src/assembly.rs | 103 ++++++++++++++++++++++++++++++++++-------------- src/bounds.rs | 4 +- 2 files changed, 75 insertions(+), 32 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 374dcc3b..17dd4e79 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -19,7 +19,7 @@ //! ``` use std::{ - collections::HashMap, sync::{ + collections::{HashMap, HashSet}, sync::{ atomic::{AtomicUsize, Ordering::Relaxed}, Arc, } @@ -28,15 +28,16 @@ use std::{ use bit_set::BitSet; use clap::ValueEnum; use rayon::iter::{IntoParallelRefIterator, ParallelIterator}; +use petgraph::graph::EdgeIndex; use crate::{ bounds::{bound_exceeded, Bound}, canonize::{canonize, CanonizeMode, Labeling}, - enumerate::{enumerate_subgraphs, EnumerateMode}, + enumerate::{EnumerateMode}, kernels::{KernelMode, deletion_kernel, inclusion_kernel}, memoize::{Cache, MemoizeMode}, molecule::Molecule, - utils::connected_components_under_edges, + utils::{connected_components_under_edges, edge_neighbors}, reductions::CompatGraph, }; @@ -98,38 +99,82 @@ pub fn depth(mol: &Molecule) -> u32 { /// /// Return all pairs of non-overlapping, isomorphic subgraphs in the molecule, /// sorted to guarantee deterministic iteration. -pub fn matches( - mol: &Molecule, - enumerate_mode: EnumerateMode, - canonize_mode: CanonizeMode, -) -> Vec<(BitSet, BitSet)> { - // Enumerate all connected, non-induced subgraphs with at most |E|/2 edges - // and bin them into isomorphism classes using canonization. - let mut isomorphism_classes = HashMap::>::new(); - for subgraph in enumerate_subgraphs(mol, enumerate_mode) { - isomorphism_classes - .entry(canonize(mol, &subgraph, canonize_mode)) - .and_modify(|bucket| bucket.push(subgraph.clone())) - .or_insert(vec![subgraph.clone()]); +pub fn matches(mol: &Molecule) -> Vec<(BitSet, BitSet)> { + let num_edges = mol.graph().edge_count(); + let mut subgraphs: HashSet = HashSet::new(); + let mut size_one_subgraphs: HashSet = HashSet::new(); + let mut matches: Vec<(BitSet, BitSet)> = Vec::new(); + let mut buckets: HashMap> = HashMap::new(); + + // Generate subgraphs with one edge + for i in 0..num_edges { + let mut new_bitset = BitSet::with_capacity(num_edges); + new_bitset.insert(i); + size_one_subgraphs.insert(new_bitset); + } + + // Generate subgraphs with two edges + for subgraph in size_one_subgraphs.iter() { + let mut neighborhood = BitSet::with_capacity(num_edges); + for e in subgraph { + neighborhood.extend(edge_neighbors(mol.graph(), EdgeIndex::new(e)).map(|x| x.index())); + } + neighborhood.difference_with(subgraph); + + for e in neighborhood.iter() { + let mut new_subgraph = subgraph.clone(); + new_subgraph.insert(e); + subgraphs.insert(new_subgraph); + } } - // In each isomorphism class, collect non-overlapping pairs of subgraphs. - let mut matches = Vec::new(); - for bucket in isomorphism_classes.values() { - for (i, first) in bucket.iter().enumerate() { - for second in &bucket[i..] { - if first.is_disjoint(second) { - if first > second { + while subgraphs.len() > 0 { + buckets.clear(); + // Generate buckets + for g in subgraphs.iter() { + let canon = canonize(mol, g, CanonizeMode::TreeNauty); + buckets.entry(canon) + .and_modify(|v| v.push(g.clone())) + .or_insert(vec![g.clone()]); + } + + subgraphs.clear(); + for (_, bucket) in buckets.iter() { + // Generate matches + let mut has_match = BitSet::with_capacity(bucket.len()); + for (i, first) in bucket.iter().enumerate() { + for (j, second) in bucket[i+1..].iter().enumerate() { + if first.intersection(second).count() == 0 { + if first > second { matches.push((first.clone(), second.clone())); - } else { - matches.push((second.clone(), first.clone())); + } else { + matches.push((second.clone(), first.clone())); + } + has_match.insert(i); + has_match.insert(i + 1 + j); } } } + + // Generate new subgraphs + // Only extend if subgraph has a match + for i in has_match.iter() { + let subgraph = &bucket[i]; + let mut neighborhood = BitSet::with_capacity(num_edges); + for e in subgraph { + neighborhood.extend(edge_neighbors(mol.graph(), EdgeIndex::new(e)).map(|x| x.index())); + } + neighborhood.difference_with(subgraph); + + for e in neighborhood.iter() { + let mut new_subgraph = subgraph.clone(); + new_subgraph.insert(e); + subgraphs.insert(new_subgraph); + } + } } } - // Sort pairs in a deterministic order and return. matches.sort_by(|e1, e2| { let ord = [ e2.0.len().cmp(&e1.0.len()), // Decreasing subgraph size. @@ -468,7 +513,7 @@ pub fn recurse_index_search( /// ``` pub fn index_search( mol: &Molecule, - enumerate_mode: EnumerateMode, + _enumerate_mode: EnumerateMode, canonize_mode: CanonizeMode, parallel_mode: ParallelMode, memoize_mode: MemoizeMode, @@ -479,7 +524,7 @@ pub fn index_search( // Catch not-yet-implemented modes. // Enumerate non-overlapping isomorphic subgraph pairs. - let matches = matches(mol, enumerate_mode, canonize_mode); + let matches = matches(mol); // Create memoization cache. let mut cache = Cache::new(memoize_mode, canonize_mode); @@ -520,8 +565,6 @@ pub fn index_search( kernel_mode, ); - println!("{}", cache.count()); - (index as u32, matches.len() as u32, states_searched) } diff --git a/src/bounds.rs b/src/bounds.rs index d4e5aa39..e13b097d 100644 --- a/src/bounds.rs +++ b/src/bounds.rs @@ -126,7 +126,7 @@ fn log_bound(fragments: &[BitSet]) -> usize { } /// TODO -fn int_bound(fragments: &[BitSet], m: usize) -> usize { +fn _int_bound(fragments: &[BitSet], m: usize) -> usize { let mut max_s: usize = 0; let mut frag_sizes: Vec = Vec::new(); @@ -169,7 +169,7 @@ fn int_bound_new(masks: &Vec>) -> usize { max } -fn int_bound_seet(masks: &Vec>, largest_remove: usize) -> usize { +fn _int_bound_seet(masks: &Vec>, largest_remove: usize) -> usize { let mut dup_bonds_2 = 0; let mut dup_bonds_total; let mut max; From fef2d1362ad36e23d46aaefb44e129b8e9c861d6 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Thu, 21 Aug 2025 16:04:54 -0700 Subject: [PATCH 21/51] Remove cache counter --- src/memoize.rs | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/memoize.rs b/src/memoize.rs index c2029d5b..2df74e3c 100644 --- a/src/memoize.rs +++ b/src/memoize.rs @@ -6,8 +6,6 @@ use bit_set::BitSet; use clap::ValueEnum; use dashmap::DashMap; -use std::sync::atomic::AtomicUsize; - use crate::{ canonize::{canonize, CanonizeMode, Labeling}, molecule::Molecule, @@ -49,8 +47,6 @@ pub struct Cache { /// A parallel-aware map from fragments to their canonical labelings; only /// used with [`MemoizeMode::CanonIndex`]. fragment_labels: Arc>, - - count: Arc, } impl Cache { @@ -61,7 +57,6 @@ impl Cache { canonize_mode, cache: Arc::new(DashMap::)>::new()), fragment_labels: Arc::new(DashMap::::new()), - count: Arc::new(AtomicUsize::from(0)), } } @@ -126,14 +121,9 @@ impl Cache { .value() .clone(); if cached_index <= state_index && cached_order < *removal_order { - self.count.fetch_add(1, std::sync::atomic::Ordering::Relaxed); return true; } } false } - - pub fn count(&mut self) -> usize { - self.count.load(std::sync::atomic::Ordering::Relaxed) - } } From ccd5a7916703319660615e4fd909804d267436e4 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Tue, 26 Aug 2025 11:44:32 -0700 Subject: [PATCH 22/51] new memoize strategy --- src/assembly.rs | 52 +++++++++++++++++++++++------- src/main.rs | 41 ++++++++++++++++++++++++ src/memoize.rs | 84 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 166 insertions(+), 11 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 17dd4e79..d6bda21d 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -35,7 +35,7 @@ use crate::{ canonize::{canonize, CanonizeMode, Labeling}, enumerate::{EnumerateMode}, kernels::{KernelMode, deletion_kernel, inclusion_kernel}, - memoize::{Cache, MemoizeMode}, + memoize::{Cache, MemoizeMode, NewCache}, molecule::Molecule, utils::{connected_components_under_edges, edge_neighbors}, reductions::CompatGraph, @@ -284,7 +284,7 @@ pub fn recurse_index_search( state_index: usize, best_index: Arc, bounds: &[Bound], - cache: &mut Cache, + cache: &mut NewCache, parallel_mode: ParallelMode, kernel_mode: KernelMode, ) -> (usize, usize) { @@ -310,17 +310,31 @@ pub fn recurse_index_search( /*while masks.iter().last().unwrap().len() == 0 { masks.pop(); }*/ - /*for (i, m) in masks[0].iter().enumerate() { + /*let mut state = Vec::new(); + for (i, m) in masks[0].iter().enumerate() { state[i] = m.clone(); }*/ - /*let mut state = Vec::new(); - for m in masks[0].iter() { - if m.len() != 0 { - state.extend(connected_components_under_edges(mol.graph(), m)); + /*let mut temp = Vec::new(); + let state = { + if removal_order.len() == 0 { + for m in masks[0].iter() { + if m.len() != 0 { + temp.extend(connected_components_under_edges(mol.graph(), m)); + } + } + temp.retain(|i| i.len() > 1); + &temp } - } - state.retain(|i| i.len() > 1);*/ + else { + state + } + };*/ + /*let mut masks: Vec> = vec![vec![BitSet::with_capacity(mol.graph().edge_count()); state.len()]; largest_remove - 1]; + for v in subgraph.iter() { + let m = &matches[v]; + is_usable(&state, &m.0, &m.1, &mut masks); + }*/ // If any bounds would prune this assembly state or if memoization is // enabled and this assembly state is preempted by the cached state, halt. @@ -335,11 +349,16 @@ pub fn recurse_index_search( largest_remove, bounds, &masks, - ) || cache.memoize_state(mol, &state, state_index, &removal_order) + ) || cache.memoize_state(mol, &state, state_index) { return (state_index, 1); } + let mut state = Vec::new(); + for (i, m) in masks[0].iter().enumerate() { + state.push(m.clone()); + } + // Apply kernels let mut must_include = matches.len(); if kernel_mode != KernelMode::None { @@ -442,6 +461,10 @@ pub fn recurse_index_search( .for_each(|v| recurse_on_match(*v)); } + if removal_order.len() == 1 { + println!("{:?}, {}", removal_order, states_searched.load(Relaxed)); + } + ( best_child_index.load(Relaxed), states_searched.load(Relaxed), @@ -525,9 +548,13 @@ pub fn index_search( // Enumerate non-overlapping isomorphic subgraph pairs. let matches = matches(mol); + /*for (i, m) in matches.iter().enumerate() { + println!("{i}: {:?}", m); + }*/ // Create memoization cache. - let mut cache = Cache::new(memoize_mode, canonize_mode); + //let mut cache = Cache::new(memoize_mode, canonize_mode); + let mut cache = NewCache::new(); let graph = { if clique { @@ -550,6 +577,7 @@ pub fn index_search( // Search for the shortest assembly pathway recursively. let edge_count = mol.graph().edge_count(); let best_index = Arc::new(AtomicUsize::from(edge_count - 1)); + //let best_index = Arc::new(AtomicUsize::from(22)); let (index, states_searched) = recurse_index_search( mol, &matches, @@ -565,6 +593,8 @@ pub fn index_search( kernel_mode, ); + println!("Bounded: {}", cache.count()); + (index as u32, matches.len() as u32, states_searched) } diff --git a/src/main.rs b/src/main.rs index 8b30222a..e8f36260 100644 --- a/src/main.rs +++ b/src/main.rs @@ -122,6 +122,47 @@ fn main() -> Result<()> { } } + + /*use std::ffi::{OsStr, OsString}; + use std::path::Path; + use assembly_theory::molecule::Molecule; + use rayon::iter::{IntoParallelIterator, ParallelIterator, IntoParallelRefIterator, IndexedParallelIterator}; + + let paths = fs::read_dir(Path::new("data").join("plycyclic_hydrocarbons")).unwrap(); + let mut mol_list: Vec = Vec::new(); + let mut names: Vec = Vec::new(); + + for path in paths { + let name = path.unwrap().path(); + if name.extension().and_then(OsStr::to_str) != Some("mol") { + continue; + } + names.push(name.file_name().unwrap().to_os_string()); + mol_list.push( + parse_molfile_str( + &fs::read_to_string(name.clone()) + .expect(&format!("Could not read file {name:?}")), + ) + .expect(&format!("Failed to parse {name:?}")), + ); + } + + names.par_iter().zip(mol_list.par_iter()).for_each(|(n, mol)| { + let index = index_search( + &mol, + cli.enumerate, + cli.canonize, + cli.parallel, + cli.memoize, + cli.kernel, + boundlist, + cli.clique, + ); + println!("{:?}: MA: {} Space: {}, Searched: {}", n, index.0, index.1, index.2); + }); + std::process::exit(1);*/ + + // Call index calculation with all the various options. let (index, num_matches, states_searched) = index_search( &mol, diff --git a/src/memoize.rs b/src/memoize.rs index 2df74e3c..4593b4b3 100644 --- a/src/memoize.rs +++ b/src/memoize.rs @@ -5,6 +5,7 @@ use std::sync::Arc; use bit_set::BitSet; use clap::ValueEnum; use dashmap::DashMap; +use std::sync::atomic::{AtomicUsize, Ordering::Relaxed}; use crate::{ canonize::{canonize, CanonizeMode, Labeling}, @@ -47,6 +48,8 @@ pub struct Cache { /// A parallel-aware map from fragments to their canonical labelings; only /// used with [`MemoizeMode::CanonIndex`]. fragment_labels: Arc>, + + counter: Arc, } impl Cache { @@ -57,6 +60,7 @@ impl Cache { canonize_mode, cache: Arc::new(DashMap::)>::new()), fragment_labels: Arc::new(DashMap::::new()), + counter: Arc::new(AtomicUsize::from(0)), } } @@ -121,9 +125,89 @@ impl Cache { .value() .clone(); if cached_index <= state_index && cached_order < *removal_order { + self.counter.fetch_add(1, Relaxed); + //println!("{:?}", state); return true; } } false } + + pub fn count(&self) -> usize { + self.counter.load(Relaxed) + } +} + +#[derive(Clone)] +pub struct NewCache { + cache: Arc, usize>>, + label_to_canon_id: Arc>, + frag_to_canon_id: Arc>, + next_id: Arc, + counter: Arc, } + +impl NewCache { + pub fn new() -> Self { + Self { + cache: Arc::new(DashMap::, usize>::new()), + label_to_canon_id: Arc::new(DashMap::::new()), + frag_to_canon_id: Arc::new(DashMap::::new()), + next_id: Arc::new(AtomicUsize::from(0)), + counter: Arc::new(AtomicUsize::from(0)), + } + } + + pub fn memoize_state(&mut self, mol: &Molecule, state: &[BitSet], state_index: usize) -> bool { + let mut frag_ids = Vec::new(); + for frag in state { + let id = self.get_canon_id(mol, frag); + frag_ids.push(id); + } + frag_ids.sort(); + + let mut stop = false; + + self.cache + .entry(frag_ids) + .and_modify(|val| { + if *val > state_index { + *val = state_index; + } else { + self.counter.fetch_add(1, Relaxed); + stop = true; + } + }) + .or_insert(state_index); + + stop + } + + fn get_canon_id(&mut self, mol: &Molecule, frag: &BitSet) -> usize { + // If frag has id, use it + if let Some(x) = self.frag_to_canon_id.get(frag) { + *x + } + // Otherwise canonize to get labeling + else { + let canon = canonize(mol, frag, CanonizeMode::TreeNauty); + // If label has id, use it + if let Some(x) = self.label_to_canon_id.get(&canon) { + let id = *x; + self.frag_to_canon_id.insert(frag.clone(), id); + id + } + // Otherwise asign new id + else { + let id = self.next_id.fetch_add(1, Relaxed); + self.label_to_canon_id.insert(canon, id); + self.frag_to_canon_id.insert(frag.clone(), id); + id + } + } + } + + pub fn count(&self) -> usize { + self.counter.load(Relaxed) + } +} \ No newline at end of file From 4f0f7c0b78ce21b1ef1ce9ba5cf758cddcef1e02 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Tue, 26 Aug 2025 15:48:27 -0700 Subject: [PATCH 23/51] Modified int bound --- src/assembly.rs | 53 +++++++++++++++++++++++++++++++++++++---------- src/main.rs | 2 +- src/memoize.rs | 17 +++++++++------ src/reductions.rs | 4 ++++ 4 files changed, 58 insertions(+), 18 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index d6bda21d..ec1bcdea 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -307,6 +307,13 @@ pub fn recurse_index_search( let m = &matches[v]; is_usable(state, &m.0, &m.1, &mut masks); } + + let mut state = Vec::new(); + for (i, m) in masks[0].iter().enumerate() { + if m.len() != 0 { + state.extend(connected_components_under_edges(mol.graph(), m)); + } + } /*while masks.iter().last().unwrap().len() == 0 { masks.pop(); }*/ @@ -336,9 +343,34 @@ pub fn recurse_index_search( is_usable(&state, &m.0, &m.1, &mut masks); }*/ + let mut max = Vec::new(); + for (i, list) in masks.iter().enumerate() { + let mut val = 0; + let i = i + 2; + for (j, frag) in list.iter().enumerate() { + let mf = masks[0][j].len(); + let mi = frag.len(); + let x = mf + (mi % i) - mi; + val += mf - (mi / i) - x / (i-1) - (x % (i-1) != 0) as usize; + } + val -= (i as f32).log2().ceil() as usize; + if max.len() == 0 { + max.push(val); + } + else { + max.push(val.max(max[max.len() - 1])); + } + } + + let best = best_index.load(Relaxed); + let mut idx = 2; + while (idx < max.len() + 2) && best + max[idx - 2] <= state_index { + idx += 1; + } + // If any bounds would prune this assembly state or if memoization is // enabled and this assembly state is preempted by the cached state, halt. - if bound_exceeded( + /*if bound_exceeded( mol, matches, graph, @@ -349,14 +381,13 @@ pub fn recurse_index_search( largest_remove, bounds, &masks, - ) || cache.memoize_state(mol, &state, state_index) + ) || cache.memoize_state(mol, &state, state_index, &removal_order) { return (state_index, 1); - } + }*/ - let mut state = Vec::new(); - for (i, m) in masks[0].iter().enumerate() { - state.push(m.clone()); + if cache.memoize_state(mol, &state, state_index, &removal_order) { + return (state_index, 1); } // Apply kernels @@ -450,7 +481,7 @@ pub fn recurse_index_search( if parallel_mode == ParallelMode::None { subgraph .iter() - .filter(|v| *v <= must_include) + .filter(|v| *v <= must_include && matches[*v].0.len() >= idx) .for_each(|v| recurse_on_match(v)); } else { let _ = subgraph @@ -461,9 +492,9 @@ pub fn recurse_index_search( .for_each(|v| recurse_on_match(*v)); } - if removal_order.len() == 1 { + /*if removal_order.len() == 1 { println!("{:?}, {}", removal_order, states_searched.load(Relaxed)); - } + }*/ ( best_child_index.load(Relaxed), @@ -577,7 +608,7 @@ pub fn index_search( // Search for the shortest assembly pathway recursively. let edge_count = mol.graph().edge_count(); let best_index = Arc::new(AtomicUsize::from(edge_count - 1)); - //let best_index = Arc::new(AtomicUsize::from(22)); + //let best_index = Arc::new(AtomicUsize::from(12)); let (index, states_searched) = recurse_index_search( mol, &matches, @@ -593,7 +624,7 @@ pub fn index_search( kernel_mode, ); - println!("Bounded: {}", cache.count()); + //println!("Bounded: {}", cache.count()); (index as u32, matches.len() as u32, states_searched) } diff --git a/src/main.rs b/src/main.rs index e8f36260..36f2f937 100644 --- a/src/main.rs +++ b/src/main.rs @@ -128,7 +128,7 @@ fn main() -> Result<()> { use assembly_theory::molecule::Molecule; use rayon::iter::{IntoParallelIterator, ParallelIterator, IntoParallelRefIterator, IndexedParallelIterator}; - let paths = fs::read_dir(Path::new("data").join("plycyclic_hydrocarbons")).unwrap(); + let paths = fs::read_dir(Path::new("data").join("coconut_220")).unwrap(); let mut mol_list: Vec = Vec::new(); let mut names: Vec = Vec::new(); diff --git a/src/memoize.rs b/src/memoize.rs index 4593b4b3..80efa27b 100644 --- a/src/memoize.rs +++ b/src/memoize.rs @@ -140,7 +140,7 @@ impl Cache { #[derive(Clone)] pub struct NewCache { - cache: Arc, usize>>, + cache: Arc, (usize, Vec)>>, label_to_canon_id: Arc>, frag_to_canon_id: Arc>, next_id: Arc, @@ -150,7 +150,7 @@ pub struct NewCache { impl NewCache { pub fn new() -> Self { Self { - cache: Arc::new(DashMap::, usize>::new()), + cache: Arc::new(DashMap::, (usize, Vec)>::new()), label_to_canon_id: Arc::new(DashMap::::new()), frag_to_canon_id: Arc::new(DashMap::::new()), next_id: Arc::new(AtomicUsize::from(0)), @@ -158,7 +158,7 @@ impl NewCache { } } - pub fn memoize_state(&mut self, mol: &Molecule, state: &[BitSet], state_index: usize) -> bool { + pub fn memoize_state(&mut self, mol: &Molecule, state: &[BitSet], state_index: usize, removal_order: &Vec) -> bool { let mut frag_ids = Vec::new(); for frag in state { let id = self.get_canon_id(mol, frag); @@ -171,14 +171,15 @@ impl NewCache { self.cache .entry(frag_ids) .and_modify(|val| { - if *val > state_index { - *val = state_index; + if val.0 > state_index || val.1 > *removal_order { + val.0 = state_index; + val.1 = removal_order.clone(); } else { self.counter.fetch_add(1, Relaxed); stop = true; } }) - .or_insert(state_index); + .or_insert((state_index, removal_order.clone())); stop } @@ -207,6 +208,10 @@ impl NewCache { } } + pub fn add_count(&mut self) { + self.counter.fetch_add(1, Relaxed); + } + pub fn count(&self) -> usize { self.counter.load(Relaxed) } diff --git a/src/reductions.rs b/src/reductions.rs index fc0e82ce..5ac282c9 100644 --- a/src/reductions.rs +++ b/src/reductions.rs @@ -10,6 +10,7 @@ //! edge directionality. use bit_set::BitSet; +use std::time::Instant; /// Graph representation of the compatibility of duplicatable subgraph pairs. pub struct CompatGraph { @@ -21,6 +22,7 @@ pub struct CompatGraph { impl CompatGraph { /// Constructs a compatibility graph given a set of matches. pub fn new(init_matches: &Vec<(BitSet, BitSet)>) -> Self { + let start = Instant::now(); let size = init_matches.len(); // Initialize weights and empty graph @@ -48,6 +50,8 @@ impl CompatGraph { } } + println!("Graph Time: {:?}", start.elapsed()); + Self { graph: init_graph, } From 7bd9fdd1360754c67386b1027f0f99ae528033f7 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Wed, 27 Aug 2025 16:39:00 -0700 Subject: [PATCH 24/51] New (unoptimized) method of calculating masks --- src/assembly.rs | 116 ++++++++++++++++++++++++------------------------ src/memoize.rs | 8 +++- 2 files changed, 66 insertions(+), 58 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index ec1bcdea..568c8a30 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -128,6 +128,7 @@ pub fn matches(mol: &Molecule) -> Vec<(BitSet, BitSet)> { } } + // Generate subgraphs with more than two edges while subgraphs.len() > 0 { buckets.clear(); // Generate buckets @@ -175,6 +176,7 @@ pub fn matches(mol: &Molecule) -> Vec<(BitSet, BitSet)> { } } + // Sort matches matches.sort_by(|e1, e2| { let ord = [ e2.0.len().cmp(&e1.0.len()), // Decreasing subgraph size. @@ -291,8 +293,6 @@ pub fn recurse_index_search( //println!("{:?}", removal_order); //println!("{:?}", state); - // An upper bound on the size of fragments that can be - // removed from this or any descendant assembly state. let largest_remove = { if let Some(v) = subgraph.iter().next() { matches[v].0.len() @@ -302,47 +302,63 @@ pub fn recurse_index_search( } }; + // Prune edges of state + let mut subgraphs: HashSet = HashSet::new(); let mut masks: Vec> = vec![vec![BitSet::with_capacity(mol.graph().edge_count()); state.len()]; largest_remove - 1]; - for v in subgraph.iter() { - let m = &matches[v]; - is_usable(state, &m.0, &m.1, &mut masks); - } + subgraph = subgraph.iter().filter(|v| { + let h1 = &matches[*v].0; + let h2 = &matches[*v].1; - let mut state = Vec::new(); - for (i, m) in masks[0].iter().enumerate() { - if m.len() != 0 { - state.extend(connected_components_under_edges(mol.graph(), m)); + let pre_have_h1 = subgraphs.contains(h1); + let pre_have_h2 = subgraphs.contains(h2); + + let mut have_h1 = pre_have_h1; + let mut have_h2 = pre_have_h2; + + let mut idx1 = 0; + let mut idx2 = 0; + + if !pre_have_h1 { + if let Some((idx, _)) = state.iter().enumerate().find(|(_, frag)| h1.is_subset(frag)) { + idx1 = idx; + have_h1 = true; + } } - } - /*while masks.iter().last().unwrap().len() == 0 { - masks.pop(); - }*/ - /*let mut state = Vec::new(); - for (i, m) in masks[0].iter().enumerate() { - state[i] = m.clone(); - }*/ - /*let mut temp = Vec::new(); - let state = { - if removal_order.len() == 0 { - for m in masks[0].iter() { - if m.len() != 0 { - temp.extend(connected_components_under_edges(mol.graph(), m)); - } + if !pre_have_h2 { + if let Some((idx, _)) = state.iter().enumerate().find(|(_, frag)| h2.is_subset(frag)) { + idx2 = idx; + have_h2 = true; } - temp.retain(|i| i.len() > 1); - &temp } - else { - state + + if have_h1 && have_h2 { + if !pre_have_h1 { + subgraphs.insert(h1.clone()); + masks[h1.len() - 2][idx1].union_with(h1); + } + if !pre_have_h2 { + subgraphs.insert(h2.clone()); + masks[h1.len() - 2][idx2].union_with(h2); + } } - };*/ + + have_h1 && have_h2 + }).collect(); /*let mut masks: Vec> = vec![vec![BitSet::with_capacity(mol.graph().edge_count()); state.len()]; largest_remove - 1]; - for v in subgraph.iter() { - let m = &matches[v]; - is_usable(&state, &m.0, &m.1, &mut masks); - }*/ + let subgraph1 = subgraph.iter().filter(|v| { + let m = &matches[*v]; + is_usable(state, &m.0, &m.1, &mut masks) + }).collect::();*/ + + let mut state = Vec::new(); + for m in masks[0].iter() { + if m.len() != 0 { + state.extend(connected_components_under_edges(mol.graph(), m)); + } + } + // Int bound let mut max = Vec::new(); for (i, list) in masks.iter().enumerate() { let mut val = 0; @@ -368,24 +384,7 @@ pub fn recurse_index_search( idx += 1; } - // If any bounds would prune this assembly state or if memoization is - // enabled and this assembly state is preempted by the cached state, halt. - /*if bound_exceeded( - mol, - matches, - graph, - &state, - &subgraph, - state_index, - best_index.load(Relaxed), - largest_remove, - bounds, - &masks, - ) || cache.memoize_state(mol, &state, state_index, &removal_order) - { - return (state_index, 1); - }*/ - + // Memoization if cache.memoize_state(mol, &state, state_index, &removal_order) { return (state_index, 1); } @@ -414,6 +413,7 @@ pub fn recurse_index_search( let recurse_on_match = |v: usize| { let (h1, h2) = &matches[v]; if let Some(fragments) = fragments(mol, &state, h1, h2) { + // If using depth-one parallelism, all descendant states should be // computed serially. let new_parallel = if parallel_mode == ParallelMode::DepthOne { @@ -442,8 +442,10 @@ pub fn recurse_index_search( } else { sub_clone = BitSet::with_capacity(matches.len()); - for j in (v+1)..matches.len() { - sub_clone.insert(j); + for j in subgraph.iter() { + if j > v { + sub_clone.insert(j); + } } } @@ -486,7 +488,7 @@ pub fn recurse_index_search( } else { let _ = subgraph .iter() - .filter(|v| *v <= must_include) + .filter(|v| *v <= must_include /*&& matches[*v].0.len() >= idx*/) .collect::>() .par_iter() .for_each(|v| recurse_on_match(*v)); @@ -585,7 +587,7 @@ pub fn index_search( // Create memoization cache. //let mut cache = Cache::new(memoize_mode, canonize_mode); - let mut cache = NewCache::new(); + let mut cache = NewCache::new(memoize_mode); let graph = { if clique { @@ -624,7 +626,7 @@ pub fn index_search( kernel_mode, ); - //println!("Bounded: {}", cache.count()); + println!("Bounded: {}", cache.count()); (index as u32, matches.len() as u32, states_searched) } diff --git a/src/memoize.rs b/src/memoize.rs index 80efa27b..a251639e 100644 --- a/src/memoize.rs +++ b/src/memoize.rs @@ -140,6 +140,7 @@ impl Cache { #[derive(Clone)] pub struct NewCache { + memoize_mode: MemoizeMode, cache: Arc, (usize, Vec)>>, label_to_canon_id: Arc>, frag_to_canon_id: Arc>, @@ -148,8 +149,9 @@ pub struct NewCache { } impl NewCache { - pub fn new() -> Self { + pub fn new(mode: MemoizeMode) -> Self { Self { + memoize_mode: mode, cache: Arc::new(DashMap::, (usize, Vec)>::new()), label_to_canon_id: Arc::new(DashMap::::new()), frag_to_canon_id: Arc::new(DashMap::::new()), @@ -159,6 +161,10 @@ impl NewCache { } pub fn memoize_state(&mut self, mol: &Molecule, state: &[BitSet], state_index: usize, removal_order: &Vec) -> bool { + if self.memoize_mode == MemoizeMode::None { + return false; + } + let mut frag_ids = Vec::new(); for frag in state { let id = self.get_canon_id(mol, frag); From 66256cd67a59af98590a53c4c7f678888fbceb8a Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Thu, 28 Aug 2025 14:30:43 -0700 Subject: [PATCH 25/51] Optimize masks computation --- Cargo.lock | 92 +++++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 2 ++ src/assembly.rs | 71 ++++++++++++++++++++++++-------------- 3 files changed, 139 insertions(+), 26 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 372210a9..bc179a87 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,19 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "getrandom", + "once_cell", + "version_check", + "zerocopy", +] + [[package]] name = "aho-corasick" version = "1.1.3" @@ -71,12 +84,14 @@ checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" name = "assembly-theory" version = "0.4.0" dependencies = [ + "ahash", "anyhow", "bit-set", "clap 4.5.41", "criterion", "csv", "dashmap", + "fxhash", "graph-canon", "petgraph", "pyo3", @@ -168,6 +183,12 @@ version = "3.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "cast" version = "0.3.0" @@ -433,6 +454,27 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + +[[package]] +name = "getrandom" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasi", +] + [[package]] name = "glob" version = "0.3.2" @@ -839,6 +881,12 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + [[package]] name = "radium" version = "0.7.0" @@ -1096,6 +1144,12 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + [[package]] name = "walkdir" version = "2.5.0" @@ -1106,6 +1160,15 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "wasi" +version = "0.14.2+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3" +dependencies = [ + "wit-bindgen-rt", +] + [[package]] name = "wasm-bindgen" version = "0.2.100" @@ -1363,6 +1426,15 @@ version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" +[[package]] +name = "wit-bindgen-rt" +version = "0.39.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" +dependencies = [ + "bitflags 2.9.1", +] + [[package]] name = "wyz" version = "0.5.1" @@ -1371,3 +1443,23 @@ checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed" dependencies = [ "tap", ] + +[[package]] +name = "zerocopy" +version = "0.8.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1039dd0d3c310cf05de012d8a39ff557cb0d23087fd44cad61df08fc31907a2f" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ecf5b4cc5364572d7f4c329661bcc82724222973f2cab6f050a4e5c22f75181" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/Cargo.toml b/Cargo.toml index f056c248..bbd2c373 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,8 @@ petgraph = "0.6.5" pyo3 = { version = "0.24.1", features = ["abi3-py310", "extension-module"]} rayon = "1.10.0" dashmap = "6.1.0" +ahash = "0.8.12" +fxhash = "0.2.1" [dev-dependencies] criterion = "0.3" diff --git a/src/assembly.rs b/src/assembly.rs index 568c8a30..dfac7fd8 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -29,6 +29,7 @@ use bit_set::BitSet; use clap::ValueEnum; use rayon::iter::{IntoParallelRefIterator, ParallelIterator}; use petgraph::graph::EdgeIndex; +use ahash::{AHasher, RandomState}; use crate::{ bounds::{bound_exceeded, Bound}, @@ -290,7 +291,7 @@ pub fn recurse_index_search( parallel_mode: ParallelMode, kernel_mode: KernelMode, ) -> (usize, usize) { - //println!("{:?}", removal_order); + println!("{:?}", removal_order); //println!("{:?}", state); let largest_remove = { @@ -303,42 +304,60 @@ pub fn recurse_index_search( }; // Prune edges of state - let mut subgraphs: HashSet = HashSet::new(); + let mut subgraphs: HashMap = HashMap::with_capacity_and_hasher(256, RandomState::new()); let mut masks: Vec> = vec![vec![BitSet::with_capacity(mol.graph().edge_count()); state.len()]; largest_remove - 1]; subgraph = subgraph.iter().filter(|v| { let h1 = &matches[*v].0; let h2 = &matches[*v].1; - let pre_have_h1 = subgraphs.contains(h1); - let pre_have_h2 = subgraphs.contains(h2); - - let mut have_h1 = pre_have_h1; - let mut have_h2 = pre_have_h2; + let x1 = subgraphs.get(h1).to_owned().cloned(); + let x2 = subgraphs.get(h2).to_owned().cloned(); - let mut idx1 = 0; - let mut idx2 = 0; + let mut have_h1 = false; + let mut have_h2 = false; + let mut h1_match = false; + let mut h2_match = false; - if !pre_have_h1 { - if let Some((idx, _)) = state.iter().enumerate().find(|(_, frag)| h1.is_subset(frag)) { - idx1 = idx; - have_h1 = true; + if let Some ((i1, i2)) = x1 { + if i1 == false { + return false; } + have_h1 = i1; + h1_match = i2; + } + else if let Some((idx, _)) = state.iter().enumerate().find(|(_, frag)| h1.is_subset(frag)) { + have_h1 = true; + subgraphs.insert(h1.clone(), (true, false)); + } + else { + subgraphs.insert(h1.clone(), (false, false)); } - if !pre_have_h2 { - if let Some((idx, _)) = state.iter().enumerate().find(|(_, frag)| h2.is_subset(frag)) { - idx2 = idx; - have_h2 = true; + + if let Some ((i1, i2)) = x2 { + if i1 == false { + return false; } + have_h2 = i1; + h2_match = i2; + } + else if let Some((idx, _)) = state.iter().enumerate().find(|(_, frag)| h2.is_subset(frag)) { + have_h2 = true; + subgraphs.insert(h2.clone(), (true, false)); + } + else { + subgraphs.insert(h2.clone(), (false, false)); } if have_h1 && have_h2 { - if !pre_have_h1 { - subgraphs.insert(h1.clone()); - masks[h1.len() - 2][idx1].union_with(h1); + if h1_match == false { + let (idx, _) = state.iter().enumerate().find(|(_, frag)| h1.is_subset(frag)).unwrap(); + masks[h1.len() - 2][idx].union_with(h1); + subgraphs.get_mut(h1).unwrap().1 = true; } - if !pre_have_h2 { - subgraphs.insert(h2.clone()); - masks[h1.len() - 2][idx2].union_with(h2); + if h2_match == false { + let (idx, _) = state.iter().enumerate().find(|(_, frag)| h2.is_subset(frag)).unwrap(); + masks[h2.len() - 2][idx].union_with(h2); + subgraphs.get_mut(h2).unwrap().1 = true; } } @@ -346,14 +365,14 @@ pub fn recurse_index_search( }).collect(); /*let mut masks: Vec> = vec![vec![BitSet::with_capacity(mol.graph().edge_count()); state.len()]; largest_remove - 1]; - let subgraph1 = subgraph.iter().filter(|v| { + subgraph = subgraph.iter().filter(|v| { let m = &matches[*v]; is_usable(state, &m.0, &m.1, &mut masks) - }).collect::();*/ + }).collect();*/ let mut state = Vec::new(); for m in masks[0].iter() { - if m.len() != 0 { + if m.len() >= 2 { state.extend(connected_components_under_edges(mol.graph(), m)); } } From 87428445f05fa3f2429b45ba8afcb257dd8e311e Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Thu, 28 Aug 2025 15:53:14 -0700 Subject: [PATCH 26/51] Build dag --- src/assembly.rs | 124 ++++++++++++++++++++++++++++++------------------ 1 file changed, 77 insertions(+), 47 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index dfac7fd8..f3d04666 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -29,14 +29,14 @@ use bit_set::BitSet; use clap::ValueEnum; use rayon::iter::{IntoParallelRefIterator, ParallelIterator}; use petgraph::graph::EdgeIndex; -use ahash::{AHasher, RandomState}; +use ahash::RandomState; use crate::{ bounds::{bound_exceeded, Bound}, canonize::{canonize, CanonizeMode, Labeling}, enumerate::{EnumerateMode}, kernels::{KernelMode, deletion_kernel, inclusion_kernel}, - memoize::{Cache, MemoizeMode, NewCache}, + memoize::{MemoizeMode, NewCache}, molecule::Molecule, utils::{connected_components_under_edges, edge_neighbors}, reductions::CompatGraph, @@ -100,22 +100,28 @@ pub fn depth(mol: &Molecule) -> u32 { /// /// Return all pairs of non-overlapping, isomorphic subgraphs in the molecule, /// sorted to guarantee deterministic iteration. -pub fn matches(mol: &Molecule) -> Vec<(BitSet, BitSet)> { +pub fn matches(mol: &Molecule) -> (Vec<(BitSet, BitSet)>, HashMap)>) { let num_edges = mol.graph().edge_count(); - let mut subgraphs: HashSet = HashSet::new(); - let mut size_one_subgraphs: HashSet = HashSet::new(); + let mut subgraphs: HashMap = HashMap::new(); + let mut size_one_subgraphs: HashMap = HashMap::new(); let mut matches: Vec<(BitSet, BitSet)> = Vec::new(); - let mut buckets: HashMap> = HashMap::new(); + let mut buckets: HashMap> = HashMap::new(); + + let mut dag: HashMap)> = HashMap::new(); + let mut next_id = 0; // Generate subgraphs with one edge for i in 0..num_edges { let mut new_bitset = BitSet::with_capacity(num_edges); new_bitset.insert(i); - size_one_subgraphs.insert(new_bitset); + size_one_subgraphs.insert(new_bitset, i); + + dag.insert(next_id, (BitSet::new(), Vec::new())); + next_id += 1; } // Generate subgraphs with two edges - for subgraph in size_one_subgraphs.iter() { + for (subgraph, idx) in size_one_subgraphs.iter() { let mut neighborhood = BitSet::with_capacity(num_edges); for e in subgraph { neighborhood.extend(edge_neighbors(mol.graph(), EdgeIndex::new(e)).map(|x| x.index())); @@ -125,7 +131,8 @@ pub fn matches(mol: &Molecule) -> Vec<(BitSet, BitSet)> { for e in neighborhood.iter() { let mut new_subgraph = subgraph.clone(); new_subgraph.insert(e); - subgraphs.insert(new_subgraph); + subgraphs.entry(new_subgraph) + .or_insert(*idx); } } @@ -133,47 +140,70 @@ pub fn matches(mol: &Molecule) -> Vec<(BitSet, BitSet)> { while subgraphs.len() > 0 { buckets.clear(); // Generate buckets - for g in subgraphs.iter() { - let canon = canonize(mol, g, CanonizeMode::TreeNauty); + for (g, parent) in subgraphs.iter() { + let canon = canonize(mol, g, CanonizeMode::TreeNauty); buckets.entry(canon) - .and_modify(|v| v.push(g.clone())) - .or_insert(vec![g.clone()]); + .and_modify(|v| v.push((g.clone(), *parent))) + .or_insert(vec![(g.clone(), *parent)]); } subgraphs.clear(); for (_, bucket) in buckets.iter() { // Generate matches let mut has_match = BitSet::with_capacity(bucket.len()); - for (i, first) in bucket.iter().enumerate() { - for (j, second) in bucket[i+1..].iter().enumerate() { + for (i, (first, first_parent)) in bucket.iter().enumerate() { + for (j, (second, second_parent)) in bucket[i+1..].iter().enumerate() { if first.intersection(second).count() == 0 { if first > second { matches.push((first.clone(), second.clone())); } else { matches.push((second.clone(), first.clone())); } + + if !has_match.contains(i) { + dag.insert(next_id, (first.clone(), Vec::new())); + dag.entry(*first_parent) + .and_modify(|(_, children)| children.push(next_id)); + + let mut neighborhood = BitSet::with_capacity(num_edges); + for e in first { + neighborhood.extend(edge_neighbors(mol.graph(), EdgeIndex::new(e)).map(|x| x.index())); + } + neighborhood.difference_with(first); + + for e in neighborhood.iter() { + let mut new_subgraph = first.clone(); + new_subgraph.insert(e); + subgraphs.insert(new_subgraph, next_id); + } + + next_id += 1; + } + if !has_match.contains(i + 1 + j) { + dag.insert(next_id, (second.clone(), Vec::new())); + dag.entry(*second_parent) + .and_modify(|(_, children)| children.push(next_id)); + + let mut neighborhood = BitSet::with_capacity(num_edges); + for e in second { + neighborhood.extend(edge_neighbors(mol.graph(), EdgeIndex::new(e)).map(|x| x.index())); + } + neighborhood.difference_with(second); + + for e in neighborhood.iter() { + let mut new_subgraph = second.clone(); + new_subgraph.insert(e); + subgraphs.insert(new_subgraph, next_id); + } + + next_id += 1; + } + has_match.insert(i); has_match.insert(i + 1 + j); } } } - - // Generate new subgraphs - // Only extend if subgraph has a match - for i in has_match.iter() { - let subgraph = &bucket[i]; - let mut neighborhood = BitSet::with_capacity(num_edges); - for e in subgraph { - neighborhood.extend(edge_neighbors(mol.graph(), EdgeIndex::new(e)).map(|x| x.index())); - } - neighborhood.difference_with(subgraph); - - for e in neighborhood.iter() { - let mut new_subgraph = subgraph.clone(); - new_subgraph.insert(e); - subgraphs.insert(new_subgraph); - } - } } } @@ -191,7 +221,7 @@ pub fn matches(mol: &Molecule) -> Vec<(BitSet, BitSet)> { ord[i] }); - matches + (matches, dag) } /// Determine the fragments produced from the given assembly state by removing @@ -291,7 +321,7 @@ pub fn recurse_index_search( parallel_mode: ParallelMode, kernel_mode: KernelMode, ) -> (usize, usize) { - println!("{:?}", removal_order); + //println!("{:?}", removal_order); //println!("{:?}", state); let largest_remove = { @@ -304,14 +334,14 @@ pub fn recurse_index_search( }; // Prune edges of state - let mut subgraphs: HashMap = HashMap::with_capacity_and_hasher(256, RandomState::new()); + /*let mut subgraphs: HashMap<&BitSet, (bool, bool), RandomState> = HashMap::with_capacity_and_hasher(256, RandomState::new()); let mut masks: Vec> = vec![vec![BitSet::with_capacity(mol.graph().edge_count()); state.len()]; largest_remove - 1]; subgraph = subgraph.iter().filter(|v| { let h1 = &matches[*v].0; let h2 = &matches[*v].1; - let x1 = subgraphs.get(h1).to_owned().cloned(); - let x2 = subgraphs.get(h2).to_owned().cloned(); + let x1 = subgraphs.get(h1).cloned(); + let x2 = subgraphs.get(h2).cloned(); let mut have_h1 = false; let mut have_h2 = false; @@ -325,12 +355,12 @@ pub fn recurse_index_search( have_h1 = i1; h1_match = i2; } - else if let Some((idx, _)) = state.iter().enumerate().find(|(_, frag)| h1.is_subset(frag)) { + else if let Some(_) = state.iter().enumerate().find(|(_, frag)| h1.is_subset(frag)) { have_h1 = true; - subgraphs.insert(h1.clone(), (true, false)); + subgraphs.insert(h1, (true, false)); } else { - subgraphs.insert(h1.clone(), (false, false)); + subgraphs.insert(h1, (false, false)); } if let Some ((i1, i2)) = x2 { @@ -340,12 +370,12 @@ pub fn recurse_index_search( have_h2 = i1; h2_match = i2; } - else if let Some((idx, _)) = state.iter().enumerate().find(|(_, frag)| h2.is_subset(frag)) { + else if let Some(_) = state.iter().enumerate().find(|(_, frag)| h2.is_subset(frag)) { have_h2 = true; - subgraphs.insert(h2.clone(), (true, false)); + subgraphs.insert(h2, (true, false)); } else { - subgraphs.insert(h2.clone(), (false, false)); + subgraphs.insert(h2, (false, false)); } if have_h1 && have_h2 { @@ -362,13 +392,13 @@ pub fn recurse_index_search( } have_h1 && have_h2 - }).collect(); + }).collect();*/ - /*let mut masks: Vec> = vec![vec![BitSet::with_capacity(mol.graph().edge_count()); state.len()]; largest_remove - 1]; + let mut masks: Vec> = vec![vec![BitSet::with_capacity(mol.graph().edge_count()); state.len()]; largest_remove - 1]; subgraph = subgraph.iter().filter(|v| { let m = &matches[*v]; is_usable(state, &m.0, &m.1, &mut masks) - }).collect();*/ + }).collect(); let mut state = Vec::new(); for m in masks[0].iter() { @@ -599,7 +629,7 @@ pub fn index_search( // Catch not-yet-implemented modes. // Enumerate non-overlapping isomorphic subgraph pairs. - let matches = matches(mol); + let (matches, _dag) = matches(mol); /*for (i, m) in matches.iter().enumerate() { println!("{i}: {:?}", m); }*/ From aaca9c8398f7d11ea0e0f4b58d7cf5d8daf52cbd Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Tue, 2 Sep 2025 16:23:11 -0700 Subject: [PATCH 27/51] New matches generation with DAG --- Cargo.lock | 14 +- Cargo.toml | 1 + src/assembly.rs | 337 +++++++++++++++++++++++++++++++++------------- src/reductions.rs | 7 +- 4 files changed, 265 insertions(+), 94 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bc179a87..660f0f07 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -93,6 +93,7 @@ dependencies = [ "dashmap", "fxhash", "graph-canon", + "itertools 0.14.0", "petgraph", "pyo3", "rayon", @@ -322,7 +323,7 @@ dependencies = [ "clap 2.34.0", "criterion-plot", "csv", - "itertools", + "itertools 0.10.5", "lazy_static", "num-traits", "oorandom", @@ -344,7 +345,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2673cc8207403546f45f5fd319a974b1e6983ad1a3ee7e6041650013be041876" dependencies = [ "cast", - "itertools", + "itertools 0.10.5", ] [[package]] @@ -586,6 +587,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.15" diff --git a/Cargo.toml b/Cargo.toml index bbd2c373..84277549 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,6 +20,7 @@ rayon = "1.10.0" dashmap = "6.1.0" ahash = "0.8.12" fxhash = "0.2.1" +itertools = "0.14.0" [dev-dependencies] criterion = "0.3" diff --git a/src/assembly.rs b/src/assembly.rs index f3d04666..ef0ce223 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -29,7 +29,7 @@ use bit_set::BitSet; use clap::ValueEnum; use rayon::iter::{IntoParallelRefIterator, ParallelIterator}; use petgraph::graph::EdgeIndex; -use ahash::RandomState; +use itertools::Itertools; use crate::{ bounds::{bound_exceeded, Bound}, @@ -96,19 +96,212 @@ pub fn depth(mol: &Molecule) -> u32 { ix } +#[derive(Debug)] +pub struct DagNode { + fragment: BitSet, + canon_id: usize, + // added_edge: usize, + children: Vec, +} + +impl DagNode { + pub fn new(fragment: BitSet, canon_id: usize) -> Self { + Self { + fragment, + canon_id, + children: Vec::new(), + } + } +} + + +pub fn matches(mol: &Molecule, canonize_mode: CanonizeMode) -> (HashMap<(usize, usize), usize>, Vec) { + let num_edges = mol.graph().edge_count(); + + let mut subgraphs: Vec = Vec::new(); + let mut constructed_frags: HashSet = HashSet::new(); + let mut frag_to_id: HashMap = HashMap::new(); + + let mut dag: Vec = Vec::with_capacity(num_edges); + let mut matches: Vec<(usize, usize)> = Vec::new(); + + let mut next_canon_id = 0; + + // Generate subgraphs with one edge + for i in 0..num_edges { + let mut new_bitset = BitSet::with_capacity(num_edges); + new_bitset.insert(i); + + let node = DagNode::new(new_bitset, 0); + + dag.push(node); + subgraphs.push(i); + } + + next_canon_id += 1; + + while subgraphs.len() > 0 { + // Initialize new buckets and subgraph list + let mut buckets: HashMap> = HashMap::new(); + let mut child_subgraphs: Vec = Vec::new(); + + // Extend and bucket new subgraphs + for parent_idx in subgraphs.iter() { + // Get fragment to be extended + let fragment = &dag[*parent_idx].fragment; + + // Construct edge neighborhood of fragment + let mut neighborhood = BitSet::with_capacity(num_edges); + for e in fragment { + neighborhood.extend(edge_neighbors(mol.graph(), EdgeIndex::new(e)).map(|x| x.index())); + } + neighborhood.difference_with(fragment); + + // Extend fragment with edges from neighborhood + for e in neighborhood.iter() { + // Create extended fragment + let mut new_fragment = fragment.clone(); + new_fragment.insert(e); + + // Check if fragment has already been created + // If yes, continue + if constructed_frags.contains(&new_fragment) { + continue; + } + + // Bucket subgraph + // Gives ownership of new_fragment to buckets + let canon = canonize(mol, &new_fragment, canonize_mode); + buckets.entry(canon) + .and_modify(|bucket| bucket.push((new_fragment.clone(), *parent_idx))) + .or_insert(vec![(new_fragment.clone(), *parent_idx)]); + + // Add to constructed_frags + constructed_frags.insert(new_fragment); + } + } + + // Go through buckets to create matches + for fragments in buckets.values() { + // Variables to track if a match has been found + let mut frag_has_match = BitSet::with_capacity(fragments.len()); + + // Loop through pairs of bitsets to find matches + for pair in fragments.iter().enumerate().combinations(2) { + // Extract pair of fragments + let frag1 = &pair[0].1.0; + let frag2 = &pair[1].1.0; + + // Check for disjointness + // If the fragments are disjoint, they form a match + if frag1.is_disjoint(&frag2) { + // Extract indices of fragments + let idx1 = pair[0].0; + let idx2 = pair[1].0; + + // If this is the first match for a fragment, create a dag node + if !frag_has_match.contains(idx1) { + // Create new dag node + let new_dag_node = DagNode::new(frag1.clone(), next_canon_id); + let child_idx = dag.len(); + + // Add child to dag list + dag.push(new_dag_node); + + // Add index to parent's child list + let parent_idx = pair[0].1.1; + let mut parent = &mut dag[parent_idx]; + parent.children.push(child_idx); + + // Add dag node to next subgraphs list + child_subgraphs.push(child_idx); + + // Add to frag_to_id map + frag_to_id.insert(frag1.clone(), child_idx); + } + if !frag_has_match.contains(idx2) { + // Create new dag node + let new_dag_node = DagNode::new(frag2.clone(), next_canon_id); + let child_idx = dag.len(); + + // Add child to dag list + dag.push(new_dag_node); + + // Add index to parent's child list + let parent_idx = pair[1].1.1; + let mut parent = &mut dag[parent_idx]; + parent.children.push(child_idx); + + // Add dag node to next subgraphs list + child_subgraphs.push(child_idx); + + // Add to frag_to_id map + frag_to_id.insert(frag2.clone(), child_idx); + } + + // Get fragment ids and add to matches + let frag1_id = frag_to_id.get(&frag1).unwrap(); + let frag2_id = frag_to_id.get(&frag2).unwrap(); + + if frag1_id > frag2_id { + matches.push((*frag1_id, *frag2_id)); + } + else { + matches.push((*frag2_id, *frag1_id)); + } + + // Mark that these fragments have matches + frag_has_match.insert(idx1); + frag_has_match.insert(idx2); + } + } + + // If there was a match, increment canon_id counter + if !frag_has_match.is_empty() { + next_canon_id += 1; + } + } + + // Update to the new subgraphs + subgraphs = child_subgraphs; + } + + // Sort matches + // Larger frag_ids get a smaller match_id + // Thus fragments with many edges have small ids + matches.sort(); + matches.reverse(); + + // give matches ids + let mut next_match_id = 0; + let mut match_to_id: HashMap<(usize, usize), usize> = HashMap::new(); + for m in matches { + match_to_id.insert(m, next_match_id); + next_match_id += 1; + } + + (match_to_id, dag) +} + + /// Helper function for [`index_search`]; only public for benchmarking. /// /// Return all pairs of non-overlapping, isomorphic subgraphs in the molecule, /// sorted to guarantee deterministic iteration. -pub fn matches(mol: &Molecule) -> (Vec<(BitSet, BitSet)>, HashMap)>) { +/*pub fn _matches(mol: &Molecule) -> (Vec<(usize, usize)>, DagNode) { let num_edges = mol.graph().edge_count(); + let mut subgraphs: HashMap = HashMap::new(); let mut size_one_subgraphs: HashMap = HashMap::new(); - let mut matches: Vec<(BitSet, BitSet)> = Vec::new(); - let mut buckets: HashMap> = HashMap::new(); + let mut buckets: HashMap> = HashMap::new(); + + let mut dag: Vec = Vec::with_capacity(num_edges); + let mut canon_ids: HashMap = HashMap::new(); + let mut matches: HashMap<(usize, usize), usize> = Hashmap::new(); - let mut dag: HashMap)> = HashMap::new(); - let mut next_id = 0; + let mut next_frag_id = 1; + let mut next_match_id = 0; + let mut next_canon_id = 1; // Generate subgraphs with one edge for i in 0..num_edges { @@ -116,8 +309,7 @@ pub fn matches(mol: &Molecule) -> (Vec<(BitSet, BitSet)>, HashMap (Vec<(BitSet, BitSet)>, HashMap = HashMap::new(); for (_, bucket) in buckets.iter() { // Generate matches let mut has_match = BitSet::with_capacity(bucket.len()); for (i, (first, first_parent)) in bucket.iter().enumerate() { for (j, (second, second_parent)) in bucket[i+1..].iter().enumerate() { if first.intersection(second).count() == 0 { - if first > second { - matches.push((first.clone(), second.clone())); - } else { - matches.push((second.clone(), first.clone())); - } - if !has_match.contains(i) { + frag_to_id.insert(first.clone(), next_id); + dag.insert(next_id, (first.clone(), Vec::new())); dag.entry(*first_parent) .and_modify(|(_, children)| children.push(next_id)); @@ -180,6 +369,8 @@ pub fn matches(mol: &Molecule) -> (Vec<(BitSet, BitSet)>, HashMap (Vec<(BitSet, BitSet)>, HashMap second { + matches.push((*id1, *id2)); + } else { + matches.push((*id1, *id2)); + } + has_match.insert(i); has_match.insert(i + 1 + j); } @@ -222,7 +421,7 @@ pub fn matches(mol: &Molecule) -> (Vec<(BitSet, BitSet)>, HashMap, + matches: &Vec<(usize, usize)>, + dag: &HashMap)>, graph: &Option, mut subgraph: BitSet, removal_order: Vec, @@ -326,79 +526,18 @@ pub fn recurse_index_search( let largest_remove = { if let Some(v) = subgraph.iter().next() { - matches[v].0.len() + dag.get(matches[v].0).unwrap().0.len(); } else { return (state_index, 1); } }; - // Prune edges of state - /*let mut subgraphs: HashMap<&BitSet, (bool, bool), RandomState> = HashMap::with_capacity_and_hasher(256, RandomState::new()); - let mut masks: Vec> = vec![vec![BitSet::with_capacity(mol.graph().edge_count()); state.len()]; largest_remove - 1]; - subgraph = subgraph.iter().filter(|v| { - let h1 = &matches[*v].0; - let h2 = &matches[*v].1; - - let x1 = subgraphs.get(h1).cloned(); - let x2 = subgraphs.get(h2).cloned(); - - let mut have_h1 = false; - let mut have_h2 = false; - let mut h1_match = false; - let mut h2_match = false; - - if let Some ((i1, i2)) = x1 { - if i1 == false { - return false; - } - have_h1 = i1; - h1_match = i2; - } - else if let Some(_) = state.iter().enumerate().find(|(_, frag)| h1.is_subset(frag)) { - have_h1 = true; - subgraphs.insert(h1, (true, false)); - } - else { - subgraphs.insert(h1, (false, false)); - } - - if let Some ((i1, i2)) = x2 { - if i1 == false { - return false; - } - have_h2 = i1; - h2_match = i2; - } - else if let Some(_) = state.iter().enumerate().find(|(_, frag)| h2.is_subset(frag)) { - have_h2 = true; - subgraphs.insert(h2, (true, false)); - } - else { - subgraphs.insert(h2, (false, false)); - } + let mut masks: Vec> = vec![ + vec![BitSet::with_capacity(mol.graph().edge_count()); state.len()]; + largest_remove - 1 + ]; - if have_h1 && have_h2 { - if h1_match == false { - let (idx, _) = state.iter().enumerate().find(|(_, frag)| h1.is_subset(frag)).unwrap(); - masks[h1.len() - 2][idx].union_with(h1); - subgraphs.get_mut(h1).unwrap().1 = true; - } - if h2_match == false { - let (idx, _) = state.iter().enumerate().find(|(_, frag)| h2.is_subset(frag)).unwrap(); - masks[h2.len() - 2][idx].union_with(h2); - subgraphs.get_mut(h2).unwrap().1 = true; - } - } - - have_h1 && have_h2 - }).collect();*/ - - let mut masks: Vec> = vec![vec![BitSet::with_capacity(mol.graph().edge_count()); state.len()]; largest_remove - 1]; - subgraph = subgraph.iter().filter(|v| { - let m = &matches[*v]; - is_usable(state, &m.0, &m.1, &mut masks) - }).collect(); let mut state = Vec::new(); for m in masks[0].iter() { @@ -502,6 +641,7 @@ pub fn recurse_index_search( let (child_index, child_states_searched) = recurse_index_search( mol, matches, + dag, graph, sub_clone, { @@ -551,7 +691,7 @@ pub fn recurse_index_search( best_child_index.load(Relaxed), states_searched.load(Relaxed), ) -} +}*/ /// Compute a molecule's assembly index and related information using a /// top-down recursive algorithm, parameterized by the specified options. @@ -629,18 +769,31 @@ pub fn index_search( // Catch not-yet-implemented modes. // Enumerate non-overlapping isomorphic subgraph pairs. - let (matches, _dag) = matches(mol); - /*for (i, m) in matches.iter().enumerate() { - println!("{i}: {:?}", m); - }*/ + let (matches, dag) = matches(mol, canonize_mode); + /*let mut match_list: Vec<((usize, usize), usize)> = Vec::new(); + for m in matches.iter() { + match_list.push((*m.0, *m.1)); + } + + for (idx, d) in dag.iter().enumerate() { + println!("{}: {:?}", idx, d); + } + + match_list.sort_by_key(|m| m.1); + for m in match_list { + println!("{:?}", m); + } + println!("{}", matches.len());*/ + + std::process::exit(1); // Create memoization cache. //let mut cache = Cache::new(memoize_mode, canonize_mode); - let mut cache = NewCache::new(memoize_mode); + /*let mut cache = NewCache::new(memoize_mode); let graph = { if clique { - Some(CompatGraph::new(&matches)) + Some(CompatGraph::new(&matches, &dag)) } else { None @@ -663,6 +816,7 @@ pub fn index_search( let (index, states_searched) = recurse_index_search( mol, &matches, + &dag, &graph, subgraph, Vec::new(), @@ -677,7 +831,8 @@ pub fn index_search( println!("Bounded: {}", cache.count()); - (index as u32, matches.len() as u32, states_searched) + (index as u32, matches.len() as u32, states_searched)*/ + (0, 0, 0) } /// Compute a molecule's assembly index using an efficient default strategy. diff --git a/src/reductions.rs b/src/reductions.rs index 5ac282c9..f7e699c6 100644 --- a/src/reductions.rs +++ b/src/reductions.rs @@ -11,6 +11,7 @@ use bit_set::BitSet; use std::time::Instant; +use std::collections::HashMap; /// Graph representation of the compatibility of duplicatable subgraph pairs. pub struct CompatGraph { @@ -21,7 +22,7 @@ pub struct CompatGraph { impl CompatGraph { /// Constructs a compatibility graph given a set of matches. - pub fn new(init_matches: &Vec<(BitSet, BitSet)>) -> Self { + pub fn new(init_matches: &Vec<(usize, usize)>, dag: &HashMap)>) -> Self { let start = Instant::now(); let size = init_matches.len(); @@ -35,6 +36,10 @@ impl CompatGraph { for (idx1, (h1, h2)) in init_matches.iter().enumerate() { for (idx2, (h1p, h2p)) in init_matches[idx1 + 1..].iter().enumerate() { let idx2 = idx2 + idx1 + 1; + let h1 = &dag.get(h1).unwrap().0; + let h1p = &dag.get(h1p).unwrap().0; + let h2 = &dag.get(h2).unwrap().0; + let h2p = &dag.get(h2p).unwrap().0; let forward_compatible = { h2.is_disjoint(h1p) && From e700ed68cc8ce363d52ad2f85878364ac1ba4e5f Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Wed, 3 Sep 2025 13:52:38 -0700 Subject: [PATCH 28/51] Enumerate matches using dag --- src/assembly.rs | 246 ++++++++++++++++++++++++++---------------------- 1 file changed, 132 insertions(+), 114 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index ef0ce223..3c52ef12 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -19,7 +19,7 @@ //! ``` use std::{ - collections::{HashMap, HashSet}, sync::{ + collections::{BTreeMap, HashMap, HashSet}, sync::{ atomic::{AtomicUsize, Ordering::Relaxed}, Arc, } @@ -27,9 +27,11 @@ use std::{ use bit_set::BitSet; use clap::ValueEnum; +use graph_canon::canon; use rayon::iter::{IntoParallelRefIterator, ParallelIterator}; use petgraph::graph::EdgeIndex; use itertools::Itertools; +use std::cmp::{max, min}; use crate::{ bounds::{bound_exceeded, Bound}, @@ -142,7 +144,7 @@ pub fn matches(mol: &Molecule, canonize_mode: CanonizeMode) -> (HashMap<(usize, while subgraphs.len() > 0 { // Initialize new buckets and subgraph list - let mut buckets: HashMap> = HashMap::new(); + let mut buckets: BTreeMap> = BTreeMap::new(); let mut child_subgraphs: Vec = Vec::new(); // Extend and bucket new subgraphs @@ -506,11 +508,10 @@ fn is_usable(state: &[BitSet], h1: &BitSet, h2: &BitSet, masks: &mut Vec, - dag: &HashMap)>, - graph: &Option, + matches: &HashMap<(usize, usize), usize>, + dag: &Vec, mut subgraph: BitSet, removal_order: Vec, state: &[BitSet], @@ -523,85 +524,144 @@ fn is_usable(state: &[BitSet], h1: &BitSet, h2: &BitSet, masks: &mut Vec (usize, usize) { //println!("{:?}", removal_order); //println!("{:?}", state); + + let num_edges = mol.graph().edge_count(); + let mut enum_matches: Vec<(usize, usize)> = Vec::new(); + + // MATCHES METHOD 1 + // EXTEND ALWAYS AND DO MATCHING AT END + let mut subgraphs: Vec = Vec::new(); + let mut buckets: HashMap> = HashMap::new(); - let largest_remove = { - if let Some(v) = subgraph.iter().next() { - dag.get(matches[v].0).unwrap().0.len(); + // create subgraphs of size 1 + for fragment in state.iter() { + for edge_idx in fragment.iter() { + subgraphs.push(edge_idx); } - else { - return (state_index, 1); + } + + while subgraphs.len() > 0 { + // Subgraphs to extend in next loop + let mut new_subgraphs = Vec::new(); + + // Extend each subgraph and bucket + for frag_id in subgraphs { + // Get ids of extensions of this subgraph + let children_ids = &dag[frag_id].children; + + for child_id in children_ids { + // Check if this extension is valid + // TODO: save fragment id + // TODO: save edge id in DAG + let child_frag = &dag[*child_id].fragment; + let valid = state.iter().any(|frag| child_frag.is_subset(frag)); + + if valid { + let canon_id = &dag[*child_id].canon_id; + + // Add fragment to bucket + buckets.entry(*canon_id) + .and_modify(|bucket| bucket.push(*child_id)) + .or_insert(vec![*child_id]); + + // Add fragment to new subgraphs + new_subgraphs.push(*child_id); + } + } } - }; - let mut masks: Vec> = vec![ - vec![BitSet::with_capacity(mol.graph().edge_count()); state.len()]; + // Update to new subgraphs + subgraphs = new_subgraphs; + } + + // Generate matches from buckets of ismorphic fragments + for fragments in buckets.values() { + // Loop over pairs of fragments + for pair in fragments.iter().combinations(2) { + let frag1 = *max(pair[0], pair[1]); + let frag2 = *min(pair[0], pair[1]); + + // Check for valid match + // If valid, add to matches to iterate over + if let Some(x) = matches.get(&(frag1, frag2)) { + enum_matches.push((frag1, frag2)); + } + } + } + + // If there are no matches, return + if enum_matches.len() == 0 { + return (state_index, 1); + } + + enum_matches.sort(); + enum_matches.reverse(); + + // Modify mask + // TODO: try storing fragment so search can be reduced + // TODO: try with a set for frag_ids to only do intersection once per frag + let largest_remove = dag[enum_matches[0].0].fragment.len(); + let mut masks = vec![ + vec![BitSet::with_capacity(num_edges); state.len()]; largest_remove - 1 ]; + for m in enum_matches.iter() { + // Extract fragments + let frag1 = &dag[m.0].fragment; + let frag2 = &dag[m.1].fragment; + let len = frag1.len() - 2; + + // Find indices of fragments that contain frag1 and frag2 + let (frag1_idx, _) = state.iter() + .enumerate() + .find(|(_, state_frag)| frag1.is_subset(state_frag)) + .unwrap(); + let (frag2_idx, _) = state.iter() + .enumerate() + .find(|(_, state_frag)| frag2.is_subset(state_frag)) + .unwrap(); + + // Union with masks to indicate that these edges are used + masks[len][frag1_idx].union_with(frag1); + masks[len][frag2_idx].union_with(frag2); + } + // Let new state be only the edges which can be matched let mut state = Vec::new(); - for m in masks[0].iter() { - if m.len() >= 2 { - state.extend(connected_components_under_edges(mol.graph(), m)); - } - } - // Int bound - let mut max = Vec::new(); - for (i, list) in masks.iter().enumerate() { - let mut val = 0; - let i = i + 2; - for (j, frag) in list.iter().enumerate() { - let mf = masks[0][j].len(); - let mi = frag.len(); - let x = mf + (mi % i) - mi; - val += mf - (mi / i) - x / (i-1) - (x % (i-1) != 0) as usize; - } - val -= (i as f32).log2().ceil() as usize; - if max.len() == 0 { - max.push(val); - } - else { - max.push(val.max(max[max.len() - 1])); - } + for frag in masks[0].iter() { + // Split into connected components + let connected_frags = connected_components_under_edges(mol.graph(), &frag); + + // Add connected components to new state + state.extend(connected_frags); } + + + // MATCHES METHOD 2 + // ONLY EXTEND IF A SUBGRAPH HAS A MATCH - let best = best_index.load(Relaxed); - let mut idx = 2; - while (idx < max.len() + 2) && best + max[idx - 2] <= state_index { - idx += 1; - } // Memoization if cache.memoize_state(mol, &state, state_index, &removal_order) { return (state_index, 1); } - // Apply kernels - let mut must_include = matches.len(); - if kernel_mode != KernelMode::None { - let g = graph.as_ref().unwrap(); - - // Deletion kernel - // Returns a subgraph without nodes that never occur in an optimal solution. - subgraph = deletion_kernel(matches, g, subgraph); - - // Inclusion kernel. - // must_include is the first match in the matches list that will be included in an optimal solution. - must_include = inclusion_kernel(matches, g, &subgraph); - } - // Keep track of the best assembly index found in any of this assembly // state's children and the number of states searched, including this one. let best_child_index = AtomicUsize::from(state_index); let states_searched = AtomicUsize::from(1); - + // Define a closure that handles recursing to a new assembly state based on // the given (enumerated) pair of non-overlapping isomorphic subgraphs. - let recurse_on_match = |v: usize| { - let (h1, h2) = &matches[v]; + let recurse_on_match = |v: (usize, usize)| { + //let (h1, h2) = &matches[v]; + let h1 = &dag[v.0].fragment; + let h2 = &dag[v.1].fragment; + + // TODO: try keeping track of fragment to elimate some some search if let Some(fragments) = fragments(mol, &state, h1, h2) { - // If using depth-one parallelism, all descendant states should be // computed serially. let new_parallel = if parallel_mode == ParallelMode::DepthOne { @@ -610,43 +670,15 @@ fn is_usable(state: &[BitSet], h1: &BitSet, h2: &BitSet, masks: &mut Vec v { - sub_clone.insert(j); - } - } - } - // Recurse using the remaining matches and updated fragments. let (child_index, child_states_searched) = recurse_index_search( mol, matches, dag, - graph, - sub_clone, + BitSet::new(), { let mut clone = removal_order.clone(); - clone.push(v); + clone.push(*matches.get(&v).unwrap()); clone }, &fragments, @@ -655,7 +687,7 @@ fn is_usable(state: &[BitSet], h1: &BitSet, h2: &BitSet, masks: &mut Vec= idx) - .for_each(|v| recurse_on_match(v)); + .for_each(|v| recurse_on_match(*v)); } else { - let _ = subgraph - .iter() - .filter(|v| *v <= must_include /*&& matches[*v].0.len() >= idx*/) - .collect::>() + enum_matches .par_iter() .for_each(|v| recurse_on_match(*v)); } - /*if removal_order.len() == 1 { - println!("{:?}, {}", removal_order, states_searched.load(Relaxed)); - }*/ - ( best_child_index.load(Relaxed), states_searched.load(Relaxed), ) -}*/ +} /// Compute a molecule's assembly index and related information using a /// top-down recursive algorithm, parameterized by the specified options. @@ -770,6 +792,7 @@ pub fn index_search( // Enumerate non-overlapping isomorphic subgraph pairs. let (matches, dag) = matches(mol, canonize_mode); + /*let mut match_list: Vec<((usize, usize), usize)> = Vec::new(); for m in matches.iter() { match_list.push((*m.0, *m.1)); @@ -785,20 +808,18 @@ pub fn index_search( } println!("{}", matches.len());*/ - std::process::exit(1); - // Create memoization cache. //let mut cache = Cache::new(memoize_mode, canonize_mode); - /*let mut cache = NewCache::new(memoize_mode); + let mut cache = NewCache::new(memoize_mode); - let graph = { + /*let graph = { if clique { Some(CompatGraph::new(&matches, &dag)) } else { None } - }; + };*/ // Initialize the first fragment as the entire graph. let mut init = BitSet::new(); @@ -812,12 +833,10 @@ pub fn index_search( // Search for the shortest assembly pathway recursively. let edge_count = mol.graph().edge_count(); let best_index = Arc::new(AtomicUsize::from(edge_count - 1)); - //let best_index = Arc::new(AtomicUsize::from(12)); let (index, states_searched) = recurse_index_search( mol, &matches, &dag, - &graph, subgraph, Vec::new(), &[init], @@ -831,8 +850,7 @@ pub fn index_search( println!("Bounded: {}", cache.count()); - (index as u32, matches.len() as u32, states_searched)*/ - (0, 0, 0) + (index as u32, matches.len() as u32, states_searched) } /// Compute a molecule's assembly index using an efficient default strategy. From 0997b005d2b95b26ea08578ef53eae3e483e3121 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Wed, 3 Sep 2025 15:25:50 -0700 Subject: [PATCH 29/51] Create masks with frag_id to state_id map --- src/assembly.rs | 86 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 57 insertions(+), 29 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 3c52ef12..9510b44f 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -28,10 +28,11 @@ use std::{ use bit_set::BitSet; use clap::ValueEnum; use graph_canon::canon; -use rayon::iter::{IntoParallelRefIterator, ParallelIterator}; +use rayon::iter::{IntoParallelIterator, IntoParallelRefIterator, ParallelIterator, IndexedParallelIterator}; use petgraph::graph::EdgeIndex; use itertools::Itertools; use std::cmp::{max, min}; +use ahash::RandomState; use crate::{ bounds::{bound_exceeded, Bound}, @@ -102,7 +103,6 @@ pub fn depth(mol: &Molecule) -> u32 { pub struct DagNode { fragment: BitSet, canon_id: usize, - // added_edge: usize, children: Vec, } @@ -512,6 +512,7 @@ pub fn recurse_index_search( mol: &Molecule, matches: &HashMap<(usize, usize), usize>, dag: &Vec, + last_removed: usize, mut subgraph: BitSet, removal_order: Vec, state: &[BitSet], @@ -525,18 +526,18 @@ pub fn recurse_index_search( //println!("{:?}", removal_order); //println!("{:?}", state); - let num_edges = mol.graph().edge_count(); - let mut enum_matches: Vec<(usize, usize)> = Vec::new(); - // MATCHES METHOD 1 // EXTEND ALWAYS AND DO MATCHING AT END - let mut subgraphs: Vec = Vec::new(); - let mut buckets: HashMap> = HashMap::new(); + let num_edges = mol.graph().edge_count(); + + let mut enum_matches: Vec<(usize, usize)> = Vec::new(); + let mut subgraphs: Vec<(usize, usize)> = Vec::new(); + let mut buckets: HashMap> = HashMap::new(); // create subgraphs of size 1 - for fragment in state.iter() { + for (state_id, fragment) in state.iter().enumerate() { for edge_idx in fragment.iter() { - subgraphs.push(edge_idx); + subgraphs.push((edge_idx, state_id)); } } @@ -545,27 +546,29 @@ pub fn recurse_index_search( let mut new_subgraphs = Vec::new(); // Extend each subgraph and bucket - for frag_id in subgraphs { + for (frag_id, state_id) in subgraphs { + // Get state frament that this subgraph is contained in + let state_frag = &state[state_id]; + // Get ids of extensions of this subgraph let children_ids = &dag[frag_id].children; for child_id in children_ids { // Check if this extension is valid - // TODO: save fragment id - // TODO: save edge id in DAG + // i.e. the extended subgraph is contained in a fragment of state let child_frag = &dag[*child_id].fragment; - let valid = state.iter().any(|frag| child_frag.is_subset(frag)); + let valid = child_frag.is_subset(state_frag); if valid { let canon_id = &dag[*child_id].canon_id; // Add fragment to bucket buckets.entry(*canon_id) - .and_modify(|bucket| bucket.push(*child_id)) - .or_insert(vec![*child_id]); + .and_modify(|bucket| bucket.push((*child_id, state_id))) + .or_insert(vec![(*child_id, state_id)]); // Add fragment to new subgraphs - new_subgraphs.push(*child_id); + new_subgraphs.push((*child_id, state_id)); } } } @@ -574,6 +577,8 @@ pub fn recurse_index_search( subgraphs = new_subgraphs; } + let mut frag_id_to_state_idx: HashMap = HashMap::with_hasher(RandomState::new()); + // Generate matches from buckets of ismorphic fragments for fragments in buckets.values() { // Loop over pairs of fragments @@ -581,10 +586,23 @@ pub fn recurse_index_search( let frag1 = *max(pair[0], pair[1]); let frag2 = *min(pair[0], pair[1]); + let (frag1_id, frag1_state_idx) = frag1; + let (frag2_id, frag2_state_idx) = frag2; + // Check for valid match // If valid, add to matches to iterate over - if let Some(x) = matches.get(&(frag1, frag2)) { - enum_matches.push((frag1, frag2)); + if let Some(x) = matches.get(&(frag1_id, frag2_id)) { + // Only add match if it occurs later in the ordering than + // the last removed match + if *x >= last_removed { + enum_matches.push((frag1_id, frag2_id)); + + // Store which state fragments the matched subgraphs are contained in + frag_id_to_state_idx.entry(frag1_id) + .or_insert(frag1_state_idx); + frag_id_to_state_idx.entry(frag2_id) + .or_insert(frag2_state_idx); + } } } } @@ -598,15 +616,13 @@ pub fn recurse_index_search( enum_matches.reverse(); // Modify mask - // TODO: try storing fragment so search can be reduced - // TODO: try with a set for frag_ids to only do intersection once per frag let largest_remove = dag[enum_matches[0].0].fragment.len(); let mut masks = vec![ vec![BitSet::with_capacity(num_edges); state.len()]; largest_remove - 1 ]; - for m in enum_matches.iter() { + /*for m in enum_matches.iter() { // Extract fragments let frag1 = &dag[m.0].fragment; let frag2 = &dag[m.1].fragment; @@ -625,18 +641,25 @@ pub fn recurse_index_search( // Union with masks to indicate that these edges are used masks[len][frag1_idx].union_with(frag1); masks[len][frag2_idx].union_with(frag2); + }*/ + + for (frag_id, state_idx) in frag_id_to_state_idx.iter() { + let frag = &dag[*frag_id].fragment; + let len = frag.len() - 2; + + masks[len][*state_idx].union_with(frag); } // Let new state be only the edges which can be matched let mut state = Vec::new(); for frag in masks[0].iter() { - // Split into connected components - let connected_frags = connected_components_under_edges(mol.graph(), &frag); - // Add connected components to new state - state.extend(connected_frags); + state.extend(connected_components_under_edges(mol.graph(), &frag)); } + + // Remove frags of size 1 or less + state.retain(|frag| frag.len() >= 2); // MATCHES METHOD 2 @@ -655,7 +678,7 @@ pub fn recurse_index_search( // Define a closure that handles recursing to a new assembly state based on // the given (enumerated) pair of non-overlapping isomorphic subgraphs. - let recurse_on_match = |v: (usize, usize)| { + let recurse_on_match = |i: usize, v: (usize, usize)| { //let (h1, h2) = &matches[v]; let h1 = &dag[v.0].fragment; let h2 = &dag[v.1].fragment; @@ -675,10 +698,12 @@ pub fn recurse_index_search( mol, matches, dag, + *matches.get(&v).unwrap(), BitSet::new(), { let mut clone = removal_order.clone(); - clone.push(*matches.get(&v).unwrap()); + // clone.push(*matches.get(&v).unwrap()); + clone.push(i); clone }, &fragments, @@ -702,11 +727,13 @@ pub fn recurse_index_search( if parallel_mode == ParallelMode::None { enum_matches .iter() - .for_each(|v| recurse_on_match(*v)); + .enumerate() + .for_each(|(i, v)| recurse_on_match(i, *v)); } else { enum_matches .par_iter() - .for_each(|v| recurse_on_match(*v)); + .enumerate() + .for_each(|(i, v)| recurse_on_match(i, *v)); } ( @@ -837,6 +864,7 @@ pub fn index_search( mol, &matches, &dag, + 0, subgraph, Vec::new(), &[init], From 87ce547d331a706c08e69ddfae2adda641e58a45 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Wed, 3 Sep 2025 15:27:56 -0700 Subject: [PATCH 30/51] Create masks by searching through state --- src/assembly.rs | 30 ++++++------------------------ 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 9510b44f..d1b0ef73 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -532,7 +532,7 @@ pub fn recurse_index_search( let mut enum_matches: Vec<(usize, usize)> = Vec::new(); let mut subgraphs: Vec<(usize, usize)> = Vec::new(); - let mut buckets: HashMap> = HashMap::new(); + let mut buckets: HashMap> = HashMap::new(); // create subgraphs of size 1 for (state_id, fragment) in state.iter().enumerate() { @@ -564,8 +564,8 @@ pub fn recurse_index_search( // Add fragment to bucket buckets.entry(*canon_id) - .and_modify(|bucket| bucket.push((*child_id, state_id))) - .or_insert(vec![(*child_id, state_id)]); + .and_modify(|bucket| bucket.push(*child_id)) + .or_insert(vec![*child_id]); // Add fragment to new subgraphs new_subgraphs.push((*child_id, state_id)); @@ -577,8 +577,6 @@ pub fn recurse_index_search( subgraphs = new_subgraphs; } - let mut frag_id_to_state_idx: HashMap = HashMap::with_hasher(RandomState::new()); - // Generate matches from buckets of ismorphic fragments for fragments in buckets.values() { // Loop over pairs of fragments @@ -586,22 +584,13 @@ pub fn recurse_index_search( let frag1 = *max(pair[0], pair[1]); let frag2 = *min(pair[0], pair[1]); - let (frag1_id, frag1_state_idx) = frag1; - let (frag2_id, frag2_state_idx) = frag2; - // Check for valid match // If valid, add to matches to iterate over - if let Some(x) = matches.get(&(frag1_id, frag2_id)) { + if let Some(x) = matches.get(&(frag1, frag2)) { // Only add match if it occurs later in the ordering than // the last removed match if *x >= last_removed { - enum_matches.push((frag1_id, frag2_id)); - - // Store which state fragments the matched subgraphs are contained in - frag_id_to_state_idx.entry(frag1_id) - .or_insert(frag1_state_idx); - frag_id_to_state_idx.entry(frag2_id) - .or_insert(frag2_state_idx); + enum_matches.push((frag1, frag2)); } } } @@ -622,7 +611,7 @@ pub fn recurse_index_search( largest_remove - 1 ]; - /*for m in enum_matches.iter() { + for m in enum_matches.iter() { // Extract fragments let frag1 = &dag[m.0].fragment; let frag2 = &dag[m.1].fragment; @@ -641,13 +630,6 @@ pub fn recurse_index_search( // Union with masks to indicate that these edges are used masks[len][frag1_idx].union_with(frag1); masks[len][frag2_idx].union_with(frag2); - }*/ - - for (frag_id, state_idx) in frag_id_to_state_idx.iter() { - let frag = &dag[*frag_id].fragment; - let len = frag.len() - 2; - - masks[len][*state_idx].union_with(frag); } // Let new state be only the edges which can be matched From e66d91143d12fa4c3eedf9737f0f8380d1bd659e Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Wed, 3 Sep 2025 16:03:26 -0700 Subject: [PATCH 31/51] Add int bound --- src/assembly.rs | 54 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index d1b0ef73..be089906 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -642,6 +642,45 @@ pub fn recurse_index_search( // Remove frags of size 1 or less state.retain(|frag| frag.len() >= 2); + + + let best = best_index.load(Relaxed); + let mut best_bound = 0; + let mut largest_length = 2; + + // Use bounding strategy to find the largest length match + // to try removing from this state. + for (i, list) in masks.iter().enumerate() { + let mut bound = 0; + let i = i + 2; + for (j, frag) in list.iter().enumerate() { + let mf = masks[0][j].len(); + let mi = frag.len(); + let x = mf + (mi % i) - mi; + bound += mf - (mi / i) - x / (i-1) - (x % (i-1) != 0) as usize; + } + bound -= (i as f32).log2().ceil() as usize; + + best_bound = max(best_bound, bound); + + // Check if removing at this length can give a more optimal answer + // If yes, stop and return the largest_length to remove + if state_index - best_bound < best { + break; + } + + largest_length += 1; + } + + // Remove from enum_matches any matches that have size smaller + // than largest_length, and thus their removal does not need to be tested + if largest_length > 2 { + let mut final_idx = 0; + while dag[enum_matches[final_idx].0].fragment.len() >= largest_length { + final_idx += 1; + } + enum_matches.truncate(final_idx); + } // MATCHES METHOD 2 @@ -802,21 +841,6 @@ pub fn index_search( // Enumerate non-overlapping isomorphic subgraph pairs. let (matches, dag) = matches(mol, canonize_mode); - /*let mut match_list: Vec<((usize, usize), usize)> = Vec::new(); - for m in matches.iter() { - match_list.push((*m.0, *m.1)); - } - - for (idx, d) in dag.iter().enumerate() { - println!("{}: {:?}", idx, d); - } - - match_list.sort_by_key(|m| m.1); - for m in match_list { - println!("{:?}", m); - } - println!("{}", matches.len());*/ - // Create memoization cache. //let mut cache = Cache::new(memoize_mode, canonize_mode); let mut cache = NewCache::new(memoize_mode); From a14e9ed6f8d4a914b619dd65c4b609fa4e44dde9 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Wed, 3 Sep 2025 16:09:13 -0700 Subject: [PATCH 32/51] Cleanup some code --- src/assembly.rs | 178 +++++------------------------------------------- 1 file changed, 18 insertions(+), 160 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index be089906..c5d1bed2 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -116,7 +116,10 @@ impl DagNode { } } - +/// Helper function for [`index_search`]; only public for benchmarking. +/// +/// Return all pairs of non-overlapping, isomorphic subgraphs in the molecule, +/// sorted to guarantee deterministic iteration. pub fn matches(mol: &Molecule, canonize_mode: CanonizeMode) -> (HashMap<(usize, usize), usize>, Vec) { let num_edges = mol.graph().edge_count(); @@ -285,146 +288,6 @@ pub fn matches(mol: &Molecule, canonize_mode: CanonizeMode) -> (HashMap<(usize, (match_to_id, dag) } - -/// Helper function for [`index_search`]; only public for benchmarking. -/// -/// Return all pairs of non-overlapping, isomorphic subgraphs in the molecule, -/// sorted to guarantee deterministic iteration. -/*pub fn _matches(mol: &Molecule) -> (Vec<(usize, usize)>, DagNode) { - let num_edges = mol.graph().edge_count(); - - let mut subgraphs: HashMap = HashMap::new(); - let mut size_one_subgraphs: HashMap = HashMap::new(); - let mut buckets: HashMap> = HashMap::new(); - - let mut dag: Vec = Vec::with_capacity(num_edges); - let mut canon_ids: HashMap = HashMap::new(); - let mut matches: HashMap<(usize, usize), usize> = Hashmap::new(); - - let mut next_frag_id = 1; - let mut next_match_id = 0; - let mut next_canon_id = 1; - - // Generate subgraphs with one edge - for i in 0..num_edges { - let mut new_bitset = BitSet::with_capacity(num_edges); - new_bitset.insert(i); - size_one_subgraphs.insert(new_bitset, i); - - dag.push(DagNode::new(new_bitset, 0, 0)); - } - - // Generate subgraphs with two edges - for (subgraph, idx) in size_one_subgraphs.iter() { - let mut neighborhood = BitSet::with_capacity(num_edges); - for e in subgraph { - neighborhood.extend(edge_neighbors(mol.graph(), EdgeIndex::new(e)).map(|x| x.index())); - } - neighborhood.difference_with(subgraph); - - for e in neighborhood.iter() { - let mut new_subgraph = subgraph.clone(); - new_subgraph.insert(e); - subgraphs.entry(new_subgraph) - .or_insert(*idx); - } - } - - // Generate subgraphs with more than two edges - while subgraphs.len() > 0 { - buckets.clear(); - // Generate buckets - for (g, parent) in subgraphs.iter() { - let canon = canonize(mol, g, CanonizeMode::TreeNauty); - buckets.entry(canon) - .and_modify(|v| v.push((g.clone(), *parent))) - .or_insert(vec![(g.clone(), *parent)]); - } - - subgraphs.clear(); - let mut frag_to_id: HashMap = HashMap::new(); - for (_, bucket) in buckets.iter() { - // Generate matches - let mut has_match = BitSet::with_capacity(bucket.len()); - for (i, (first, first_parent)) in bucket.iter().enumerate() { - for (j, (second, second_parent)) in bucket[i+1..].iter().enumerate() { - if first.intersection(second).count() == 0 { - if !has_match.contains(i) { - frag_to_id.insert(first.clone(), next_id); - - dag.insert(next_id, (first.clone(), Vec::new())); - dag.entry(*first_parent) - .and_modify(|(_, children)| children.push(next_id)); - - let mut neighborhood = BitSet::with_capacity(num_edges); - for e in first { - neighborhood.extend(edge_neighbors(mol.graph(), EdgeIndex::new(e)).map(|x| x.index())); - } - neighborhood.difference_with(first); - - for e in neighborhood.iter() { - let mut new_subgraph = first.clone(); - new_subgraph.insert(e); - subgraphs.insert(new_subgraph, next_id); - } - - next_id += 1; - } - if !has_match.contains(i + 1 + j) { - frag_to_id.insert(second.clone(), next_id); - - dag.insert(next_id, (second.clone(), Vec::new())); - dag.entry(*second_parent) - .and_modify(|(_, children)| children.push(next_id)); - - let mut neighborhood = BitSet::with_capacity(num_edges); - for e in second { - neighborhood.extend(edge_neighbors(mol.graph(), EdgeIndex::new(e)).map(|x| x.index())); - } - neighborhood.difference_with(second); - - for e in neighborhood.iter() { - let mut new_subgraph = second.clone(); - new_subgraph.insert(e); - subgraphs.insert(new_subgraph, next_id); - } - - next_id += 1; - } - - let id1 = frag_to_id.get(first).unwrap(); - let id2 = frag_to_id.get(second).unwrap(); - if first > second { - matches.push((*id1, *id2)); - } else { - matches.push((*id1, *id2)); - } - - has_match.insert(i); - has_match.insert(i + 1 + j); - } - } - } - } - } - - // Sort matches - matches.sort_by(|e1, e2| { - let ord = [ - e2.0.len().cmp(&e1.0.len()), // Decreasing subgraph size. - e1.0.cmp(&e2.0), // First subgraph lexicographical. - e1.1.cmp(&e2.1), // Second subgraph lexicographical. - ]; - let mut i = 0; - while ord[i] == std::cmp::Ordering::Equal { - i += 1; - } - ord[i] - }); - - (matches, dag) -}*/ - /// Determine the fragments produced from the given assembly state by removing /// the given pair of non-overlapping, isomorphic subgraphs and then adding one /// back; return `None` if not possible. @@ -526,15 +389,14 @@ pub fn recurse_index_search( //println!("{:?}", removal_order); //println!("{:?}", state); - // MATCHES METHOD 1 - // EXTEND ALWAYS AND DO MATCHING AT END + // Generate matches let num_edges = mol.graph().edge_count(); - let mut enum_matches: Vec<(usize, usize)> = Vec::new(); + let mut valid_matches: Vec<(usize, usize)> = Vec::new(); let mut subgraphs: Vec<(usize, usize)> = Vec::new(); let mut buckets: HashMap> = HashMap::new(); - // create subgraphs of size 1 + // Create subgraphs of size 1 for (state_id, fragment) in state.iter().enumerate() { for edge_idx in fragment.iter() { subgraphs.push((edge_idx, state_id)); @@ -590,28 +452,28 @@ pub fn recurse_index_search( // Only add match if it occurs later in the ordering than // the last removed match if *x >= last_removed { - enum_matches.push((frag1, frag2)); + valid_matches.push((frag1, frag2)); } } } } // If there are no matches, return - if enum_matches.len() == 0 { + if valid_matches.len() == 0 { return (state_index, 1); } - enum_matches.sort(); - enum_matches.reverse(); + valid_matches.sort(); + valid_matches.reverse(); // Modify mask - let largest_remove = dag[enum_matches[0].0].fragment.len(); + let largest_remove = dag[valid_matches[0].0].fragment.len(); let mut masks = vec![ vec![BitSet::with_capacity(num_edges); state.len()]; largest_remove - 1 ]; - for m in enum_matches.iter() { + for m in valid_matches.iter() { // Extract fragments let frag1 = &dag[m.0].fragment; let frag2 = &dag[m.1].fragment; @@ -672,19 +534,15 @@ pub fn recurse_index_search( largest_length += 1; } - // Remove from enum_matches any matches that have size smaller + // Remove from valid_matches any matches that have size smaller // than largest_length, and thus their removal does not need to be tested if largest_length > 2 { let mut final_idx = 0; - while dag[enum_matches[final_idx].0].fragment.len() >= largest_length { + while dag[valid_matches[final_idx].0].fragment.len() >= largest_length { final_idx += 1; } - enum_matches.truncate(final_idx); + valid_matches.truncate(final_idx); } - - - // MATCHES METHOD 2 - // ONLY EXTEND IF A SUBGRAPH HAS A MATCH // Memoization @@ -746,12 +604,12 @@ pub fn recurse_index_search( // Use the iterator type corresponding to the specified parallelism mode. if parallel_mode == ParallelMode::None { - enum_matches + valid_matches .iter() .enumerate() .for_each(|(i, v)| recurse_on_match(i, *v)); } else { - enum_matches + valid_matches .par_iter() .enumerate() .for_each(|(i, v)| recurse_on_match(i, *v)); From b2689d8c85638ee319ad475480e4bad113b12305 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Wed, 3 Sep 2025 16:47:23 -0700 Subject: [PATCH 33/51] Don't extend subgraphs without a match --- src/assembly.rs | 72 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 49 insertions(+), 23 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index c5d1bed2..edc630f1 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -394,7 +394,6 @@ pub fn recurse_index_search( let mut valid_matches: Vec<(usize, usize)> = Vec::new(); let mut subgraphs: Vec<(usize, usize)> = Vec::new(); - let mut buckets: HashMap> = HashMap::new(); // Create subgraphs of size 1 for (state_id, fragment) in state.iter().enumerate() { @@ -405,6 +404,7 @@ pub fn recurse_index_search( while subgraphs.len() > 0 { // Subgraphs to extend in next loop + let mut buckets: HashMap> = HashMap::new(); let mut new_subgraphs = Vec::new(); // Extend each subgraph and bucket @@ -426,36 +426,62 @@ pub fn recurse_index_search( // Add fragment to bucket buckets.entry(*canon_id) - .and_modify(|bucket| bucket.push(*child_id)) - .or_insert(vec![*child_id]); - - // Add fragment to new subgraphs - new_subgraphs.push((*child_id, state_id)); + .and_modify(|bucket| bucket.push((*child_id, state_id))) + .or_insert(vec![(*child_id, state_id)]); } } } - // Update to new subgraphs - subgraphs = new_subgraphs; - } + // Search through buckets and create matches + for fragments in buckets.values() { + let mut has_match = BitSet::with_capacity(fragments.len()); + // Loop over pairs of fragments + for pair in fragments.iter().enumerate().combinations(2) { + let mut frag1; + let mut frag2; - // Generate matches from buckets of ismorphic fragments - for fragments in buckets.values() { - // Loop over pairs of fragments - for pair in fragments.iter().combinations(2) { - let frag1 = *max(pair[0], pair[1]); - let frag2 = *min(pair[0], pair[1]); - - // Check for valid match - // If valid, add to matches to iterate over - if let Some(x) = matches.get(&(frag1, frag2)) { - // Only add match if it occurs later in the ordering than - // the last removed match - if *x >= last_removed { - valid_matches.push((frag1, frag2)); + // Order fragments + if pair[0].1.0 > pair[1].1.0 { + frag1 = pair[0]; + frag2 = pair[1]; + } + else { + frag1 = pair[1]; + frag2 = pair[0]; + } + + let frag1_bucket_idx = frag1.0; + let frag1_id = (*frag1.1).0; + let frag1_state_id = (*frag1.1).1; + + let frag2_bucket_idx = frag2.0; + let frag2_id = (*frag2.1).0; + let frag2_state_id = (*frag2.1).1; + + // Check for valid match + if let Some(x) = matches.get(&(frag1_id, frag2_id)) { + // Only add match if it occurs later in the ordering than + // the last removed match + if *x >= last_removed { + valid_matches.push((frag1_id, frag2_id)); + + // If this is the first time seeing that this frag has a match, + // add it to the new_subgraphs list + if !has_match.contains(frag1_bucket_idx) { + new_subgraphs.push((frag1_id, frag1_state_id)); + has_match.insert(frag1_bucket_idx); + } + if !has_match.contains(frag2_bucket_idx) { + new_subgraphs.push((frag2_id, frag2_state_id)); + has_match.insert(frag2_bucket_idx); + } + } } } } + + // Update to new subgraphs + subgraphs = new_subgraphs; } // If there are no matches, return From fc86feea86ccbed4c8219bfef93719b7bf47bade Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Thu, 4 Sep 2025 12:30:21 -0700 Subject: [PATCH 34/51] Add timers --- src/assembly.rs | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index edc630f1..211d126e 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -22,7 +22,7 @@ use std::{ collections::{BTreeMap, HashMap, HashSet}, sync::{ atomic::{AtomicUsize, Ordering::Relaxed}, Arc, - } + }, time::{Duration, Instant} }; use bit_set::BitSet; @@ -33,6 +33,7 @@ use petgraph::graph::EdgeIndex; use itertools::Itertools; use std::cmp::{max, min}; use ahash::RandomState; +use fxhash::FxHashMap; use crate::{ bounds::{bound_exceeded, Bound}, @@ -120,7 +121,7 @@ impl DagNode { /// /// Return all pairs of non-overlapping, isomorphic subgraphs in the molecule, /// sorted to guarantee deterministic iteration. -pub fn matches(mol: &Molecule, canonize_mode: CanonizeMode) -> (HashMap<(usize, usize), usize>, Vec) { +pub fn matches(mol: &Molecule, canonize_mode: CanonizeMode) -> (FxHashMap<(usize, usize), usize>, Vec) { let num_edges = mol.graph().edge_count(); let mut subgraphs: Vec = Vec::new(); @@ -279,7 +280,7 @@ pub fn matches(mol: &Molecule, canonize_mode: CanonizeMode) -> (HashMap<(usize, // give matches ids let mut next_match_id = 0; - let mut match_to_id: HashMap<(usize, usize), usize> = HashMap::new(); + let mut match_to_id: FxHashMap<(usize, usize), usize> = FxHashMap::default(); for m in matches { match_to_id.insert(m, next_match_id); next_match_id += 1; @@ -373,7 +374,7 @@ fn is_usable(state: &[BitSet], h1: &BitSet, h2: &BitSet, masks: &mut Vec, + matches: &FxHashMap<(usize, usize), usize>, dag: &Vec, last_removed: usize, mut subgraph: BitSet, @@ -388,6 +389,12 @@ pub fn recurse_index_search( ) -> (usize, usize) { //println!("{:?}", removal_order); //println!("{:?}", state); + + let mut bucket_time = Duration::new(0, 0); + let mut sort_time = Duration::new(0, 0); + let mut create_matches_time = Duration::new(0, 0); + let mut mask_time = Duration::new(0, 0); + let mut bound_time = Duration::new(0, 0); // Generate matches let num_edges = mol.graph().edge_count(); @@ -421,6 +428,7 @@ pub fn recurse_index_search( let child_frag = &dag[*child_id].fragment; let valid = child_frag.is_subset(state_frag); + let start = Instant::now(); if valid { let canon_id = &dag[*child_id].canon_id; @@ -429,9 +437,11 @@ pub fn recurse_index_search( .and_modify(|bucket| bucket.push((*child_id, state_id))) .or_insert(vec![(*child_id, state_id)]); } + bucket_time += start.elapsed(); } } + let start = Instant::now(); // Search through buckets and create matches for fragments in buckets.values() { let mut has_match = BitSet::with_capacity(fragments.len()); @@ -459,7 +469,9 @@ pub fn recurse_index_search( let frag2_state_id = (*frag2.1).1; // Check for valid match - if let Some(x) = matches.get(&(frag1_id, frag2_id)) { + // if let Some(x) = matches.get(&(frag1_id, frag2_id)) { + if dag[frag1_id].fragment.is_disjoint(&dag[frag2_id].fragment) { + let x = matches.get(&(frag1_id, frag2_id)).unwrap(); // Only add match if it occurs later in the ordering than // the last removed match if *x >= last_removed { @@ -479,6 +491,8 @@ pub fn recurse_index_search( } } } + create_matches_time += start.elapsed(); + // Update to new subgraphs subgraphs = new_subgraphs; @@ -489,10 +503,13 @@ pub fn recurse_index_search( return (state_index, 1); } + let start = Instant::now(); valid_matches.sort(); valid_matches.reverse(); + sort_time += start.elapsed(); // Modify mask + let start = Instant::now(); let largest_remove = dag[valid_matches[0].0].fragment.len(); let mut masks = vec![ vec![BitSet::with_capacity(num_edges); state.len()]; @@ -530,8 +547,9 @@ pub fn recurse_index_search( // Remove frags of size 1 or less state.retain(|frag| frag.len() >= 2); + mask_time += start.elapsed(); - + let start = Instant::now(); let best = best_index.load(Relaxed); let mut best_bound = 0; let mut largest_length = 2; @@ -569,6 +587,9 @@ pub fn recurse_index_search( } valid_matches.truncate(final_idx); } + bound_time += start.elapsed(); + + // println!("bucket: {} match: {} sort: {} mask: {} bound: {}", bucket_time.as_nanos(), create_matches_time.as_nanos(), sort_time.as_nanos(), mask_time.as_nanos(), bound_time.as_nanos()); // Memoization From 266cdca547acbd2d135cf24d2028fc2c976c3a95 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Thu, 4 Sep 2025 13:39:55 -0700 Subject: [PATCH 35/51] Remove itertools.combinations from search --- src/assembly.rs | 95 ++++++++++++++++++++----------------------------- 1 file changed, 38 insertions(+), 57 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 211d126e..0bc523d4 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -389,12 +389,6 @@ pub fn recurse_index_search( ) -> (usize, usize) { //println!("{:?}", removal_order); //println!("{:?}", state); - - let mut bucket_time = Duration::new(0, 0); - let mut sort_time = Duration::new(0, 0); - let mut create_matches_time = Duration::new(0, 0); - let mut mask_time = Duration::new(0, 0); - let mut bound_time = Duration::new(0, 0); // Generate matches let num_edges = mol.graph().edge_count(); @@ -428,7 +422,6 @@ pub fn recurse_index_search( let child_frag = &dag[*child_id].fragment; let valid = child_frag.is_subset(state_frag); - let start = Instant::now(); if valid { let canon_id = &dag[*child_id].canon_id; @@ -437,61 +430,57 @@ pub fn recurse_index_search( .and_modify(|bucket| bucket.push((*child_id, state_id))) .or_insert(vec![(*child_id, state_id)]); } - bucket_time += start.elapsed(); } } - let start = Instant::now(); // Search through buckets and create matches for fragments in buckets.values() { let mut has_match = BitSet::with_capacity(fragments.len()); // Loop over pairs of fragments - for pair in fragments.iter().enumerate().combinations(2) { - let mut frag1; - let mut frag2; - - // Order fragments - if pair[0].1.0 > pair[1].1.0 { - frag1 = pair[0]; - frag2 = pair[1]; - } - else { - frag1 = pair[1]; - frag2 = pair[0]; - } + for i in 0..fragments.len() { + for j in i+1..fragments.len() { + let mut frag1 = (i, fragments[i]); + let mut frag2 = (j, fragments[j]); + + // Order fragments + if frag1.1.0 < frag2.1.0 { + let temp = frag1; + frag1 = frag2; + frag2 = temp; + } - let frag1_bucket_idx = frag1.0; - let frag1_id = (*frag1.1).0; - let frag1_state_id = (*frag1.1).1; - - let frag2_bucket_idx = frag2.0; - let frag2_id = (*frag2.1).0; - let frag2_state_id = (*frag2.1).1; - - // Check for valid match - // if let Some(x) = matches.get(&(frag1_id, frag2_id)) { - if dag[frag1_id].fragment.is_disjoint(&dag[frag2_id].fragment) { - let x = matches.get(&(frag1_id, frag2_id)).unwrap(); - // Only add match if it occurs later in the ordering than - // the last removed match - if *x >= last_removed { - valid_matches.push((frag1_id, frag2_id)); - - // If this is the first time seeing that this frag has a match, - // add it to the new_subgraphs list - if !has_match.contains(frag1_bucket_idx) { - new_subgraphs.push((frag1_id, frag1_state_id)); - has_match.insert(frag1_bucket_idx); - } - if !has_match.contains(frag2_bucket_idx) { - new_subgraphs.push((frag2_id, frag2_state_id)); - has_match.insert(frag2_bucket_idx); + let frag1_bucket_idx = frag1.0; + let frag1_id = frag1.1.0; + let frag1_state_id = frag1.1.1; + + let frag2_bucket_idx = frag2.0; + let frag2_id = frag2.1.0; + let frag2_state_id = frag2.1.1; + + // Check for valid match + // if let Some(x) = matches.get(&(frag1_id, frag2_id)) { + if dag[frag1_id].fragment.is_disjoint(&dag[frag2_id].fragment) { + let x = matches.get(&(frag1_id, frag2_id)).unwrap(); + // Only add match if it occurs later in the ordering than + // the last removed match + if *x >= last_removed { + valid_matches.push((frag1_id, frag2_id)); + + // If this is the first time seeing that this frag has a match, + // add it to the new_subgraphs list + if !has_match.contains(frag1_bucket_idx) { + new_subgraphs.push((frag1_id, frag1_state_id)); + has_match.insert(frag1_bucket_idx); + } + if !has_match.contains(frag2_bucket_idx) { + new_subgraphs.push((frag2_id, frag2_state_id)); + has_match.insert(frag2_bucket_idx); + } } } } } } - create_matches_time += start.elapsed(); // Update to new subgraphs @@ -503,13 +492,10 @@ pub fn recurse_index_search( return (state_index, 1); } - let start = Instant::now(); valid_matches.sort(); valid_matches.reverse(); - sort_time += start.elapsed(); // Modify mask - let start = Instant::now(); let largest_remove = dag[valid_matches[0].0].fragment.len(); let mut masks = vec![ vec![BitSet::with_capacity(num_edges); state.len()]; @@ -547,9 +533,7 @@ pub fn recurse_index_search( // Remove frags of size 1 or less state.retain(|frag| frag.len() >= 2); - mask_time += start.elapsed(); - let start = Instant::now(); let best = best_index.load(Relaxed); let mut best_bound = 0; let mut largest_length = 2; @@ -587,9 +571,6 @@ pub fn recurse_index_search( } valid_matches.truncate(final_idx); } - bound_time += start.elapsed(); - - // println!("bucket: {} match: {} sort: {} mask: {} bound: {}", bucket_time.as_nanos(), create_matches_time.as_nanos(), sort_time.as_nanos(), mask_time.as_nanos(), bound_time.as_nanos()); // Memoization From f7b15f3e3783a7e905e68f30c7a90679c2d1d59a Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Thu, 4 Sep 2025 14:23:38 -0700 Subject: [PATCH 36/51] Move mask creation into match enumeration --- src/assembly.rs | 47 ++++++++++++----------------------------------- 1 file changed, 12 insertions(+), 35 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 0bc523d4..9efc5b61 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -28,7 +28,7 @@ use std::{ use bit_set::BitSet; use clap::ValueEnum; use graph_canon::canon; -use rayon::iter::{IntoParallelIterator, IntoParallelRefIterator, ParallelIterator, IndexedParallelIterator}; +use rayon::iter::{IntoParallelRefIterator, ParallelIterator, IndexedParallelIterator}; use petgraph::graph::EdgeIndex; use itertools::Itertools; use std::cmp::{max, min}; @@ -216,7 +216,7 @@ pub fn matches(mol: &Molecule, canonize_mode: CanonizeMode) -> (FxHashMap<(usize // Add index to parent's child list let parent_idx = pair[0].1.1; - let mut parent = &mut dag[parent_idx]; + let parent = &mut dag[parent_idx]; parent.children.push(child_idx); // Add dag node to next subgraphs list @@ -235,7 +235,7 @@ pub fn matches(mol: &Molecule, canonize_mode: CanonizeMode) -> (FxHashMap<(usize // Add index to parent's child list let parent_idx = pair[1].1.1; - let mut parent = &mut dag[parent_idx]; + let parent = &mut dag[parent_idx]; parent.children.push(child_idx); // Add dag node to next subgraphs list @@ -395,6 +395,7 @@ pub fn recurse_index_search( let mut valid_matches: Vec<(usize, usize)> = Vec::new(); let mut subgraphs: Vec<(usize, usize)> = Vec::new(); + let mut masks: Vec> = Vec::new(); // Create subgraphs of size 1 for (state_id, fragment) in state.iter().enumerate() { @@ -407,6 +408,7 @@ pub fn recurse_index_search( // Subgraphs to extend in next loop let mut buckets: HashMap> = HashMap::new(); let mut new_subgraphs = Vec::new(); + let mut used_edge_mask = vec![BitSet::with_capacity(num_edges); state.len()]; // Extend each subgraph and bucket for (frag_id, state_id) in subgraphs { @@ -458,22 +460,23 @@ pub fn recurse_index_search( let frag2_state_id = frag2.1.1; // Check for valid match - // if let Some(x) = matches.get(&(frag1_id, frag2_id)) { - if dag[frag1_id].fragment.is_disjoint(&dag[frag2_id].fragment) { - let x = matches.get(&(frag1_id, frag2_id)).unwrap(); + // TODO: try alternatives to this + if let Some(x) = matches.get(&(frag1_id, frag2_id)) { // Only add match if it occurs later in the ordering than // the last removed match if *x >= last_removed { valid_matches.push((frag1_id, frag2_id)); // If this is the first time seeing that this frag has a match, - // add it to the new_subgraphs list + // add it to the new_subgraphs list and modify mask if !has_match.contains(frag1_bucket_idx) { new_subgraphs.push((frag1_id, frag1_state_id)); + used_edge_mask[frag1_state_id].union_with(&dag[frag1_id].fragment); has_match.insert(frag1_bucket_idx); } if !has_match.contains(frag2_bucket_idx) { new_subgraphs.push((frag2_id, frag2_state_id)); + used_edge_mask[frag2_state_id].union_with(&dag[frag2_id].fragment); has_match.insert(frag2_bucket_idx); } } @@ -482,6 +485,8 @@ pub fn recurse_index_search( } } + // Add mask to list + masks.push(used_edge_mask); // Update to new subgraphs subgraphs = new_subgraphs; @@ -495,34 +500,6 @@ pub fn recurse_index_search( valid_matches.sort(); valid_matches.reverse(); - // Modify mask - let largest_remove = dag[valid_matches[0].0].fragment.len(); - let mut masks = vec![ - vec![BitSet::with_capacity(num_edges); state.len()]; - largest_remove - 1 - ]; - - for m in valid_matches.iter() { - // Extract fragments - let frag1 = &dag[m.0].fragment; - let frag2 = &dag[m.1].fragment; - let len = frag1.len() - 2; - - // Find indices of fragments that contain frag1 and frag2 - let (frag1_idx, _) = state.iter() - .enumerate() - .find(|(_, state_frag)| frag1.is_subset(state_frag)) - .unwrap(); - let (frag2_idx, _) = state.iter() - .enumerate() - .find(|(_, state_frag)| frag2.is_subset(state_frag)) - .unwrap(); - - // Union with masks to indicate that these edges are used - masks[len][frag1_idx].union_with(frag1); - masks[len][frag2_idx].union_with(frag2); - } - // Let new state be only the edges which can be matched let mut state = Vec::new(); From 0b90e2edc439a99e0c3162c2e43a6cc083d7b376 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Thu, 4 Sep 2025 15:22:25 -0700 Subject: [PATCH 37/51] Build matches after bucketing --- src/assembly.rs | 70 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 47 insertions(+), 23 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 9efc5b61..c0a11180 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -396,6 +396,7 @@ pub fn recurse_index_search( let mut valid_matches: Vec<(usize, usize)> = Vec::new(); let mut subgraphs: Vec<(usize, usize)> = Vec::new(); let mut masks: Vec> = Vec::new(); + let mut buckets_by_len: Vec>> = Vec::new(); // Create subgraphs of size 1 for (state_id, fragment) in state.iter().enumerate() { @@ -436,11 +437,15 @@ pub fn recurse_index_search( } // Search through buckets and create matches - for fragments in buckets.values() { + for fragments in buckets.values_mut() { let mut has_match = BitSet::with_capacity(fragments.len()); // Loop over pairs of fragments for i in 0..fragments.len() { for j in i+1..fragments.len() { + if has_match.contains(i) && has_match.contains(j) { + continue; + } + let mut frag1 = (i, fragments[i]); let mut frag2 = (j, fragments[j]); @@ -460,12 +465,11 @@ pub fn recurse_index_search( let frag2_state_id = frag2.1.1; // Check for valid match - // TODO: try alternatives to this if let Some(x) = matches.get(&(frag1_id, frag2_id)) { // Only add match if it occurs later in the ordering than // the last removed match if *x >= last_removed { - valid_matches.push((frag1_id, frag2_id)); + // valid_matches.push((frag1_id, frag2_id)); // If this is the first time seeing that this frag has a match, // add it to the new_subgraphs list and modify mask @@ -483,8 +487,19 @@ pub fn recurse_index_search( } } } + + // Remove any matchless fragments from bucket + let len = fragments.len(); + for i in (0..len).rev() { + if !has_match.contains(i) { + fragments.remove(i); + } + } } + // Add buckets to list + buckets_by_len.push(buckets); + // Add mask to list masks.push(used_edge_mask); @@ -492,14 +507,6 @@ pub fn recurse_index_search( subgraphs = new_subgraphs; } - // If there are no matches, return - if valid_matches.len() == 0 { - return (state_index, 1); - } - - valid_matches.sort(); - valid_matches.reverse(); - // Let new state be only the edges which can be matched let mut state = Vec::new(); @@ -511,6 +518,11 @@ pub fn recurse_index_search( // Remove frags of size 1 or less state.retain(|frag| frag.len() >= 2); + // Memoization + if cache.memoize_state(mol, &state, state_index, &removal_order) { + return (state_index, 1); + } + let best = best_index.load(Relaxed); let mut best_bound = 0; let mut largest_length = 2; @@ -539,22 +551,34 @@ pub fn recurse_index_search( largest_length += 1; } - // Remove from valid_matches any matches that have size smaller - // than largest_length, and thus their removal does not need to be tested - if largest_length > 2 { - let mut final_idx = 0; - while dag[valid_matches[final_idx].0].fragment.len() >= largest_length { - final_idx += 1; - } - valid_matches.truncate(final_idx); - } + // Create matches + for bucket in buckets_by_len[largest_length-2..].iter().rev() { + for fragments in bucket.values() { + for i in 0..fragments.len() { + for j in i+1..fragments.len() { + let mut frag1_id = fragments[i].0; + let mut frag2_id = fragments[j].0; + if frag1_id < frag2_id { + let temp = frag1_id; + frag1_id = frag2_id; + frag2_id = temp; + } - // Memoization - if cache.memoize_state(mol, &state, state_index, &removal_order) { - return (state_index, 1); + if let Some(x) = matches.get(&(frag1_id, frag2_id)){ + if *x >= last_removed { + valid_matches.push((frag1_id, frag2_id)); + } + } + } + } + } } + // Sort matches + valid_matches.sort(); + valid_matches.reverse(); + // Keep track of the best assembly index found in any of this assembly // state's children and the number of states searched, including this one. let best_child_index = AtomicUsize::from(state_index); From 9e7e1f57f3a5f5c4149ee2cb32668b462c5e0dbb Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Thu, 4 Sep 2025 15:34:15 -0700 Subject: [PATCH 38/51] Move when memoization happens --- src/assembly.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index c0a11180..8a46a5f7 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -389,6 +389,11 @@ pub fn recurse_index_search( ) -> (usize, usize) { //println!("{:?}", removal_order); //println!("{:?}", state); + + // Memoization + if cache.memoize_state(mol, &state, state_index, &removal_order) { + return (state_index, 1); + } // Generate matches let num_edges = mol.graph().edge_count(); @@ -518,10 +523,6 @@ pub fn recurse_index_search( // Remove frags of size 1 or less state.retain(|frag| frag.len() >= 2); - // Memoization - if cache.memoize_state(mol, &state, state_index, &removal_order) { - return (state_index, 1); - } let best = best_index.load(Relaxed); let mut best_bound = 0; From afb46b32752aab903b21a27af6bd3fc9dc77f429 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Fri, 5 Sep 2025 14:06:08 -0700 Subject: [PATCH 39/51] Add simple vector bound --- src/assembly.rs | 115 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 105 insertions(+), 10 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 8a46a5f7..48a5dde8 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -32,7 +32,6 @@ use rayon::iter::{IntoParallelRefIterator, ParallelIterator, IndexedParallelIter use petgraph::graph::EdgeIndex; use itertools::Itertools; use std::cmp::{max, min}; -use ahash::RandomState; use fxhash::FxHashMap; use crate::{ @@ -41,7 +40,7 @@ use crate::{ enumerate::{EnumerateMode}, kernels::{KernelMode, deletion_kernel, inclusion_kernel}, memoize::{MemoizeMode, NewCache}, - molecule::Molecule, + molecule::{Bond, Element, Molecule}, utils::{connected_components_under_edges, edge_neighbors}, reductions::CompatGraph, }; @@ -117,11 +116,17 @@ impl DagNode { } } +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +struct EdgeType { + bond: Bond, + ends: (Element, Element), +} + /// Helper function for [`index_search`]; only public for benchmarking. /// /// Return all pairs of non-overlapping, isomorphic subgraphs in the molecule, /// sorted to guarantee deterministic iteration. -pub fn matches(mol: &Molecule, canonize_mode: CanonizeMode) -> (FxHashMap<(usize, usize), usize>, Vec) { +pub fn matches(mol: &Molecule, canonize_mode: CanonizeMode) -> (FxHashMap<(usize, usize), usize>, Vec, Vec) { let num_edges = mol.graph().edge_count(); let mut subgraphs: Vec = Vec::new(); @@ -286,7 +291,48 @@ pub fn matches(mol: &Molecule, canonize_mode: CanonizeMode) -> (FxHashMap<(usize next_match_id += 1; } - (match_to_id, dag) + // Create edge type ids + let graph = mol.graph(); + let mut edgetype_to_id: HashMap = HashMap::new(); + let mut edgetypes: Vec = vec![0; num_edges]; + let mut next_id = 0; + + // Create a list of the elements of the nodes + // The list can be indexed by NodeIndex-es + let mut node_elements: Vec = Vec::new(); + for v in graph.node_weights() { + node_elements.push(v.element()); + } + + // Extract bond types + let weights: Vec = graph.edge_weights().copied().collect(); + + // types will hold an element for every unique edge type in fragment + for e in graph.edge_indices() { + // Extract bond type and endpoint atom types + let bond = weights[e.index()]; + let (end1, end2) = graph.edge_endpoints(e).expect("Edge Endpoint Error"); + + let atom1 = node_elements[end1.index()]; + let atom2 = node_elements[end2.index()]; + + // Create edgetype + let ends = if atom1 < atom2 {(atom1, atom2)} else {(atom2, atom1)}; + let edgetype = EdgeType{bond, ends}; + + // Find or create id for this edgetype + // and store the edgetype id for this edge + if let Some(id) = edgetype_to_id.get(&edgetype) { + edgetypes[e.index()] = *id; + } + else { + edgetypes[e.index()] = next_id; + edgetype_to_id.insert(edgetype, next_id); + next_id += 1; + } + } + + (match_to_id, dag, edgetypes) } /// Determine the fragments produced from the given assembly state by removing @@ -376,6 +422,7 @@ pub fn recurse_index_search( mol: &Molecule, matches: &FxHashMap<(usize, usize), usize>, dag: &Vec, + edgetypes: &Vec, last_removed: usize, mut subgraph: BitSet, removal_order: Vec, @@ -387,8 +434,16 @@ pub fn recurse_index_search( parallel_mode: ParallelMode, kernel_mode: KernelMode, ) -> (usize, usize) { - //println!("{:?}", removal_order); - //println!("{:?}", state); + // println!("{:?}", removal_order); + // println!("{:?}", state); + + // let mut memoize_time = Duration::new(0, 0); + // let mut bucket_time = Duration::new(0, 0); + // let mut component_time = Duration::new(0, 0); + // let mut int_time = Duration::new(0, 0); + // let mut vec_time = Duration::new(0, 0); + // let mut matches_time = Duration::new(0, 0); + // let mut sort_time = Duration::new(0, 0); // Memoization if cache.memoize_state(mol, &state, state_index, &removal_order) { @@ -552,6 +607,36 @@ pub fn recurse_index_search( largest_length += 1; } + let mut largest_length2 = 2; + let mut best_bound = 0; + + // Vector bounding strategy + // Uses edge types of the fragments. + for (i, list) in masks.iter().enumerate() { + let i = i + 2; + + let mut s: usize = list.iter().map(|frag| frag.len()).sum(); + + let mut set = HashSet::new(); + for frag in list.iter() { + for edge in frag.iter() { + set.insert(edgetypes[edge]); + } + } + let z = set.len(); + + let bound = (s - z) - ((s - z) as f32 / i as f32).ceil() as usize; + best_bound = max(best_bound, bound); + + if state_index - best_bound < best { + break; + } + + largest_length2 += 1; + } + + largest_length = max(largest_length, largest_length2); + // Create matches for bucket in buckets_by_len[largest_length-2..].iter().rev() { for fragments in bucket.values() { @@ -580,6 +665,14 @@ pub fn recurse_index_search( valid_matches.sort(); valid_matches.reverse(); + // print!("Mem: {} ", memoize_time.as_nanos()); + // print!("Buk: {} ", bucket_time.as_nanos()); + // print!("Comp: {} ", component_time.as_nanos()); + // print!("Int: {} ", int_time.as_nanos()); + // print!("Vec: {} ", vec_time.as_nanos()); + // print!("Mat: {} ", matches_time.as_nanos()); + // print!("Sort: {}\n", sort_time.as_nanos()); + // Keep track of the best assembly index found in any of this assembly // state's children and the number of states searched, including this one. let best_child_index = AtomicUsize::from(state_index); @@ -587,7 +680,7 @@ pub fn recurse_index_search( // Define a closure that handles recursing to a new assembly state based on // the given (enumerated) pair of non-overlapping isomorphic subgraphs. - let recurse_on_match = |i: usize, v: (usize, usize)| { + let recurse_on_match = |i: usize, v: (usize, usize)| { //let (h1, h2) = &matches[v]; let h1 = &dag[v.0].fragment; let h2 = &dag[v.1].fragment; @@ -607,12 +700,13 @@ pub fn recurse_index_search( mol, matches, dag, + edgetypes, *matches.get(&v).unwrap(), BitSet::new(), { let mut clone = removal_order.clone(); - // clone.push(*matches.get(&v).unwrap()); - clone.push(i); + clone.push(*matches.get(&v).unwrap()); + // clone.push(i); clone }, &fragments, @@ -727,7 +821,7 @@ pub fn index_search( // Catch not-yet-implemented modes. // Enumerate non-overlapping isomorphic subgraph pairs. - let (matches, dag) = matches(mol, canonize_mode); + let (matches, dag, edgetypes) = matches(mol, canonize_mode); // Create memoization cache. //let mut cache = Cache::new(memoize_mode, canonize_mode); @@ -758,6 +852,7 @@ pub fn index_search( mol, &matches, &dag, + &edgetypes, 0, subgraph, Vec::new(), From e1650d96bd6c9cb177df714fd7464a0afb7a9543 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Fri, 5 Sep 2025 14:30:35 -0700 Subject: [PATCH 40/51] Move vector bound --- src/assembly.rs | 46 ++++++++++++++-------------------------------- 1 file changed, 14 insertions(+), 32 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 48a5dde8..8c8cdc5f 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -580,7 +580,6 @@ pub fn recurse_index_search( let best = best_index.load(Relaxed); - let mut best_bound = 0; let mut largest_length = 2; // Use bounding strategy to find the largest length match @@ -596,46 +595,29 @@ pub fn recurse_index_search( } bound -= (i as f32).log2().ceil() as usize; - best_bound = max(best_bound, bound); - // Check if removing at this length can give a more optimal answer // If yes, stop and return the largest_length to remove - if state_index - best_bound < best { - break; - } - - largest_length += 1; - } - - let mut largest_length2 = 2; - let mut best_bound = 0; - - // Vector bounding strategy - // Uses edge types of the fragments. - for (i, list) in masks.iter().enumerate() { - let i = i + 2; - - let mut s: usize = list.iter().map(|frag| frag.len()).sum(); + if state_index - bound < best { + let s: usize = list.iter().map(|frag| frag.len()).sum(); - let mut set = HashSet::new(); - for frag in list.iter() { - for edge in frag.iter() { - set.insert(edgetypes[edge]); + let mut set = HashSet::new(); + for frag in list.iter() { + for edge in frag.iter() { + set.insert(edgetypes[edge]); + } } - } - let z = set.len(); + let z = set.len(); - let bound = (s - z) - ((s - z) as f32 / i as f32).ceil() as usize; - best_bound = max(best_bound, bound); + let bound = (s - z) - ((s - z) as f32 / i as f32).ceil() as usize; - if state_index - best_bound < best { - break; + if state_index - bound < best { + break; + } } - largest_length2 += 1; + largest_length += 1; } - - largest_length = max(largest_length, largest_length2); + // Create matches for bucket in buckets_by_len[largest_length-2..].iter().rev() { From 883444f33512f5d9dd99fd995a2f9468d7790b87 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Tue, 9 Sep 2025 11:16:33 -0700 Subject: [PATCH 41/51] Replace bugged vector bound --- src/assembly.rs | 85 ++++++++++++++++++++++++++++++++----------------- src/main.rs | 76 +++++++++++++++++++++---------------------- 2 files changed, 93 insertions(+), 68 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 8c8cdc5f..bbf0bdb8 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -434,7 +434,7 @@ pub fn recurse_index_search( parallel_mode: ParallelMode, kernel_mode: KernelMode, ) -> (usize, usize) { - // println!("{:?}", removal_order); + // if removal_order.len() == 1 {println!("{:?}", removal_order);} // println!("{:?}", state); // let mut memoize_time = Duration::new(0, 0); @@ -579,45 +579,70 @@ pub fn recurse_index_search( state.retain(|frag| frag.len() >= 2); + // Use bounding strategy to find the largest length match + // to try removing from this state. let best = best_index.load(Relaxed); let mut largest_length = 2; - // Use bounding strategy to find the largest length match - // to try removing from this state. for (i, list) in masks.iter().enumerate() { - let mut bound = 0; - let i = i + 2; - for (j, frag) in list.iter().enumerate() { - let mf = masks[0][j].len(); - let mi = frag.len(); - let x = mf + (mi % i) - mi; - bound += mf - (mi / i) - x / (i-1) - (x % (i-1) != 0) as usize; - } - bound -= (i as f32).log2().ceil() as usize; + let mut stop = false; + for bound_type in bounds { + if stop { + break; + } - // Check if removing at this length can give a more optimal answer - // If yes, stop and return the largest_length to remove - if state_index - bound < best { - let s: usize = list.iter().map(|frag| frag.len()).sum(); + match bound_type { + Bound::Int => { + let mut bound = 0; + let i = i + 2; + for (j, frag) in list.iter().enumerate() { + let mf = masks[0][j].len(); + let mi = frag.len(); + let x = mf + (mi % i) - mi; + bound += mf - (mi / i) - (x+i-2) / (i-1); + } + bound -= (i as f32).log2().ceil() as usize; - let mut set = HashSet::new(); - for frag in list.iter() { - for edge in frag.iter() { - set.insert(edgetypes[edge]); + if state_index - bound >= best { + stop = true; + } } - } - let z = set.len(); + Bound::VecSimple => { + let mut set = HashSet::new(); + for frag in state.iter() { + for edge in frag.iter() { + set.insert(edgetypes[edge]); + } + } + let z = set.len(); - let bound = (s - z) - ((s - z) as f32 / i as f32).ceil() as usize; + let i = i + 2; + let s: usize = state.iter().map(|frag| frag.len()).sum(); - if state_index - bound < best { - break; + let bound = if s == 0 { + 0 + } + else { + (s - z) - ((s - z) as f32 / i as f32).ceil() as usize + }; + + + if state_index - bound >= best { + stop = true; + } + } + _ => () } } - largest_length += 1; + if stop { + largest_length += 1; + } + else { + break; + } } - + // Create matches for bucket in buckets_by_len[largest_length-2..].iter().rev() { @@ -687,8 +712,8 @@ pub fn recurse_index_search( BitSet::new(), { let mut clone = removal_order.clone(); - clone.push(*matches.get(&v).unwrap()); - // clone.push(i); + // clone.push(*matches.get(&v).unwrap()); + clone.push(i); clone }, &fragments, @@ -847,7 +872,7 @@ pub fn index_search( kernel_mode, ); - println!("Bounded: {}", cache.count()); + // println!("Bounded: {}", cache.count()); (index as u32, matches.len() as u32, states_searched) } diff --git a/src/main.rs b/src/main.rs index 36f2f937..84b0df29 100644 --- a/src/main.rs +++ b/src/main.rs @@ -123,44 +123,44 @@ fn main() -> Result<()> { } - /*use std::ffi::{OsStr, OsString}; - use std::path::Path; - use assembly_theory::molecule::Molecule; - use rayon::iter::{IntoParallelIterator, ParallelIterator, IntoParallelRefIterator, IndexedParallelIterator}; - - let paths = fs::read_dir(Path::new("data").join("coconut_220")).unwrap(); - let mut mol_list: Vec = Vec::new(); - let mut names: Vec = Vec::new(); - - for path in paths { - let name = path.unwrap().path(); - if name.extension().and_then(OsStr::to_str) != Some("mol") { - continue; - } - names.push(name.file_name().unwrap().to_os_string()); - mol_list.push( - parse_molfile_str( - &fs::read_to_string(name.clone()) - .expect(&format!("Could not read file {name:?}")), - ) - .expect(&format!("Failed to parse {name:?}")), - ); - } - - names.par_iter().zip(mol_list.par_iter()).for_each(|(n, mol)| { - let index = index_search( - &mol, - cli.enumerate, - cli.canonize, - cli.parallel, - cli.memoize, - cli.kernel, - boundlist, - cli.clique, - ); - println!("{:?}: MA: {} Space: {}, Searched: {}", n, index.0, index.1, index.2); - }); - std::process::exit(1);*/ + // use std::ffi::{OsStr, OsString}; + // use std::path::Path; + // use assembly_theory::molecule::Molecule; + // use rayon::iter::{IntoParallelIterator, ParallelIterator, IntoParallelRefIterator, IndexedParallelIterator}; + + // let paths = fs::read_dir(Path::new("data").join("coconut_220")).unwrap(); + // let mut mol_list: Vec = Vec::new(); + // let mut names: Vec = Vec::new(); + + // for path in paths { + // let name = path.unwrap().path(); + // if name.extension().and_then(OsStr::to_str) != Some("mol") { + // continue; + // } + // names.push(name.file_name().unwrap().to_os_string()); + // mol_list.push( + // parse_molfile_str( + // &fs::read_to_string(name.clone()) + // .expect(&format!("Could not read file {name:?}")), + // ) + // .expect(&format!("Failed to parse {name:?}")), + // ); + // } + + // names.par_iter().zip(mol_list.par_iter()).for_each(|(n, mol)| { + // let index = index_search( + // &mol, + // cli.enumerate, + // cli.canonize, + // cli.parallel, + // cli.memoize, + // cli.kernel, + // boundlist, + // cli.clique, + // ); + // println!("{:?}: MA: {} Space: {}, Searched: {}", n, index.0, index.1, index.2); + // }); + // std::process::exit(1); // Call index calculation with all the various options. From 09c2cf51220de5369dc589d425d601a8048e7759 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Tue, 9 Sep 2025 11:19:42 -0700 Subject: [PATCH 42/51] Update cargo.toml --- Cargo.lock | 76 ------------------------------------------------------ Cargo.toml | 1 - 2 files changed, 77 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 660f0f07..d1c16b66 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,19 +2,6 @@ # It is not intended for manual editing. version = 4 -[[package]] -name = "ahash" -version = "0.8.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" -dependencies = [ - "cfg-if", - "getrandom", - "once_cell", - "version_check", - "zerocopy", -] - [[package]] name = "aho-corasick" version = "1.1.3" @@ -84,7 +71,6 @@ checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" name = "assembly-theory" version = "0.4.0" dependencies = [ - "ahash", "anyhow", "bit-set", "clap 4.5.41", @@ -464,18 +450,6 @@ dependencies = [ "byteorder", ] -[[package]] -name = "getrandom" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" -dependencies = [ - "cfg-if", - "libc", - "r-efi", - "wasi", -] - [[package]] name = "glob" version = "0.3.2" @@ -891,12 +865,6 @@ dependencies = [ "proc-macro2", ] -[[package]] -name = "r-efi" -version = "5.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" - [[package]] name = "radium" version = "0.7.0" @@ -1154,12 +1122,6 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" -[[package]] -name = "version_check" -version = "0.9.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" - [[package]] name = "walkdir" version = "2.5.0" @@ -1170,15 +1132,6 @@ dependencies = [ "winapi-util", ] -[[package]] -name = "wasi" -version = "0.14.2+wasi-0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3" -dependencies = [ - "wit-bindgen-rt", -] - [[package]] name = "wasm-bindgen" version = "0.2.100" @@ -1436,15 +1389,6 @@ version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" -[[package]] -name = "wit-bindgen-rt" -version = "0.39.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" -dependencies = [ - "bitflags 2.9.1", -] - [[package]] name = "wyz" version = "0.5.1" @@ -1453,23 +1397,3 @@ checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed" dependencies = [ "tap", ] - -[[package]] -name = "zerocopy" -version = "0.8.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1039dd0d3c310cf05de012d8a39ff557cb0d23087fd44cad61df08fc31907a2f" -dependencies = [ - "zerocopy-derive", -] - -[[package]] -name = "zerocopy-derive" -version = "0.8.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ecf5b4cc5364572d7f4c329661bcc82724222973f2cab6f050a4e5c22f75181" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] diff --git a/Cargo.toml b/Cargo.toml index 84277549..c24dc767 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,7 +18,6 @@ petgraph = "0.6.5" pyo3 = { version = "0.24.1", features = ["abi3-py310", "extension-module"]} rayon = "1.10.0" dashmap = "6.1.0" -ahash = "0.8.12" fxhash = "0.2.1" itertools = "0.14.0" From 535c25174aebfb99953f19100c2a9ee653160df4 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Fri, 12 Sep 2025 14:12:24 -0700 Subject: [PATCH 43/51] Change vector bound --- src/assembly.rs | 42 ++++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index bbf0bdb8..4453f1eb 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -585,6 +585,8 @@ pub fn recurse_index_search( let mut largest_length = 2; for (i, list) in masks.iter().enumerate() { + let mut b1 = 0; + let mut stop = false; for bound_type in bounds { if stop { @@ -608,25 +610,41 @@ pub fn recurse_index_search( } } Bound::VecSimple => { - let mut set = HashSet::new(); - for frag in state.iter() { - for edge in frag.iter() { - set.insert(edgetypes[edge]); + let size_two_list = &masks[0]; + let mut total_set = HashSet::new(); + let mut small_set = HashSet::new(); + + for frag in size_two_list.iter() { + for edge in frag { + total_set.insert(edgetypes[edge]); + } + } + for frag in list { + for edge in frag { + small_set.insert(edgetypes[edge]); } } - let z = set.len(); + let z = total_set.len(); + let zi = small_set.len(); let i = i + 2; - let s: usize = state.iter().map(|frag| frag.len()).sum(); + let s: usize = size_two_list.iter().map(|frag| frag.len()).sum(); + let mi: usize = list.iter().map(|frag| frag.len()).sum(); + let mf = s - mi; - let bound = if s == 0 { - 0 - } - else { - (s - z) - ((s - z) as f32 / i as f32).ceil() as usize + let bound = s - { + if z < i { + z-1 + (i as f32 / z as f32).log2().ceil() as usize + (mi/i) + ((mi%i)+mf - (z - zi) + i-2)/(i-1) + } + else { + z + (mi-zi)/i + ((mi-zi)%i + mf - (z - zi) + i-2)/(i-1) + } }; - + if mi == 0 { + break; + } + if state_index - bound >= best { stop = true; } From d820b0f9883749e9ad095110b59193c76c2cc4f7 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Thu, 18 Sep 2025 14:14:17 -0700 Subject: [PATCH 44/51] New matches struct --- src/assembly.rs | 528 +----------------------------------------------- src/lib.rs | 1 + src/main.rs | 4 +- src/matches.rs | 512 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 525 insertions(+), 520 deletions(-) create mode 100644 src/matches.rs diff --git a/src/assembly.rs b/src/assembly.rs index 4453f1eb..958deaa3 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -19,7 +19,7 @@ //! ``` use std::{ - collections::{BTreeMap, HashMap, HashSet}, sync::{ + sync::{ atomic::{AtomicUsize, Ordering::Relaxed}, Arc, }, time::{Duration, Instant} @@ -27,22 +27,10 @@ use std::{ use bit_set::BitSet; use clap::ValueEnum; -use graph_canon::canon; use rayon::iter::{IntoParallelRefIterator, ParallelIterator, IndexedParallelIterator}; -use petgraph::graph::EdgeIndex; -use itertools::Itertools; -use std::cmp::{max, min}; -use fxhash::FxHashMap; use crate::{ - bounds::{bound_exceeded, Bound}, - canonize::{canonize, CanonizeMode, Labeling}, - enumerate::{EnumerateMode}, - kernels::{KernelMode, deletion_kernel, inclusion_kernel}, - memoize::{MemoizeMode, NewCache}, - molecule::{Bond, Element, Molecule}, - utils::{connected_components_under_edges, edge_neighbors}, - reductions::CompatGraph, + bounds::Bound, canonize::CanonizeMode, enumerate::EnumerateMode, kernels::KernelMode, matches::Matches, memoize::{MemoizeMode, NewCache}, molecule:: Molecule, utils::{connected_components_under_edges} }; /// Parallelization strategy for the recursive search phase. @@ -99,242 +87,6 @@ pub fn depth(mol: &Molecule) -> u32 { ix } -#[derive(Debug)] -pub struct DagNode { - fragment: BitSet, - canon_id: usize, - children: Vec, -} - -impl DagNode { - pub fn new(fragment: BitSet, canon_id: usize) -> Self { - Self { - fragment, - canon_id, - children: Vec::new(), - } - } -} - -#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] -struct EdgeType { - bond: Bond, - ends: (Element, Element), -} - -/// Helper function for [`index_search`]; only public for benchmarking. -/// -/// Return all pairs of non-overlapping, isomorphic subgraphs in the molecule, -/// sorted to guarantee deterministic iteration. -pub fn matches(mol: &Molecule, canonize_mode: CanonizeMode) -> (FxHashMap<(usize, usize), usize>, Vec, Vec) { - let num_edges = mol.graph().edge_count(); - - let mut subgraphs: Vec = Vec::new(); - let mut constructed_frags: HashSet = HashSet::new(); - let mut frag_to_id: HashMap = HashMap::new(); - - let mut dag: Vec = Vec::with_capacity(num_edges); - let mut matches: Vec<(usize, usize)> = Vec::new(); - - let mut next_canon_id = 0; - - // Generate subgraphs with one edge - for i in 0..num_edges { - let mut new_bitset = BitSet::with_capacity(num_edges); - new_bitset.insert(i); - - let node = DagNode::new(new_bitset, 0); - - dag.push(node); - subgraphs.push(i); - } - - next_canon_id += 1; - - while subgraphs.len() > 0 { - // Initialize new buckets and subgraph list - let mut buckets: BTreeMap> = BTreeMap::new(); - let mut child_subgraphs: Vec = Vec::new(); - - // Extend and bucket new subgraphs - for parent_idx in subgraphs.iter() { - // Get fragment to be extended - let fragment = &dag[*parent_idx].fragment; - - // Construct edge neighborhood of fragment - let mut neighborhood = BitSet::with_capacity(num_edges); - for e in fragment { - neighborhood.extend(edge_neighbors(mol.graph(), EdgeIndex::new(e)).map(|x| x.index())); - } - neighborhood.difference_with(fragment); - - // Extend fragment with edges from neighborhood - for e in neighborhood.iter() { - // Create extended fragment - let mut new_fragment = fragment.clone(); - new_fragment.insert(e); - - // Check if fragment has already been created - // If yes, continue - if constructed_frags.contains(&new_fragment) { - continue; - } - - // Bucket subgraph - // Gives ownership of new_fragment to buckets - let canon = canonize(mol, &new_fragment, canonize_mode); - buckets.entry(canon) - .and_modify(|bucket| bucket.push((new_fragment.clone(), *parent_idx))) - .or_insert(vec![(new_fragment.clone(), *parent_idx)]); - - // Add to constructed_frags - constructed_frags.insert(new_fragment); - } - } - - // Go through buckets to create matches - for fragments in buckets.values() { - // Variables to track if a match has been found - let mut frag_has_match = BitSet::with_capacity(fragments.len()); - - // Loop through pairs of bitsets to find matches - for pair in fragments.iter().enumerate().combinations(2) { - // Extract pair of fragments - let frag1 = &pair[0].1.0; - let frag2 = &pair[1].1.0; - - // Check for disjointness - // If the fragments are disjoint, they form a match - if frag1.is_disjoint(&frag2) { - // Extract indices of fragments - let idx1 = pair[0].0; - let idx2 = pair[1].0; - - // If this is the first match for a fragment, create a dag node - if !frag_has_match.contains(idx1) { - // Create new dag node - let new_dag_node = DagNode::new(frag1.clone(), next_canon_id); - let child_idx = dag.len(); - - // Add child to dag list - dag.push(new_dag_node); - - // Add index to parent's child list - let parent_idx = pair[0].1.1; - let parent = &mut dag[parent_idx]; - parent.children.push(child_idx); - - // Add dag node to next subgraphs list - child_subgraphs.push(child_idx); - - // Add to frag_to_id map - frag_to_id.insert(frag1.clone(), child_idx); - } - if !frag_has_match.contains(idx2) { - // Create new dag node - let new_dag_node = DagNode::new(frag2.clone(), next_canon_id); - let child_idx = dag.len(); - - // Add child to dag list - dag.push(new_dag_node); - - // Add index to parent's child list - let parent_idx = pair[1].1.1; - let parent = &mut dag[parent_idx]; - parent.children.push(child_idx); - - // Add dag node to next subgraphs list - child_subgraphs.push(child_idx); - - // Add to frag_to_id map - frag_to_id.insert(frag2.clone(), child_idx); - } - - // Get fragment ids and add to matches - let frag1_id = frag_to_id.get(&frag1).unwrap(); - let frag2_id = frag_to_id.get(&frag2).unwrap(); - - if frag1_id > frag2_id { - matches.push((*frag1_id, *frag2_id)); - } - else { - matches.push((*frag2_id, *frag1_id)); - } - - // Mark that these fragments have matches - frag_has_match.insert(idx1); - frag_has_match.insert(idx2); - } - } - - // If there was a match, increment canon_id counter - if !frag_has_match.is_empty() { - next_canon_id += 1; - } - } - - // Update to the new subgraphs - subgraphs = child_subgraphs; - } - - // Sort matches - // Larger frag_ids get a smaller match_id - // Thus fragments with many edges have small ids - matches.sort(); - matches.reverse(); - - // give matches ids - let mut next_match_id = 0; - let mut match_to_id: FxHashMap<(usize, usize), usize> = FxHashMap::default(); - for m in matches { - match_to_id.insert(m, next_match_id); - next_match_id += 1; - } - - // Create edge type ids - let graph = mol.graph(); - let mut edgetype_to_id: HashMap = HashMap::new(); - let mut edgetypes: Vec = vec![0; num_edges]; - let mut next_id = 0; - - // Create a list of the elements of the nodes - // The list can be indexed by NodeIndex-es - let mut node_elements: Vec = Vec::new(); - for v in graph.node_weights() { - node_elements.push(v.element()); - } - - // Extract bond types - let weights: Vec = graph.edge_weights().copied().collect(); - - // types will hold an element for every unique edge type in fragment - for e in graph.edge_indices() { - // Extract bond type and endpoint atom types - let bond = weights[e.index()]; - let (end1, end2) = graph.edge_endpoints(e).expect("Edge Endpoint Error"); - - let atom1 = node_elements[end1.index()]; - let atom2 = node_elements[end2.index()]; - - // Create edgetype - let ends = if atom1 < atom2 {(atom1, atom2)} else {(atom2, atom1)}; - let edgetype = EdgeType{bond, ends}; - - // Find or create id for this edgetype - // and store the edgetype id for this edge - if let Some(id) = edgetype_to_id.get(&edgetype) { - edgetypes[e.index()] = *id; - } - else { - edgetypes[e.index()] = next_id; - edgetype_to_id.insert(edgetype, next_id); - next_id += 1; - } - } - - (match_to_id, dag, edgetypes) -} - /// Determine the fragments produced from the given assembly state by removing /// the given pair of non-overlapping, isomorphic subgraphs and then adding one /// back; return `None` if not possible. @@ -382,7 +134,7 @@ fn fragments(mol: &Molecule, state: &[BitSet], h1: &BitSet, h2: &BitSet) -> Opti Some(fragments) } -fn is_usable(state: &[BitSet], h1: &BitSet, h2: &BitSet, masks: &mut Vec>) -> bool { +fn _is_usable(state: &[BitSet], h1: &BitSet, h2: &BitSet, masks: &mut Vec>) -> bool { let f1 = state.iter().enumerate().find(|(_, c)| h1.is_subset(c)); let f2 = state.iter().enumerate().find(|(_, c)| h2.is_subset(c)); @@ -420,11 +172,8 @@ fn is_usable(state: &[BitSet], h1: &BitSet, h2: &BitSet, masks: &mut Vec, - dag: &Vec, - edgetypes: &Vec, + matches: &Matches, last_removed: usize, - mut subgraph: BitSet, removal_order: Vec, state: &[BitSet], state_index: usize, @@ -449,254 +198,8 @@ pub fn recurse_index_search( if cache.memoize_state(mol, &state, state_index, &removal_order) { return (state_index, 1); } - - // Generate matches - let num_edges = mol.graph().edge_count(); - - let mut valid_matches: Vec<(usize, usize)> = Vec::new(); - let mut subgraphs: Vec<(usize, usize)> = Vec::new(); - let mut masks: Vec> = Vec::new(); - let mut buckets_by_len: Vec>> = Vec::new(); - - // Create subgraphs of size 1 - for (state_id, fragment) in state.iter().enumerate() { - for edge_idx in fragment.iter() { - subgraphs.push((edge_idx, state_id)); - } - } - while subgraphs.len() > 0 { - // Subgraphs to extend in next loop - let mut buckets: HashMap> = HashMap::new(); - let mut new_subgraphs = Vec::new(); - let mut used_edge_mask = vec![BitSet::with_capacity(num_edges); state.len()]; - - // Extend each subgraph and bucket - for (frag_id, state_id) in subgraphs { - // Get state frament that this subgraph is contained in - let state_frag = &state[state_id]; - - // Get ids of extensions of this subgraph - let children_ids = &dag[frag_id].children; - - for child_id in children_ids { - // Check if this extension is valid - // i.e. the extended subgraph is contained in a fragment of state - let child_frag = &dag[*child_id].fragment; - let valid = child_frag.is_subset(state_frag); - - if valid { - let canon_id = &dag[*child_id].canon_id; - - // Add fragment to bucket - buckets.entry(*canon_id) - .and_modify(|bucket| bucket.push((*child_id, state_id))) - .or_insert(vec![(*child_id, state_id)]); - } - } - } - - // Search through buckets and create matches - for fragments in buckets.values_mut() { - let mut has_match = BitSet::with_capacity(fragments.len()); - // Loop over pairs of fragments - for i in 0..fragments.len() { - for j in i+1..fragments.len() { - if has_match.contains(i) && has_match.contains(j) { - continue; - } - - let mut frag1 = (i, fragments[i]); - let mut frag2 = (j, fragments[j]); - - // Order fragments - if frag1.1.0 < frag2.1.0 { - let temp = frag1; - frag1 = frag2; - frag2 = temp; - } - - let frag1_bucket_idx = frag1.0; - let frag1_id = frag1.1.0; - let frag1_state_id = frag1.1.1; - - let frag2_bucket_idx = frag2.0; - let frag2_id = frag2.1.0; - let frag2_state_id = frag2.1.1; - - // Check for valid match - if let Some(x) = matches.get(&(frag1_id, frag2_id)) { - // Only add match if it occurs later in the ordering than - // the last removed match - if *x >= last_removed { - // valid_matches.push((frag1_id, frag2_id)); - - // If this is the first time seeing that this frag has a match, - // add it to the new_subgraphs list and modify mask - if !has_match.contains(frag1_bucket_idx) { - new_subgraphs.push((frag1_id, frag1_state_id)); - used_edge_mask[frag1_state_id].union_with(&dag[frag1_id].fragment); - has_match.insert(frag1_bucket_idx); - } - if !has_match.contains(frag2_bucket_idx) { - new_subgraphs.push((frag2_id, frag2_state_id)); - used_edge_mask[frag2_state_id].union_with(&dag[frag2_id].fragment); - has_match.insert(frag2_bucket_idx); - } - } - } - } - } - - // Remove any matchless fragments from bucket - let len = fragments.len(); - for i in (0..len).rev() { - if !has_match.contains(i) { - fragments.remove(i); - } - } - } - - // Add buckets to list - buckets_by_len.push(buckets); - - // Add mask to list - masks.push(used_edge_mask); - - // Update to new subgraphs - subgraphs = new_subgraphs; - } - - // Let new state be only the edges which can be matched - let mut state = Vec::new(); - - for frag in masks[0].iter() { - // Add connected components to new state - state.extend(connected_components_under_edges(mol.graph(), &frag)); - } - - // Remove frags of size 1 or less - state.retain(|frag| frag.len() >= 2); - - - // Use bounding strategy to find the largest length match - // to try removing from this state. - let best = best_index.load(Relaxed); - let mut largest_length = 2; - - for (i, list) in masks.iter().enumerate() { - let mut b1 = 0; - - let mut stop = false; - for bound_type in bounds { - if stop { - break; - } - - match bound_type { - Bound::Int => { - let mut bound = 0; - let i = i + 2; - for (j, frag) in list.iter().enumerate() { - let mf = masks[0][j].len(); - let mi = frag.len(); - let x = mf + (mi % i) - mi; - bound += mf - (mi / i) - (x+i-2) / (i-1); - } - bound -= (i as f32).log2().ceil() as usize; - - if state_index - bound >= best { - stop = true; - } - } - Bound::VecSimple => { - let size_two_list = &masks[0]; - let mut total_set = HashSet::new(); - let mut small_set = HashSet::new(); - - for frag in size_two_list.iter() { - for edge in frag { - total_set.insert(edgetypes[edge]); - } - } - for frag in list { - for edge in frag { - small_set.insert(edgetypes[edge]); - } - } - - let z = total_set.len(); - let zi = small_set.len(); - let i = i + 2; - let s: usize = size_two_list.iter().map(|frag| frag.len()).sum(); - let mi: usize = list.iter().map(|frag| frag.len()).sum(); - let mf = s - mi; - - let bound = s - { - if z < i { - z-1 + (i as f32 / z as f32).log2().ceil() as usize + (mi/i) + ((mi%i)+mf - (z - zi) + i-2)/(i-1) - } - else { - z + (mi-zi)/i + ((mi-zi)%i + mf - (z - zi) + i-2)/(i-1) - } - }; - - if mi == 0 { - break; - } - - if state_index - bound >= best { - stop = true; - } - } - _ => () - } - } - - if stop { - largest_length += 1; - } - else { - break; - } - } - - - // Create matches - for bucket in buckets_by_len[largest_length-2..].iter().rev() { - for fragments in bucket.values() { - for i in 0..fragments.len() { - for j in i+1..fragments.len() { - let mut frag1_id = fragments[i].0; - let mut frag2_id = fragments[j].0; - - if frag1_id < frag2_id { - let temp = frag1_id; - frag1_id = frag2_id; - frag2_id = temp; - } - - if let Some(x) = matches.get(&(frag1_id, frag2_id)){ - if *x >= last_removed { - valid_matches.push((frag1_id, frag2_id)); - } - } - } - } - } - } - - // Sort matches - valid_matches.sort(); - valid_matches.reverse(); - - // print!("Mem: {} ", memoize_time.as_nanos()); - // print!("Buk: {} ", bucket_time.as_nanos()); - // print!("Comp: {} ", component_time.as_nanos()); - // print!("Int: {} ", int_time.as_nanos()); - // print!("Vec: {} ", vec_time.as_nanos()); - // print!("Mat: {} ", matches_time.as_nanos()); - // print!("Sort: {}\n", sort_time.as_nanos()); + let valid_matches = matches.generate_matches(mol, state, state_index, last_removed, best_index.load(Relaxed), bounds); // Keep track of the best assembly index found in any of this assembly // state's children and the number of states searched, including this one. @@ -707,8 +210,8 @@ pub fn recurse_index_search( // the given (enumerated) pair of non-overlapping isomorphic subgraphs. let recurse_on_match = |i: usize, v: (usize, usize)| { //let (h1, h2) = &matches[v]; - let h1 = &dag[v.0].fragment; - let h2 = &dag[v.1].fragment; + let h1 = matches.get_frag(v.0); + let h2 = matches.get_frag(v.1); // TODO: try keeping track of fragment to elimate some some search if let Some(fragments) = fragments(mol, &state, h1, h2) { @@ -724,10 +227,7 @@ pub fn recurse_index_search( let (child_index, child_states_searched) = recurse_index_search( mol, matches, - dag, - edgetypes, - *matches.get(&v).unwrap(), - BitSet::new(), + matches.get_match_id(&v).unwrap(), { let mut clone = removal_order.clone(); // clone.push(*matches.get(&v).unwrap()); @@ -841,12 +341,12 @@ pub fn index_search( memoize_mode: MemoizeMode, kernel_mode: KernelMode, bounds: &[Bound], - clique: bool, + _clique: bool, ) -> (u32, u32, usize) { // Catch not-yet-implemented modes. // Enumerate non-overlapping isomorphic subgraph pairs. - let (matches, dag, edgetypes) = matches(mol, canonize_mode); + let matches = Matches::new(mol, canonize_mode); // Create memoization cache. //let mut cache = Cache::new(memoize_mode, canonize_mode); @@ -865,21 +365,13 @@ pub fn index_search( let mut init = BitSet::new(); init.extend(mol.graph().edge_indices().map(|ix| ix.index())); - let mut subgraph = BitSet::with_capacity(matches.len()); - for i in 0..matches.len() { - subgraph.insert(i); - } - // Search for the shortest assembly pathway recursively. let edge_count = mol.graph().edge_count(); let best_index = Arc::new(AtomicUsize::from(edge_count - 1)); let (index, states_searched) = recurse_index_search( mol, &matches, - &dag, - &edgetypes, 0, - subgraph, Vec::new(), &[init], edge_count - 1, diff --git a/src/lib.rs b/src/lib.rs index 4310afda..25cd9a3a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -53,6 +53,7 @@ pub mod enumerate; pub mod kernels; pub mod memoize; pub mod reductions; +pub mod matches; mod vf3; // Python wrapper. diff --git a/src/main.rs b/src/main.rs index 84b0df29..f526425c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -98,9 +98,9 @@ fn main() -> Result<()> { // Handle bounding strategy CLI arguments. let boundlist: &[Bound] = match (cli.boundsgroup, cli.clique) { // If clique not enable, default to a combination of the integer and vector bounds. - (None, false) => &[Bound::Int, Bound::VecSimple, Bound::VecSmallFrags], + (None, false) => &[Bound::Int, Bound::VecSimple, /*Bound::VecSmallFrags*/], // If clique is enabled, default to combination of int, vec, and clique bounds. - (None, true) => &[Bound::Int, Bound::VecSimple, Bound::VecSmallFrags, /*Bound::CliqueBudget,*/ Bound::CoverNoSort], + (None, true) => &[Bound::Int, Bound::VecSimple], // If --no-bounds is set, do not use any bounds. (Some(BoundsGroup { no_bounds: true, .. diff --git a/src/matches.rs b/src/matches.rs new file mode 100644 index 00000000..43a46f43 --- /dev/null +++ b/src/matches.rs @@ -0,0 +1,512 @@ +use std::collections::{BTreeMap, HashMap, HashSet}; + +use bit_set::BitSet; +use fxhash::FxHashMap; +use itertools::Itertools; +use petgraph::graph::EdgeIndex; + +use crate::{bounds::Bound, canonize::{canonize, CanonizeMode, Labeling}, molecule::{Bond, Element, Molecule}, reductions::CompatGraph, utils::{connected_components_under_edges, edge_neighbors}}; + +pub struct DagNode { + fragment: BitSet, + canon_id: usize, + children: Vec, +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +struct EdgeType { + bond: Bond, + ends: (Element, Element), +} + +pub struct Matches { + id_to_match: Vec<(usize, usize)>, + match_to_id: FxHashMap<(usize, usize), usize>, + dag: Vec, + // clique: CompatGraph, + // clique_offset: usize, + edge_types: Vec, +} + + +impl DagNode { + pub fn new(fragment: BitSet, canon_id: usize) -> Self { + Self { + fragment, + canon_id, + children: Vec::new(), + } + } +} + +impl Matches { + // Generate DAG, clique, and other info from the base molecule. + pub fn new(mol: &Molecule, canonize_mode: CanonizeMode) -> Self { + let num_edges = mol.graph().edge_count(); + + let mut subgraphs: Vec = Vec::new(); + let mut constructed_frags: HashSet = HashSet::new(); + let mut frag_to_id: HashMap = HashMap::new(); + + let mut dag: Vec = Vec::with_capacity(num_edges); + let mut matches: Vec<(usize, usize)> = Vec::new(); + + let mut next_canon_id = 0; + + // Generate subgraphs with one edge + for i in 0..num_edges { + let mut new_bitset = BitSet::with_capacity(num_edges); + new_bitset.insert(i); + + let node = DagNode::new(new_bitset, 0); + + dag.push(node); + subgraphs.push(i); + } + + next_canon_id += 1; + + while subgraphs.len() > 0 { + // Initialize new buckets and subgraph list + let mut buckets: BTreeMap> = BTreeMap::new(); + let mut child_subgraphs: Vec = Vec::new(); + + // Extend and bucket new subgraphs + for parent_idx in subgraphs.iter() { + // Get fragment to be extended + let fragment = &dag[*parent_idx].fragment; + + // Construct edge neighborhood of fragment + let mut neighborhood = BitSet::with_capacity(num_edges); + for e in fragment { + neighborhood.extend(edge_neighbors(mol.graph(), EdgeIndex::new(e)).map(|x| x.index())); + } + neighborhood.difference_with(fragment); + + // Extend fragment with edges from neighborhood + for e in neighborhood.iter() { + // Create extended fragment + let mut new_fragment = fragment.clone(); + new_fragment.insert(e); + + // Check if fragment has already been created + // If yes, continue + if constructed_frags.contains(&new_fragment) { + continue; + } + + // Bucket subgraph + // Gives ownership of new_fragment to buckets + let canon = canonize(mol, &new_fragment, canonize_mode); + buckets.entry(canon) + .and_modify(|bucket| bucket.push((new_fragment.clone(), *parent_idx))) + .or_insert(vec![(new_fragment.clone(), *parent_idx)]); + + // Add to constructed_frags + constructed_frags.insert(new_fragment); + } + } + + // Go through buckets to create matches + for fragments in buckets.values() { + // Variables to track if a match has been found + let mut frag_has_match = BitSet::with_capacity(fragments.len()); + + // Loop through pairs of bitsets to find matches + for pair in fragments.iter().enumerate().combinations(2) { + // Extract pair of fragments + let frag1 = &pair[0].1.0; + let frag2 = &pair[1].1.0; + + // Check for disjointness + // If the fragments are disjoint, they form a match + if frag1.is_disjoint(&frag2) { + // Extract indices of fragments + let idx1 = pair[0].0; + let idx2 = pair[1].0; + + // If this is the first match for a fragment, create a dag node + if !frag_has_match.contains(idx1) { + // Create new dag node + let new_dag_node = DagNode::new(frag1.clone(), next_canon_id); + let child_idx = dag.len(); + + // Add child to dag list + dag.push(new_dag_node); + + // Add index to parent's child list + let parent_idx = pair[0].1.1; + let parent = &mut dag[parent_idx]; + parent.children.push(child_idx); + + // Add dag node to next subgraphs list + child_subgraphs.push(child_idx); + + // Add to frag_to_id map + frag_to_id.insert(frag1.clone(), child_idx); + } + if !frag_has_match.contains(idx2) { + // Create new dag node + let new_dag_node = DagNode::new(frag2.clone(), next_canon_id); + let child_idx = dag.len(); + + // Add child to dag list + dag.push(new_dag_node); + + // Add index to parent's child list + let parent_idx = pair[1].1.1; + let parent = &mut dag[parent_idx]; + parent.children.push(child_idx); + + // Add dag node to next subgraphs list + child_subgraphs.push(child_idx); + + // Add to frag_to_id map + frag_to_id.insert(frag2.clone(), child_idx); + } + + // Get fragment ids and add to matches + let frag1_id = frag_to_id.get(&frag1).unwrap(); + let frag2_id = frag_to_id.get(&frag2).unwrap(); + + if frag1_id > frag2_id { + matches.push((*frag1_id, *frag2_id)); + } + else { + matches.push((*frag2_id, *frag1_id)); + } + + // Mark that these fragments have matches + frag_has_match.insert(idx1); + frag_has_match.insert(idx2); + } + } + + // If there was a match, increment canon_id counter + if !frag_has_match.is_empty() { + next_canon_id += 1; + } + } + + // Update to the new subgraphs + subgraphs = child_subgraphs; + } + + // Sort matches + // Larger frag_ids get a smaller match_id + // Thus fragments with many edges have small ids + matches.sort(); + matches.reverse(); + + // Give ids to matches + let mut next_match_id = 0; + let mut id_to_match = Vec::new(); + let mut match_to_id: FxHashMap<(usize, usize), usize> = FxHashMap::default(); + + for m in matches { + id_to_match.push(m); + match_to_id.insert(m, next_match_id); + next_match_id += 1; + } + + // Create edge type ids + let graph = mol.graph(); + let mut edgetype_to_id: HashMap = HashMap::new(); + let mut edge_types: Vec = vec![0; num_edges]; + let mut next_id = 0; + + // Create a list of the elements of the nodes + // The list can be indexed by NodeIndex-es + let mut node_elements: Vec = Vec::new(); + for v in graph.node_weights() { + node_elements.push(v.element()); + } + + // Extract bond types + let weights: Vec = graph.edge_weights().copied().collect(); + + // types will hold an element for every unique edge type in fragment + for e in graph.edge_indices() { + // Extract bond type and endpoint atom types + let bond = weights[e.index()]; + let (end1, end2) = graph.edge_endpoints(e).expect("Edge Endpoint Error"); + + let atom1 = node_elements[end1.index()]; + let atom2 = node_elements[end2.index()]; + + // Create edgetype + let ends = if atom1 < atom2 {(atom1, atom2)} else {(atom2, atom1)}; + let edgetype = EdgeType{bond, ends}; + + // Find or create id for this edgetype + // and store the edgetype id for this edge + if let Some(id) = edgetype_to_id.get(&edgetype) { + edge_types[e.index()] = *id; + } + else { + edge_types[e.index()] = next_id; + edgetype_to_id.insert(edgetype, next_id); + next_id += 1; + } + } + + Self { + id_to_match, + match_to_id, + dag, + edge_types, + } + } + + pub fn generate_matches(&self, mol: &Molecule, state: &[BitSet], state_index: usize, last_removed: usize, best: usize, bounds: &[Bound]) -> Vec<(usize, usize)> { + let num_edges = mol.graph().edge_count(); + + let mut valid_matches: Vec<(usize, usize)> = Vec::new(); + let mut subgraphs: Vec<(usize, usize)> = Vec::new(); + let mut masks: Vec> = Vec::new(); + let mut buckets_by_len: Vec>> = Vec::new(); + + // Create subgraphs of size 1 + for (state_id, fragment) in state.iter().enumerate() { + for edge_idx in fragment.iter() { + subgraphs.push((edge_idx, state_id)); + } + } + + while subgraphs.len() > 0 { + // Subgraphs to extend in next loop + let mut buckets: HashMap> = HashMap::new(); + let mut new_subgraphs = Vec::new(); + let mut used_edge_mask = vec![BitSet::with_capacity(num_edges); state.len()]; + + // Extend each subgraph and bucket + for (frag_id, state_id) in subgraphs { + // Get state frament that this subgraph is contained in + let state_frag = &state[state_id]; + + // Get ids of extensions of this subgraph + let children_ids = &self.dag[frag_id].children; + + for child_id in children_ids { + // Check if this extension is valid + // i.e. the extended subgraph is contained in a fragment of state + let child_frag = &self.dag[*child_id].fragment; + let valid = child_frag.is_subset(state_frag); + + if valid { + let canon_id = &self.dag[*child_id].canon_id; + + // Add fragment to bucket + buckets.entry(*canon_id) + .and_modify(|bucket| bucket.push((*child_id, state_id))) + .or_insert(vec![(*child_id, state_id)]); + } + } + } + + // Search through buckets and create matches + for fragments in buckets.values_mut() { + let mut has_match = BitSet::with_capacity(fragments.len()); + // Loop over pairs of fragments + for i in 0..fragments.len() { + for j in i+1..fragments.len() { + if has_match.contains(i) && has_match.contains(j) { + continue; + } + + let mut frag1 = (i, fragments[i]); + let mut frag2 = (j, fragments[j]); + + // Order fragments + if frag1.1.0 < frag2.1.0 { + let temp = frag1; + frag1 = frag2; + frag2 = temp; + } + + let frag1_bucket_idx = frag1.0; + let frag1_id = frag1.1.0; + let frag1_state_id = frag1.1.1; + + let frag2_bucket_idx = frag2.0; + let frag2_id = frag2.1.0; + let frag2_state_id = frag2.1.1; + + // Check for valid match + if let Some(x) = self.match_to_id.get(&(frag1_id, frag2_id)) { + // Only add match if it occurs later in the ordering than + // the last removed match + if *x >= last_removed { + // valid_matches.push((frag1_id, frag2_id)); + + // If this is the first time seeing that this frag has a match, + // add it to the new_subgraphs list and modify mask + if !has_match.contains(frag1_bucket_idx) { + new_subgraphs.push((frag1_id, frag1_state_id)); + used_edge_mask[frag1_state_id].union_with(&self.dag[frag1_id].fragment); + has_match.insert(frag1_bucket_idx); + } + if !has_match.contains(frag2_bucket_idx) { + new_subgraphs.push((frag2_id, frag2_state_id)); + used_edge_mask[frag2_state_id].union_with(&self.dag[frag2_id].fragment); + has_match.insert(frag2_bucket_idx); + } + } + } + } + } + + // Remove any matchless fragments from bucket + let len = fragments.len(); + for i in (0..len).rev() { + if !has_match.contains(i) { + fragments.remove(i); + } + } + } + + // Add buckets to list + buckets_by_len.push(buckets); + + // Add mask to list + masks.push(used_edge_mask); + + // Update to new subgraphs + subgraphs = new_subgraphs; + } + + // Let new state be only the edges which can be matched + let mut state = Vec::new(); + + for frag in masks[0].iter() { + // Add connected components to new state + state.extend(connected_components_under_edges(mol.graph(), &frag)); + } + + // Remove frags of size 1 or less + state.retain(|frag| frag.len() >= 2); + + + // Use bounding strategy to find the largest length match + // to try removing from this state. + let mut largest_length = 2; + + for (i, list) in masks.iter().enumerate() { + let mut stop = false; + for bound_type in bounds { + if stop { + break; + } + + match bound_type { + Bound::Int => { + let mut bound = 0; + let i = i + 2; + for (j, frag) in list.iter().enumerate() { + let mf = masks[0][j].len(); + let mi = frag.len(); + let x = mf + (mi % i) - mi; + bound += mf - (mi / i) - (x+i-2) / (i-1); + } + bound -= (i as f32).log2().ceil() as usize; + + if state_index - bound >= best { + stop = true; + } + } + Bound::VecSimple => { + let size_two_list = &masks[0]; + let mut total_set = HashSet::new(); + let mut small_set = HashSet::new(); + + for frag in size_two_list.iter() { + for edge in frag { + total_set.insert(self.edge_types[edge]); + } + } + for frag in list { + for edge in frag { + small_set.insert(self.edge_types[edge]); + } + } + + let z = total_set.len(); + let zi = small_set.len(); + let i = i + 2; + let s: usize = size_two_list.iter().map(|frag| frag.len()).sum(); + let mi: usize = list.iter().map(|frag| frag.len()).sum(); + let mf = s - mi; + + let bound = s - { + if z < i { + z-1 + (i as f32 / z as f32).log2().ceil() as usize + (mi/i) + ((mi%i)+mf - (z - zi) + i-2)/(i-1) + } + else { + z + (mi-zi)/i + ((mi-zi)%i + mf - (z - zi) + i-2)/(i-1) + } + }; + + if mi == 0 { + break; + } + + if state_index - bound >= best { + stop = true; + } + } + _ => () + } + } + + if stop { + largest_length += 1; + } + else { + break; + } + } + + // Create matches + for bucket in buckets_by_len[largest_length-2..].iter().rev() { + for fragments in bucket.values() { + for i in 0..fragments.len() { + for j in i+1..fragments.len() { + let mut frag1_id = fragments[i].0; + let mut frag2_id = fragments[j].0; + + if frag1_id < frag2_id { + let temp = frag1_id; + frag1_id = frag2_id; + frag2_id = temp; + } + + if let Some(x) = self.match_to_id.get(&(frag1_id, frag2_id)){ + if *x >= last_removed { + valid_matches.push((frag1_id, frag2_id)); + } + } + } + } + } + } + + // Sort matches + valid_matches.sort(); + valid_matches.reverse(); + + valid_matches + } + + pub fn len(&self) -> usize { + self.id_to_match.len() + } + + pub fn get_frag(&self, id: usize) -> &BitSet { + &self.dag[id].fragment + } + + pub fn get_match_id(&self, mat: &(usize, usize)) -> Option { + self.match_to_id.get(mat).copied() + } +} + From fd418c020ddce3d9194a01cae46a330bad3e152a Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Thu, 18 Sep 2025 14:46:59 -0700 Subject: [PATCH 45/51] New state struct --- src/assembly.rs | 67 +++++++++++-------------------------------------- src/lib.rs | 1 + src/matches.rs | 17 +++++++------ src/memoize.rs | 10 +++++--- src/state.rs | 54 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 87 insertions(+), 62 deletions(-) create mode 100644 src/state.rs diff --git a/src/assembly.rs b/src/assembly.rs index 958deaa3..69b661ad 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -30,7 +30,7 @@ use clap::ValueEnum; use rayon::iter::{IntoParallelRefIterator, ParallelIterator, IndexedParallelIterator}; use crate::{ - bounds::Bound, canonize::CanonizeMode, enumerate::EnumerateMode, kernels::KernelMode, matches::Matches, memoize::{MemoizeMode, NewCache}, molecule:: Molecule, utils::{connected_components_under_edges} + bounds::Bound, canonize::CanonizeMode, enumerate::EnumerateMode, kernels::KernelMode, matches::Matches, memoize::{MemoizeMode, NewCache}, molecule:: Molecule, state::State, utils::connected_components_under_edges }; /// Parallelization strategy for the recursive search phase. @@ -173,37 +173,25 @@ fn _is_usable(state: &[BitSet], h1: &BitSet, h2: &BitSet, masks: &mut Vec, - state: &[BitSet], - state_index: usize, + state: &State, best_index: Arc, bounds: &[Bound], cache: &mut NewCache, parallel_mode: ParallelMode, - kernel_mode: KernelMode, ) -> (usize, usize) { // if removal_order.len() == 1 {println!("{:?}", removal_order);} // println!("{:?}", state); - // let mut memoize_time = Duration::new(0, 0); - // let mut bucket_time = Duration::new(0, 0); - // let mut component_time = Duration::new(0, 0); - // let mut int_time = Duration::new(0, 0); - // let mut vec_time = Duration::new(0, 0); - // let mut matches_time = Duration::new(0, 0); - // let mut sort_time = Duration::new(0, 0); - // Memoization - if cache.memoize_state(mol, &state, state_index, &removal_order) { - return (state_index, 1); + if cache.memoize_state(mol, state) { + return (state.index(), 1); } - let valid_matches = matches.generate_matches(mol, state, state_index, last_removed, best_index.load(Relaxed), bounds); + let (frags, valid_matches) = matches.generate_matches(mol, state, best_index.load(Relaxed), bounds); // Keep track of the best assembly index found in any of this assembly // state's children and the number of states searched, including this one. - let best_child_index = AtomicUsize::from(state_index); + let best_child_index = AtomicUsize::from(state.index()); let states_searched = AtomicUsize::from(1); // Define a closure that handles recursing to a new assembly state based on @@ -214,7 +202,7 @@ pub fn recurse_index_search( let h2 = matches.get_frag(v.1); // TODO: try keeping track of fragment to elimate some some search - if let Some(fragments) = fragments(mol, &state, h1, h2) { + if let Some(fragments) = fragments(mol, &frags, h1, h2) { // If using depth-one parallelism, all descendant states should be // computed serially. let new_parallel = if parallel_mode == ParallelMode::DepthOne { @@ -223,24 +211,17 @@ pub fn recurse_index_search( parallel_mode }; + let new_state = state.update(fragments, h1.len(), i, matches.match_id(&v).unwrap()); + // Recurse using the remaining matches and updated fragments. let (child_index, child_states_searched) = recurse_index_search( mol, matches, - matches.get_match_id(&v).unwrap(), - { - let mut clone = removal_order.clone(); - // clone.push(*matches.get(&v).unwrap()); - clone.push(i); - clone - }, - &fragments, - state_index - h1.len() + 1, + &new_state, best_index.clone(), bounds, &mut cache.clone(), new_parallel, - kernel_mode, ); // Update the best assembly indices (across children states and @@ -339,47 +320,29 @@ pub fn index_search( canonize_mode: CanonizeMode, parallel_mode: ParallelMode, memoize_mode: MemoizeMode, - kernel_mode: KernelMode, + _kernel_mode: KernelMode, bounds: &[Bound], _clique: bool, ) -> (u32, u32, usize) { - // Catch not-yet-implemented modes. - // Enumerate non-overlapping isomorphic subgraph pairs. let matches = Matches::new(mol, canonize_mode); // Create memoization cache. - //let mut cache = Cache::new(memoize_mode, canonize_mode); let mut cache = NewCache::new(memoize_mode); - /*let graph = { - if clique { - Some(CompatGraph::new(&matches, &dag)) - } - else { - None - } - };*/ - - // Initialize the first fragment as the entire graph. - let mut init = BitSet::new(); - init.extend(mol.graph().edge_indices().map(|ix| ix.index())); + // Initialize state + let state = State::new(mol); // Search for the shortest assembly pathway recursively. - let edge_count = mol.graph().edge_count(); - let best_index = Arc::new(AtomicUsize::from(edge_count - 1)); + let best_index = Arc::new(AtomicUsize::from(mol.graph().edge_count() - 1)); let (index, states_searched) = recurse_index_search( mol, &matches, - 0, - Vec::new(), - &[init], - edge_count - 1, + &state, best_index, bounds, &mut cache, parallel_mode, - kernel_mode, ); // println!("Bounded: {}", cache.count()); diff --git a/src/lib.rs b/src/lib.rs index 25cd9a3a..80529e9d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -54,6 +54,7 @@ pub mod kernels; pub mod memoize; pub mod reductions; pub mod matches; +pub mod state; mod vf3; // Python wrapper. diff --git a/src/matches.rs b/src/matches.rs index 43a46f43..818fd012 100644 --- a/src/matches.rs +++ b/src/matches.rs @@ -5,7 +5,7 @@ use fxhash::FxHashMap; use itertools::Itertools; use petgraph::graph::EdgeIndex; -use crate::{bounds::Bound, canonize::{canonize, CanonizeMode, Labeling}, molecule::{Bond, Element, Molecule}, reductions::CompatGraph, utils::{connected_components_under_edges, edge_neighbors}}; +use crate::{bounds::Bound, canonize::{canonize, CanonizeMode, Labeling}, molecule::{Bond, Element, Molecule}, reductions::CompatGraph, state::State, utils::{connected_components_under_edges, edge_neighbors}}; pub struct DagNode { fragment: BitSet, @@ -258,8 +258,11 @@ impl Matches { } } - pub fn generate_matches(&self, mol: &Molecule, state: &[BitSet], state_index: usize, last_removed: usize, best: usize, bounds: &[Bound]) -> Vec<(usize, usize)> { + pub fn generate_matches(&self, mol: &Molecule, state: &State, best: usize, bounds: &[Bound]) -> (Vec, Vec<(usize, usize)>) { let num_edges = mol.graph().edge_count(); + let state_frags = state.fragments(); + let state_index = state.index(); + let last_removed = state.last_removed(); let mut valid_matches: Vec<(usize, usize)> = Vec::new(); let mut subgraphs: Vec<(usize, usize)> = Vec::new(); @@ -267,7 +270,7 @@ impl Matches { let mut buckets_by_len: Vec>> = Vec::new(); // Create subgraphs of size 1 - for (state_id, fragment) in state.iter().enumerate() { + for (state_id, fragment) in state_frags.iter().enumerate() { for edge_idx in fragment.iter() { subgraphs.push((edge_idx, state_id)); } @@ -277,12 +280,12 @@ impl Matches { // Subgraphs to extend in next loop let mut buckets: HashMap> = HashMap::new(); let mut new_subgraphs = Vec::new(); - let mut used_edge_mask = vec![BitSet::with_capacity(num_edges); state.len()]; + let mut used_edge_mask = vec![BitSet::with_capacity(num_edges); state_frags.len()]; // Extend each subgraph and bucket for (frag_id, state_id) in subgraphs { // Get state frament that this subgraph is contained in - let state_frag = &state[state_id]; + let state_frag = &state_frags[state_id]; // Get ids of extensions of this subgraph let children_ids = &self.dag[frag_id].children; @@ -494,7 +497,7 @@ impl Matches { valid_matches.sort(); valid_matches.reverse(); - valid_matches + (state, valid_matches) } pub fn len(&self) -> usize { @@ -505,7 +508,7 @@ impl Matches { &self.dag[id].fragment } - pub fn get_match_id(&self, mat: &(usize, usize)) -> Option { + pub fn match_id(&self, mat: &(usize, usize)) -> Option { self.match_to_id.get(mat).copied() } } diff --git a/src/memoize.rs b/src/memoize.rs index a251639e..3d4e5572 100644 --- a/src/memoize.rs +++ b/src/memoize.rs @@ -9,7 +9,7 @@ use std::sync::atomic::{AtomicUsize, Ordering::Relaxed}; use crate::{ canonize::{canonize, CanonizeMode, Labeling}, - molecule::Molecule, + molecule::Molecule, state::State, }; /// Strategy for memoizing assembly states in the search phase. @@ -160,13 +160,17 @@ impl NewCache { } } - pub fn memoize_state(&mut self, mol: &Molecule, state: &[BitSet], state_index: usize, removal_order: &Vec) -> bool { + pub fn memoize_state(&mut self, mol: &Molecule, state: &State) -> bool { + let fragments = state.fragments(); + let state_index = state.index(); + let removal_order = state.removal_order(); + if self.memoize_mode == MemoizeMode::None { return false; } let mut frag_ids = Vec::new(); - for frag in state { + for frag in fragments { let id = self.get_canon_id(mol, frag); frag_ids.push(id); } diff --git a/src/state.rs b/src/state.rs new file mode 100644 index 00000000..4193bf2c --- /dev/null +++ b/src/state.rs @@ -0,0 +1,54 @@ +use bit_set::BitSet; + +use crate::molecule::Molecule; + +pub struct State { + fragments: Vec, + index: usize, + removal_order: Vec, + last_removed: usize, +} + +impl State { + pub fn new(mol: &Molecule) -> Self { + Self { + fragments: { + let mut init = BitSet::new(); + init.extend(mol.graph().edge_indices().map(|ix| ix.index())); + vec![init] + }, + index: mol.graph().edge_count(), + removal_order: Vec::new(), + last_removed: 0, + } + } + + pub fn update(&self, fragments: Vec, removed_len: usize, removed_idx: usize, last_removed: usize) -> Self { + Self { + fragments, + index: self.index - removed_len + 1, + removal_order: { + let mut clone = self.removal_order.clone(); + clone.push(removed_idx); + clone + }, + last_removed, + } + } + + pub fn fragments(&self) -> &[BitSet] { + &self.fragments + } + + pub fn index(&self) -> usize { + self.index + } + + pub fn removal_order(&self) -> &Vec { + &self.removal_order + } + + pub fn last_removed(&self) -> usize { + self.last_removed + } +} \ No newline at end of file From 688bbc685d22bbe92f96737c57f8949bd4b743bf Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Thu, 18 Sep 2025 15:11:27 -0700 Subject: [PATCH 46/51] Refactor bounding --- src/assembly.rs | 1 + src/matches.rs | 66 +++++++++++++++++++++++++++---------------------- 2 files changed, 38 insertions(+), 29 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 69b661ad..df1fe6ac 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -187,6 +187,7 @@ pub fn recurse_index_search( return (state.index(), 1); } + // Generate matches let (frags, valid_matches) = matches.generate_matches(mol, state, best_index.load(Relaxed), bounds); // Keep track of the best assembly index found in any of this assembly diff --git a/src/matches.rs b/src/matches.rs index 818fd012..79576c5c 100644 --- a/src/matches.rs +++ b/src/matches.rs @@ -392,8 +392,44 @@ impl Matches { // Use bounding strategy to find the largest length match // to try removing from this state. + let largest_length = self.bound(state_index, best, &masks, bounds); + + // Create matches + for bucket in buckets_by_len[largest_length-2..].iter().rev() { + for fragments in bucket.values() { + for i in 0..fragments.len() { + for j in i+1..fragments.len() { + let mut frag1_id = fragments[i].0; + let mut frag2_id = fragments[j].0; + + if frag1_id < frag2_id { + let temp = frag1_id; + frag1_id = frag2_id; + frag2_id = temp; + } + + if let Some(x) = self.match_to_id.get(&(frag1_id, frag2_id)){ + if *x >= last_removed { + valid_matches.push((frag1_id, frag2_id)); + } + } + } + } + } + } + + // Sort matches + valid_matches.sort(); + valid_matches.reverse(); + + (state, valid_matches) + } + + // Return the largest size removal that can possible result in savings + pub fn bound(&self, state_index: usize, best: usize, masks: &Vec>, bounds: &[Bound]) -> usize { let mut largest_length = 2; + for (i, list) in masks.iter().enumerate() { let mut stop = false; for bound_type in bounds { @@ -469,35 +505,7 @@ impl Matches { } } - // Create matches - for bucket in buckets_by_len[largest_length-2..].iter().rev() { - for fragments in bucket.values() { - for i in 0..fragments.len() { - for j in i+1..fragments.len() { - let mut frag1_id = fragments[i].0; - let mut frag2_id = fragments[j].0; - - if frag1_id < frag2_id { - let temp = frag1_id; - frag1_id = frag2_id; - frag2_id = temp; - } - - if let Some(x) = self.match_to_id.get(&(frag1_id, frag2_id)){ - if *x >= last_removed { - valid_matches.push((frag1_id, frag2_id)); - } - } - } - } - } - } - - // Sort matches - valid_matches.sort(); - valid_matches.reverse(); - - (state, valid_matches) + largest_length } pub fn len(&self) -> usize { From 39b72c413805392a24f540e213bf3583700a94ac Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Thu, 18 Sep 2025 15:44:05 -0700 Subject: [PATCH 47/51] Build clique --- src/matches.rs | 13 ++++++++++++- src/reductions.rs | 32 ++++++++++++++++++++++---------- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/src/matches.rs b/src/matches.rs index 79576c5c..b60c5045 100644 --- a/src/matches.rs +++ b/src/matches.rs @@ -23,7 +23,7 @@ pub struct Matches { id_to_match: Vec<(usize, usize)>, match_to_id: FxHashMap<(usize, usize), usize>, dag: Vec, - // clique: CompatGraph, + clique: CompatGraph, // clique_offset: usize, edge_types: Vec, } @@ -37,6 +37,14 @@ impl DagNode { children: Vec::new(), } } + + pub fn fragment(&self) -> &BitSet { + &self.fragment + } + + pub fn len(&self) -> usize { + self.fragment.len() + } } impl Matches { @@ -250,10 +258,13 @@ impl Matches { } } + let clique = CompatGraph::new(&dag, &id_to_match, 2); + Self { id_to_match, match_to_id, dag, + clique, edge_types, } } diff --git a/src/reductions.rs b/src/reductions.rs index f7e699c6..71f75c79 100644 --- a/src/reductions.rs +++ b/src/reductions.rs @@ -11,35 +11,46 @@ use bit_set::BitSet; use std::time::Instant; -use std::collections::HashMap; + +use crate::matches::DagNode; /// Graph representation of the compatibility of duplicatable subgraph pairs. pub struct CompatGraph { /// The graph is implemented as an adjacency matrix. The ith element of graph /// has a 1 at position j if {i, j} is an edge. graph: Vec, + offset: usize, } impl CompatGraph { /// Constructs a compatibility graph given a set of matches. - pub fn new(init_matches: &Vec<(usize, usize)>, dag: &HashMap)>) -> Self { + pub fn new(dag: &Vec, matches: &Vec<(usize, usize)>, largest_len: usize) -> Self { let start = Instant::now(); - let size = init_matches.len(); + + // Find the index of the first match in the clique + // TODO: what if largest_len < maximum length in dag? + let mut offset = 0; + while dag[matches[offset].0].len() < largest_len { + offset += 1; + } + + // Compute number of nodes in clique + let size = matches.len() - offset; // Initialize weights and empty graph let mut init_graph: Vec = Vec::with_capacity(size); - for _ in 0..init_matches.len() { + for _ in 0..size { init_graph.push(BitSet::with_capacity(size)); } // Populate graph - for (idx1, (h1, h2)) in init_matches.iter().enumerate() { - for (idx2, (h1p, h2p)) in init_matches[idx1 + 1..].iter().enumerate() { + for (idx1, match1) in matches[offset..].iter().enumerate() { + for (idx2, match2) in matches[offset + idx1 + 1..].iter().enumerate() { let idx2 = idx2 + idx1 + 1; - let h1 = &dag.get(h1).unwrap().0; - let h1p = &dag.get(h1p).unwrap().0; - let h2 = &dag.get(h2).unwrap().0; - let h2p = &dag.get(h2p).unwrap().0; + let h1 = dag[match1.0].fragment(); + let h2 = dag[match1.1].fragment(); + let h1p = dag[match2.0].fragment(); + let h2p = dag[match2.1].fragment(); let forward_compatible = { h2.is_disjoint(h1p) && @@ -59,6 +70,7 @@ impl CompatGraph { Self { graph: init_graph, + offset, } } From 9a64cad57de0ab70e2486b70f1e94b4b11bc7986 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Thu, 18 Sep 2025 15:53:25 -0700 Subject: [PATCH 48/51] Fix off by 1 error in state --- src/state.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/state.rs b/src/state.rs index 4193bf2c..1d709b9b 100644 --- a/src/state.rs +++ b/src/state.rs @@ -17,7 +17,7 @@ impl State { init.extend(mol.graph().edge_indices().map(|ix| ix.index())); vec![init] }, - index: mol.graph().edge_count(), + index: mol.graph().edge_count() - 1, removal_order: Vec::new(), last_removed: 0, } From eae23de3ce0d77b1d997c07761408aabe0b38540 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Thu, 18 Sep 2025 16:09:35 -0700 Subject: [PATCH 49/51] Make clique an option --- src/assembly.rs | 4 ++-- src/matches.rs | 11 ++++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index df1fe6ac..74825f77 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -323,10 +323,10 @@ pub fn index_search( memoize_mode: MemoizeMode, _kernel_mode: KernelMode, bounds: &[Bound], - _clique: bool, + clique: bool, ) -> (u32, u32, usize) { // Enumerate non-overlapping isomorphic subgraph pairs. - let matches = Matches::new(mol, canonize_mode); + let matches = Matches::new(mol, canonize_mode, clique); // Create memoization cache. let mut cache = NewCache::new(memoize_mode); diff --git a/src/matches.rs b/src/matches.rs index b60c5045..5e753e9a 100644 --- a/src/matches.rs +++ b/src/matches.rs @@ -23,7 +23,7 @@ pub struct Matches { id_to_match: Vec<(usize, usize)>, match_to_id: FxHashMap<(usize, usize), usize>, dag: Vec, - clique: CompatGraph, + clique: Option, // clique_offset: usize, edge_types: Vec, } @@ -49,7 +49,7 @@ impl DagNode { impl Matches { // Generate DAG, clique, and other info from the base molecule. - pub fn new(mol: &Molecule, canonize_mode: CanonizeMode) -> Self { + pub fn new(mol: &Molecule, canonize_mode: CanonizeMode, use_clique: bool) -> Self { let num_edges = mol.graph().edge_count(); let mut subgraphs: Vec = Vec::new(); @@ -258,7 +258,12 @@ impl Matches { } } - let clique = CompatGraph::new(&dag, &id_to_match, 2); + let clique = if use_clique { + Some(CompatGraph::new(&dag, &id_to_match, 2)) + } + else { + None + }; Self { id_to_match, From 19533da28ee25e9803342ff005fa942d74103aab Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Thu, 18 Sep 2025 18:16:58 -0700 Subject: [PATCH 50/51] Add handoff from dag to clique --- src/assembly.rs | 9 ++++--- src/matches.rs | 69 +++++++++++++++++++++++++++++++++++++++++------ src/reductions.rs | 21 ++++++++++++--- src/state.rs | 26 ++++++++++++++---- 4 files changed, 106 insertions(+), 19 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 74825f77..315e84fc 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -181,14 +181,17 @@ pub fn recurse_index_search( ) -> (usize, usize) { // if removal_order.len() == 1 {println!("{:?}", removal_order);} // println!("{:?}", state); - + // Memoization if cache.memoize_state(mol, state) { return (state.index(), 1); } // Generate matches - let (frags, valid_matches) = matches.generate_matches(mol, state, best_index.load(Relaxed), bounds); + // Some additional fragmentation of the state may happen while generating matches, returned as frags + // Valid matches is list of matches to remove as pairs of fragment ids + // Clique subgraph is the nodes present in the clique at this state + let (frags, valid_matches, clique_subgraph) = matches.generate_matches(mol, state, best_index.load(Relaxed), bounds); // Keep track of the best assembly index found in any of this assembly // state's children and the number of states searched, including this one. @@ -212,7 +215,7 @@ pub fn recurse_index_search( parallel_mode }; - let new_state = state.update(fragments, h1.len(), i, matches.match_id(&v).unwrap()); + let new_state = state.update(matches, fragments, &v, i, &clique_subgraph); // Recurse using the remaining matches and updated fragments. let (child_index, child_states_searched) = recurse_index_search( diff --git a/src/matches.rs b/src/matches.rs index 5e753e9a..8fbd5a55 100644 --- a/src/matches.rs +++ b/src/matches.rs @@ -49,6 +49,7 @@ impl DagNode { impl Matches { // Generate DAG, clique, and other info from the base molecule. + // TODO: generate and return state framework instaed of triplet pub fn new(mol: &Molecule, canonize_mode: CanonizeMode, use_clique: bool) -> Self { let num_edges = mol.graph().edge_count(); @@ -274,7 +275,18 @@ impl Matches { } } - pub fn generate_matches(&self, mol: &Molecule, state: &State, best: usize, bounds: &[Bound]) -> (Vec, Vec<(usize, usize)>) { + pub fn generate_matches(&self, mol: &Molecule, state: &State, best: usize, bounds: &[Bound]) -> (Vec, Vec<(usize, usize)>, Option) { + self.valid_matches_dag(mol, state, best, bounds) + + // if state.use_subgraph() { + // self.valid_matches_subgraph() + // } + // else { + // self.valid_matches_dag() + // } + } + + fn valid_matches_dag(&self, mol: &Molecule, state: &State, best: usize, bounds: &[Bound]) -> (Vec, Vec<(usize, usize)>, Option) { let num_edges = mol.graph().edge_count(); let state_frags = state.fragments(); let state_index = state.index(); @@ -410,8 +422,28 @@ impl Matches { // to try removing from this state. let largest_length = self.bound(state_index, best, &masks, bounds); - // Create matches - for bucket in buckets_by_len[largest_length-2..].iter().rev() { + // Clique reduction subgraph from this state. If none we do not create a subgraph + let mut clique_subgraph = + if let Some(clique) = &self.clique { + if clique.max_len() >= largest_length { + Some(BitSet::with_capacity(clique.len())) + } + else { + None + } + } else { + None + }; + + // Create valid matches and subgraph if needed + for (bucket_idx, bucket) in buckets_by_len.iter().enumerate().rev() { + // If we do not have to create the subgraph, and buckets with this size + // have been bounded, then we can stop generating valid matches + let match_len = bucket_idx + 2; + if clique_subgraph == None && match_len < largest_length { + break; + } + for fragments in bucket.values() { for i in 0..fragments.len() { for j in i+1..fragments.len() { @@ -424,9 +456,17 @@ impl Matches { frag2_id = temp; } - if let Some(x) = self.match_to_id.get(&(frag1_id, frag2_id)){ - if *x >= last_removed { + if let Some(match_id) = self.match_to_id.get(&(frag1_id, frag2_id)) { + if *match_id >= last_removed { valid_matches.push((frag1_id, frag2_id)); + + // Add to subgraph if conditions are met + if let Some(sub) = clique_subgraph.as_mut() { + let clique_id = self.clique.as_ref().unwrap().clique_id(*match_id); + if clique_id != -1 { + sub.insert(clique_id as usize); + } + } } } } @@ -438,7 +478,11 @@ impl Matches { valid_matches.sort(); valid_matches.reverse(); - (state, valid_matches) + (state, valid_matches, clique_subgraph) + } + + fn valid_matches_subgraph(&self) { + } // Return the largest size removal that can possible result in savings @@ -528,12 +572,21 @@ impl Matches { self.id_to_match.len() } - pub fn get_frag(&self, id: usize) -> &BitSet { - &self.dag[id].fragment + pub fn get_frag(&self, frag_id: usize) -> &BitSet { + &self.dag[frag_id].fragment } pub fn match_id(&self, mat: &(usize, usize)) -> Option { self.match_to_id.get(mat).copied() } + + pub fn clique_max_len(&self) -> usize { + if let Some(clique) = &self.clique { + clique.max_len() + } + else { + 0 + } + } } diff --git a/src/reductions.rs b/src/reductions.rs index 71f75c79..4e27300e 100644 --- a/src/reductions.rs +++ b/src/reductions.rs @@ -19,18 +19,19 @@ pub struct CompatGraph { /// The graph is implemented as an adjacency matrix. The ith element of graph /// has a 1 at position j if {i, j} is an edge. graph: Vec, + max_len: usize, offset: usize, } impl CompatGraph { /// Constructs a compatibility graph given a set of matches. - pub fn new(dag: &Vec, matches: &Vec<(usize, usize)>, largest_len: usize) -> Self { + pub fn new(dag: &Vec, matches: &Vec<(usize, usize)>, max_len: usize) -> Self { let start = Instant::now(); // Find the index of the first match in the clique - // TODO: what if largest_len < maximum length in dag? + // TODO: what if max_len < maximum length in dag? let mut offset = 0; - while dag[matches[offset].0].len() < largest_len { + while dag[matches[offset].0].len() < max_len { offset += 1; } @@ -70,6 +71,7 @@ impl CompatGraph { Self { graph: init_graph, + max_len, offset, } } @@ -79,6 +81,19 @@ impl CompatGraph { self.graph.len() } + pub fn max_len(&self) -> usize { + self.max_len + } + + pub fn clique_id(&self, match_id: usize) -> isize { + if match_id < self.offset { + -1 + } + else { + (match_id - self.offset) as isize + } + } + /// Returns the degree vertex v in the subgraph pub fn degree(&self, v: usize, subgraph: &BitSet) -> usize { self.graph[v].intersection(subgraph).count() diff --git a/src/state.rs b/src/state.rs index 1d709b9b..69d4f145 100644 --- a/src/state.rs +++ b/src/state.rs @@ -1,9 +1,10 @@ use bit_set::BitSet; -use crate::molecule::Molecule; +use crate::{matches::Matches, molecule::Molecule}; pub struct State { fragments: Vec, + clique_subgraph: Option, index: usize, removal_order: Vec, last_removed: usize, @@ -17,22 +18,33 @@ impl State { init.extend(mol.graph().edge_indices().map(|ix| ix.index())); vec![init] }, + clique_subgraph: None, index: mol.graph().edge_count() - 1, removal_order: Vec::new(), last_removed: 0, } } - pub fn update(&self, fragments: Vec, removed_len: usize, removed_idx: usize, last_removed: usize) -> Self { + pub fn update(&self, matches: &Matches, fragments: Vec, mat: &(usize, usize), order_idx: usize, subgraph: &Option) -> Self { + let match_len = matches.get_frag(mat.0).len(); + Self { fragments, - index: self.index - removed_len + 1, + index: self.index - match_len + 1, removal_order: { let mut clone = self.removal_order.clone(); - clone.push(removed_idx); + clone.push(order_idx); clone }, - last_removed, + last_removed: matches.match_id(mat).unwrap(), + clique_subgraph: { + if *subgraph != None && match_len <= matches.clique_max_len() { + Some(subgraph.as_ref().unwrap().clone()) + } + else { + None + } + } } } @@ -51,4 +63,8 @@ impl State { pub fn last_removed(&self) -> usize { self.last_removed } + + pub fn use_subgraph(&self) -> bool { + self.clique_subgraph != None + } } \ No newline at end of file From 83ca9815ec6eeb68d4997367ad09245f8deb020f Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Thu, 18 Sep 2025 23:40:16 -0700 Subject: [PATCH 51/51] Small fixes --- src/reductions.rs | 4 +++- src/state.rs | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/reductions.rs b/src/reductions.rs index 4e27300e..5b60771b 100644 --- a/src/reductions.rs +++ b/src/reductions.rs @@ -31,7 +31,7 @@ impl CompatGraph { // Find the index of the first match in the clique // TODO: what if max_len < maximum length in dag? let mut offset = 0; - while dag[matches[offset].0].len() < max_len { + while dag[matches[offset].0].len() > max_len { offset += 1; } @@ -85,6 +85,8 @@ impl CompatGraph { self.max_len } + // Takes a match id and returns its id in the clique + // Returns -1 if the match is not used in the clique pub fn clique_id(&self, match_id: usize) -> isize { if match_id < self.offset { -1 diff --git a/src/state.rs b/src/state.rs index 69d4f145..4ad1a82f 100644 --- a/src/state.rs +++ b/src/state.rs @@ -39,7 +39,7 @@ impl State { last_removed: matches.match_id(mat).unwrap(), clique_subgraph: { if *subgraph != None && match_len <= matches.clique_max_len() { - Some(subgraph.as_ref().unwrap().clone()) + subgraph.clone() } else { None