Draft
Commits (51), all by Garrett-Pz:
d80d734  Jul 25, 2025  Add clique reduction
5ebfad0  Jul 28, 2025  Implement subgraph and new loop in index search
6cb9fe1  Jul 28, 2025  Implement using CompatGraph to choose next match
6ccb952  Jul 28, 2025  Add CLI argument for clique
f8f0437  Jul 28, 2025  Fix search with clique
39e91db  Jul 28, 2025  Add clique budget bound
a981ef4  Jul 28, 2025  Add default clique bounds. Only allow clique bounds with clique
2bac102  Jul 28, 2025  Add cover bound
fe7b47a  Jul 28, 2025  Add deletion kernel. Not fully implemented yet
e94d09c  Jul 28, 2025  Add support for kernel modes. Implement deletion kernel
51c961d  Jul 29, 2025  Add inclusion clone for serial search
6aec6a9  Jul 29, 2025  Change must_include_kernel return to usize
82c6701  Jul 29, 2025  Implement inclusion kernel for parallel modes
a468413  Jul 29, 2025  Optimize largest_remove computation
0f45f5a  Jul 29, 2025  Add some comments and documentation
69082d0  Jul 29, 2025  Optimize inclusion_kernel return
30612c0  Aug 4, 2025   Fix removal_order
26cea2f  Aug 21, 2025  Add new int bound
f0c38bc  Aug 21, 2025  Add cache counter
4f87f7a  Aug 21, 2025  Faster matches
fef2d13  Aug 21, 2025  Remove cache counter
ccd5a79  Aug 26, 2025  new memoize strategy
4f0f7c0  Aug 26, 2025  Modified int bound
7bd9fdd  Aug 27, 2025  New (unoptimized) method of calculating masks
66256cd  Aug 28, 2025  Optimize masks computation
8742844  Aug 28, 2025  Build dag
aaca9c8  Sep 2, 2025   New matches generation with DAG
e700ed6  Sep 3, 2025   Enumerate matches using dag
0997b00  Sep 3, 2025   Create masks with frag_id to state_id map
87ce547  Sep 3, 2025   Create masks by searching through state
e66d911  Sep 3, 2025   Add int bound
a14e9ed  Sep 3, 2025   Cleanup some code
b2689d8  Sep 3, 2025   Don't extend subgraphs without a match
fc86fee  Sep 4, 2025   Add timers
266cdca  Sep 4, 2025   Remove itertools.combinations from search
f7b15f3  Sep 4, 2025   Move mask creation into match enumeration
0b90e2e  Sep 4, 2025   Build matches after bucketing
9e7e1f5  Sep 4, 2025   Move when memoization happens
afb46b3  Sep 5, 2025   Add simple vector bound
e1650d9  Sep 5, 2025   Move vector bound
883444f  Sep 9, 2025   Replace bugged vector bound
09c2cf5  Sep 9, 2025   Update cargo.toml
535c251  Sep 12, 2025  Change vector bound
d820b0f  Sep 18, 2025  New matches struct
fd418c0  Sep 18, 2025  New state struct
688bbc6  Sep 18, 2025  Refactor bounding
39b72c4  Sep 18, 2025  Build clique
9a64cad  Sep 18, 2025  Fix off by 1 error in state
eae23de  Sep 19, 2025  Make clique an option
19533da  Sep 19, 2025  Add handoff from dag to clique
83ca981  Sep 19, 2025  Small fixes
30 changes: 28 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 2 additions & 0 deletions Cargo.toml
@@ -18,6 +18,8 @@ petgraph = "0.6.5"
 pyo3 = { version = "0.24.1", features = ["abi3-py310", "extension-module"]}
 rayon = "1.10.0"
 dashmap = "6.1.0"
+fxhash = "0.2.1"
+itertools = "0.14.0"
 
 [dev-dependencies]
 criterion = "0.3"
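Both new dependencies feed the reworked matching machinery: itertools for combinatorial enumeration and fxhash for a fast, non-cryptographic hasher on hash-heavy paths. As a minimal sketch (an assumption; the actual call sites are in modules this diff does not show), swapping std's default hasher for fxhash in the isomorphism-class bucketing looks like this:

```rust
use fxhash::FxHashMap;

// `Labeling` stands in for the crate's canonical-form type; a String is used
// here only to keep the sketch self-contained.
type Labeling = String;

/// Bin subgraphs (represented as u64 edge masks) by canonical labeling.
/// Subgraphs with identical labelings are isomorphic.
fn bucket_by_class(labeled: Vec<(Labeling, u64)>) -> FxHashMap<Labeling, Vec<u64>> {
    let mut classes: FxHashMap<Labeling, Vec<u64>> = FxHashMap::default();
    for (label, subgraph) in labeled {
        classes.entry(label).or_default().push(subgraph);
    }
    classes
}

fn main() {
    let classes = bucket_by_class(vec![("C-C".into(), 0b011), ("C-C".into(), 0b110)]);
    assert_eq!(classes["C-C"].len(), 2);
}
```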
180 changes: 63 additions & 117 deletions src/assembly.rs
@@ -19,25 +19,18 @@
 //! ```
 
 use std::{
-    collections::HashMap,
     sync::{
         atomic::{AtomicUsize, Ordering::Relaxed},
         Arc,
-    },
+    }, time::{Duration, Instant}
 };
 
 use bit_set::BitSet;
 use clap::ValueEnum;
-use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator};
+use rayon::iter::{IntoParallelRefIterator, ParallelIterator, IndexedParallelIterator};
 
 use crate::{
-    bounds::{bound_exceeded, Bound},
-    canonize::{canonize, CanonizeMode, Labeling},
-    enumerate::{enumerate_subgraphs, EnumerateMode},
-    kernels::KernelMode,
-    memoize::{Cache, MemoizeMode},
-    molecule::Molecule,
-    utils::connected_components_under_edges,
+    bounds::Bound, canonize::CanonizeMode, enumerate::EnumerateMode, kernels::KernelMode, matches::Matches, memoize::{MemoizeMode, NewCache}, molecule::Molecule, state::State, utils::connected_components_under_edges
 };
 
 /// Parallelization strategy for the recursive search phase.
@@ -94,58 +87,6 @@ pub fn depth(mol: &Molecule) -> u32 {
     ix
 }
 
-/// Helper function for [`index_search`]; only public for benchmarking.
-///
-/// Return all pairs of non-overlapping, isomorphic subgraphs in the molecule,
-/// sorted to guarantee deterministic iteration.
-pub fn matches(
-    mol: &Molecule,
-    enumerate_mode: EnumerateMode,
-    canonize_mode: CanonizeMode,
-) -> Vec<(BitSet, BitSet)> {
-    // Enumerate all connected, non-induced subgraphs with at most |E|/2 edges
-    // and bin them into isomorphism classes using canonization.
-    let mut isomorphism_classes = HashMap::<Labeling, Vec<BitSet>>::new();
-    for subgraph in enumerate_subgraphs(mol, enumerate_mode) {
-        isomorphism_classes
-            .entry(canonize(mol, &subgraph, canonize_mode))
-            .and_modify(|bucket| bucket.push(subgraph.clone()))
-            .or_insert(vec![subgraph.clone()]);
-    }
-
-    // In each isomorphism class, collect non-overlapping pairs of subgraphs.
-    let mut matches = Vec::new();
-    for bucket in isomorphism_classes.values() {
-        for (i, first) in bucket.iter().enumerate() {
-            for second in &bucket[i..] {
-                if first.is_disjoint(second) {
-                    if first > second {
-                        matches.push((first.clone(), second.clone()));
-                    } else {
-                        matches.push((second.clone(), first.clone()));
-                    }
-                }
-            }
-        }
-    }
-
-    // Sort pairs in a deterministic order and return.
-    matches.sort_by(|e1, e2| {
-        let ord = [
-            e2.0.len().cmp(&e1.0.len()), // Decreasing subgraph size.
-            e1.0.cmp(&e2.0),             // First subgraph lexicographical.
-            e1.1.cmp(&e2.1),             // Second subgraph lexicographical.
-        ];
-        let mut i = 0;
-        while ord[i] == std::cmp::Ordering::Equal {
-            i += 1;
-        }
-        ord[i]
-    });
-
-    matches
-}
-
 /// Determine the fragments produced from the given assembly state by removing
 /// the given pair of non-overlapping, isomorphic subgraphs and then adding one
 /// back; return `None` if not possible.
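The helper deleted above is superseded by a `Matches` struct from the new `matches` module, whose file is not part of this diff. The skeleton below is reconstructed from this file's call sites alone (`Matches::new`, `generate_matches`, `get_frag`, `len`), so the fields and bodies are assumptions rather than the PR's actual definition:

```rust
use bit_set::BitSet;

// Hypothetical skeleton inferred from how assembly.rs uses the type; the real
// implementation (DAG construction, clique handoff, mask bookkeeping) lives in
// the crate's `matches` module.
pub struct Matches {
    frags: Vec<BitSet>,         // distinct subgraphs, addressed by fragment id
    pairs: Vec<(usize, usize)>, // non-overlapping isomorphic pairs of frag ids
}

impl Matches {
    /// Number of non-overlapping isomorphic subgraph pairs.
    pub fn len(&self) -> usize {
        self.pairs.len()
    }

    /// Resolve a fragment id from a match pair to its edge set.
    pub fn get_frag(&self, id: usize) -> &BitSet {
        &self.frags[id]
    }
}
```

At search time, `generate_matches` filters the stored pairs down to those usable from the current state, returning any extra fragmentation (`frags`), the surviving pairs (`valid_matches`), and the clique nodes alive at this state (`clique_subgraph`).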
Expand Down Expand Up @@ -193,21 +134,35 @@ fn fragments(mol: &Molecule, state: &[BitSet], h1: &BitSet, h2: &BitSet) -> Opti
Some(fragments)
}

fn _is_usable(state: &[BitSet], h1: &BitSet, h2: &BitSet, masks: &mut Vec<Vec<BitSet>>) -> bool {
let f1 = state.iter().enumerate().find(|(_, c)| h1.is_subset(c));
let f2 = state.iter().enumerate().find(|(_, c)| h2.is_subset(c));

if let (Some((i1, _)), Some((i2, _))) = (f1, f2) {
masks[h1.len() - 2][i1].union_with(h1);
masks[h1.len() - 2][i2].union_with(h2);
true
} else {
false
}
}

/// Recursive helper for [`index_search`], only public for benchmarking.
///
/// Inputs:
/// - `mol`: The molecule whose assembly index is being calculated.
/// - `matches`: The remaining non-overlapping isomorphic subgraph pairs.
/// - `graph`: If clique is enabled, the graph of compatible isomorphic subgraph paris.
/// - `subgraph`: A bitset with length of matches. Has a 1 for every match to be searched in this state.
/// - `removal_order`: TODO
/// - `state`: The current assembly state, i.e., a list of fragments.
/// - `state_index`: This assembly state's upper bound on the assembly index,
/// i.e., edges(mol) - 1 - [edges(subgraphs removed) - #(subgraphs removed)].
/// - `best_index`: The smallest assembly index for all assembly states so far.
/// - `largest_remove`: An upper bound on the size of fragments that can be
/// removed from this or any descendant assembly state.
/// - `bounds`: The list of bounding strategies to apply.
/// - `cache`: TODO
/// - `parallel_mode`: The parallelism mode for this state's match iteration.
/// - `kernel_mode`: The kernelization mode for this state.
///
/// Returns, from this assembly state and any of its descendents:
/// - `usize`: An updated upper bound on the assembly index. (Note: If this
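The upper-bound bookkeeping documented above is easy to sanity-check: removing one copy of a duplicated h-edge subgraph deletes h edges but costs one join step, so the bound drops by h - 1 per removal, matching the old code's `state_index - h1.len() + 1`. A minimal check:

```rust
/// Upper bound on the assembly index after a sequence of duplicate removals:
/// edges(mol) - 1 - [edges(subgraphs removed) - #(subgraphs removed)].
fn state_index_bound(mol_edges: usize, removed_sizes: &[usize]) -> usize {
    removed_sizes.iter().fold(mol_edges - 1, |bound, &h| bound - h + 1)
}

fn main() {
    // A 10-edge molecule starts at the trivial bound of 9 joins.
    assert_eq!(state_index_bound(10, &[]), 9);
    // Removing a duplicated 3-edge subgraph saves 3 - 1 = 2 steps.
    assert_eq!(state_index_bound(10, &[3]), 7);
    // A further duplicated 2-edge removal saves one more step.
    assert_eq!(state_index_bound(10, &[3, 2]), 6);
}
```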
@@ -217,39 +172,41 @@ fn fragments(mol: &Molecule, state: &[BitSet], h1: &BitSet, h2: &BitSet) -> Opti
 #[allow(clippy::too_many_arguments)]
 pub fn recurse_index_search(
     mol: &Molecule,
-    matches: &[(BitSet, BitSet)],
-    removal_order: Vec<usize>,
-    state: &[BitSet],
-    state_index: usize,
+    matches: &Matches,
+    state: &State,
     best_index: Arc<AtomicUsize>,
-    largest_remove: usize,
     bounds: &[Bound],
-    cache: &mut Cache,
+    cache: &mut NewCache,
     parallel_mode: ParallelMode,
 ) -> (usize, usize) {
-    // If any bounds would prune this assembly state or if memoization is
-    // enabled and this assembly state is preempted by the cached state, halt.
-    if bound_exceeded(
-        mol,
-        state,
-        state_index,
-        best_index.load(Relaxed),
-        largest_remove,
-        bounds,
-    ) || cache.memoize_state(mol, state, state_index, &removal_order)
-    {
-        return (state_index, 1);
+    // If memoization is enabled and this assembly state is preempted by the
+    // cached state, halt.
+    if cache.memoize_state(mol, state) {
+        return (state.index(), 1);
     }
 
+    // Generate the matches available from this state. Generating them may
+    // fragment the state further (returned as `frags`); `valid_matches` lists
+    // the removable matches as pairs of fragment ids, and `clique_subgraph`
+    // holds the clique nodes present at this state.
+    let (frags, valid_matches, clique_subgraph) = matches.generate_matches(mol, state, best_index.load(Relaxed), bounds);
+
     // Keep track of the best assembly index found in any of this assembly
     // state's children and the number of states searched, including this one.
-    let best_child_index = AtomicUsize::from(state_index);
+    let best_child_index = AtomicUsize::from(state.index());
     let states_searched = AtomicUsize::from(1);
 
     // Define a closure that handles recursing to a new assembly state based on
     // the given (enumerated) pair of non-overlapping isomorphic subgraphs.
-    let recurse_on_match = |i: usize, h1: &BitSet, h2: &BitSet| {
-        if let Some(fragments) = fragments(mol, state, h1, h2) {
+    let recurse_on_match = |i: usize, v: (usize, usize)| {
+        // Resolve the pair's fragment ids to their edge sets.
+        let h1 = matches.get_frag(v.0);
+        let h2 = matches.get_frag(v.1);
+
+        // TODO: try keeping track of fragments to eliminate some search.
+        if let Some(fragments) = fragments(mol, &frags, h1, h2) {
             // If using depth-one parallelism, all descendant states should be
             // computed serially.
             let new_parallel = if parallel_mode == ParallelMode::DepthOne {
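`best_index` is shared across the whole search through an `Arc<AtomicUsize>`, while `best_child_index` is local to this state's children; both are read with `Relaxed` ordering. The elided remainder of this closure folds each child's result into those minima. A sketch of that pattern (an assumption, since the exact update lines are not shown in this hunk):

```rust
use std::sync::{
    atomic::{AtomicUsize, Ordering::Relaxed},
    Arc,
};

/// Publish one child's result into the local and global minima. `fetch_min`
/// atomically stores the smaller of the current and given values, so
/// concurrent children cannot overwrite each other's improvements.
fn report_child_result(
    child_index: usize,
    best_child_index: &AtomicUsize,
    best_index: &Arc<AtomicUsize>,
) {
    best_child_index.fetch_min(child_index, Relaxed);
    best_index.fetch_min(child_index, Relaxed);
}

fn main() {
    let best_index = Arc::new(AtomicUsize::new(9));
    let best_child_index = AtomicUsize::new(9);
    report_child_result(7, &best_child_index, &best_index);
    assert_eq!(best_index.load(Relaxed), 7);
    assert_eq!(best_child_index.load(Relaxed), 7);
}
```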
@@ -258,19 +215,14 @@ pub fn recurse_index_search(
                 parallel_mode
             };
 
+            // Apply the chosen match and build the child state.
+            let new_state = state.update(matches, fragments, &v, i, &clique_subgraph);
+
             // Recurse using the remaining matches and updated fragments.
             let (child_index, child_states_searched) = recurse_index_search(
                 mol,
-                &matches[i + 1..],
-                {
-                    let mut clone = removal_order.clone();
-                    clone.push(i);
-                    clone
-                },
-                &fragments,
-                state_index - h1.len() + 1,
+                matches,
+                &new_state,
                 best_index.clone(),
-                h1.len(),
                 bounds,
                 &mut cache.clone(),
                 new_parallel,
@@ -286,15 +238,15 @@
 
     // Use the iterator type corresponding to the specified parallelism mode.
     if parallel_mode == ParallelMode::None {
-        matches
+        valid_matches
             .iter()
             .enumerate()
-            .for_each(|(i, (h1, h2))| recurse_on_match(i, h1, h2));
+            .for_each(|(i, v)| recurse_on_match(i, *v));
     } else {
-        matches
+        valid_matches
             .par_iter()
             .enumerate()
-            .for_each(|(i, (h1, h2))| recurse_on_match(i, h1, h2));
+            .for_each(|(i, v)| recurse_on_match(i, *v));
     }
 
     (
@@ -368,44 +320,37 @@
 /// ```
 pub fn index_search(
     mol: &Molecule,
-    enumerate_mode: EnumerateMode,
+    _enumerate_mode: EnumerateMode,
     canonize_mode: CanonizeMode,
     parallel_mode: ParallelMode,
     memoize_mode: MemoizeMode,
-    kernel_mode: KernelMode,
+    _kernel_mode: KernelMode,
     bounds: &[Bound],
+    clique: bool,
 ) -> (u32, u32, usize) {
-    // Catch not-yet-implemented modes.
-    if kernel_mode != KernelMode::None {
-        panic!("The chosen --kernel mode is not implemented yet!")
-    }
-
     // Enumerate non-overlapping isomorphic subgraph pairs.
-    let matches = matches(mol, enumerate_mode, canonize_mode);
+    let matches = Matches::new(mol, canonize_mode, clique);
 
     // Create memoization cache.
-    let mut cache = Cache::new(memoize_mode, canonize_mode);
+    let mut cache = NewCache::new(memoize_mode);
 
-    // Initialize the first fragment as the entire graph.
-    let mut init = BitSet::new();
-    init.extend(mol.graph().edge_indices().map(|ix| ix.index()));
+    // Initialize the root assembly state.
+    let state = State::new(mol);
 
     // Search for the shortest assembly pathway recursively.
-    let edge_count = mol.graph().edge_count();
-    let best_index = Arc::new(AtomicUsize::from(edge_count - 1));
+    let best_index = Arc::new(AtomicUsize::from(mol.graph().edge_count() - 1));
     let (index, states_searched) = recurse_index_search(
         mol,
         &matches,
-        Vec::new(),
-        &[init],
-        edge_count - 1,
+        &state,
         best_index,
-        edge_count,
         bounds,
         &mut cache,
         parallel_mode,
     );
 
     (index as u32, matches.len() as u32, states_searched)
 }

Expand Down Expand Up @@ -437,6 +382,7 @@ pub fn index(mol: &Molecule) -> u32 {
MemoizeMode::CanonIndex,
KernelMode::None,
&[Bound::Int, Bound::VecSimple, Bound::VecSmallFrags],
false,
)
.0
}