Draft
Commits (51), all by Garrett-Pz:
d80d734  Jul 25, 2025  Add clique reduction
5ebfad0  Jul 28, 2025  Implement subgraph and new loop in index search
6cb9fe1  Jul 28, 2025  Implement using CompatGraph to choose next match
6ccb952  Jul 28, 2025  Add CLI argument for clique
f8f0437  Jul 28, 2025  Fix search with clique
39e91db  Jul 28, 2025  Add clique budget bound
a981ef4  Jul 28, 2025  Add default clique bounds. Only allow clique bounds with clique
2bac102  Jul 28, 2025  Add cover bound
fe7b47a  Jul 28, 2025  Add deletion kernel. Not fully implemented yet
e94d09c  Jul 28, 2025  Add support for kernel modes. Implement deletion kernel
51c961d  Jul 29, 2025  Add inclusion clone for serial search
6aec6a9  Jul 29, 2025  Change must_include_kernel return to usize
82c6701  Jul 29, 2025  Implement inclusion kernel for parallel modes
a468413  Jul 29, 2025  Optimize largest_remove computation
0f45f5a  Jul 29, 2025  Add some comments and documentation
69082d0  Jul 29, 2025  Optimize inclusion_kernel return
30612c0  Aug 4, 2025   Fix removal_order
26cea2f  Aug 21, 2025  Add new int bound
f0c38bc  Aug 21, 2025  Add cache counter
4f87f7a  Aug 21, 2025  Faster matches
fef2d13  Aug 21, 2025  Remove cache counter
ccd5a79  Aug 26, 2025  new memoize strategy
4f0f7c0  Aug 26, 2025  Modified int bound
7bd9fdd  Aug 27, 2025  New (unoptimized) method of calculating masks
66256cd  Aug 28, 2025  Optimize masks computation
8742844  Aug 28, 2025  Build dag
aaca9c8  Sep 2, 2025   New matches generation with DAG
e700ed6  Sep 3, 2025   Enumerate matches using dag
0997b00  Sep 3, 2025   Create masks with frag_id to state_id map
87ce547  Sep 3, 2025   Create masks by searching through state
e66d911  Sep 3, 2025   Add int bound
a14e9ed  Sep 3, 2025   Cleanup some code
b2689d8  Sep 3, 2025   Don't extend subgraphs without a match
fc86fee  Sep 4, 2025   Add timers
266cdca  Sep 4, 2025   Remove itertools.combinations from search
f7b15f3  Sep 4, 2025   Move mask creation into match enumeration
0b90e2e  Sep 4, 2025   Build matches after bucketing
9e7e1f5  Sep 4, 2025   Move when memoization happens
afb46b3  Sep 5, 2025   Add simple vector bound
e1650d9  Sep 5, 2025   Move vector bound
883444f  Sep 9, 2025   Replace bugged vector bound
09c2cf5  Sep 9, 2025   Update cargo.toml
535c251  Sep 12, 2025  Change vector bound
d820b0f  Sep 18, 2025  New matches struct
fd418c0  Sep 18, 2025  New state struct
688bbc6  Sep 18, 2025  Refactor bounding
39b72c4  Sep 18, 2025  Build clique
9a64cad  Sep 18, 2025  Fix off by 1 error in state
eae23de  Sep 19, 2025  Make clique an option
19533da  Sep 19, 2025  Add handoff from dag to clique
83ca981  Sep 19, 2025  Small fixes
30 changes: 28 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 2 additions & 0 deletions Cargo.toml
@@ -18,6 +18,8 @@ petgraph = "0.6.5"
 pyo3 = { version = "0.24.1", features = ["abi3-py310", "extension-module"]}
 rayon = "1.10.0"
 dashmap = "6.1.0"
+fxhash = "0.2.1"
+itertools = "0.14.0"
 
 [dev-dependencies]
 criterion = "0.3"
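Both new dependencies feed the reworked matching machinery: itertools for combinatorial enumeration and fxhash for a fast, non-cryptographic hasher on hash-heavy paths. As a minimal sketch (an assumption; the actual call sites are in modules this diff does not show), swapping std's default hasher for fxhash in the isomorphism-class bucketing looks like this:

```rust
use fxhash::FxHashMap;

// `Labeling` stands in for the crate's canonical-form type; a String is used
// here only to keep the sketch self-contained.
type Labeling = String;

/// Bin subgraphs (represented as u64 edge masks) by canonical labeling.
/// Subgraphs with identical labelings are isomorphic.
fn bucket_by_class(labeled: Vec<(Labeling, u64)>) -> FxHashMap<Labeling, Vec<u64>> {
    let mut classes: FxHashMap<Labeling, Vec<u64>> = FxHashMap::default();
    for (label, subgraph) in labeled {
        classes.entry(label).or_default().push(subgraph);
    }
    classes
}

fn main() {
    let classes = bucket_by_class(vec![("C-C".into(), 0b011), ("C-C".into(), 0b110)]);
    assert_eq!(classes["C-C"].len(), 2);
}
```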
180 changes: 63 additions & 117 deletions src/assembly.rs
@@ -19,25 +19,18 @@
 //! ```
 
 use std::{
-    collections::HashMap,
     sync::{
         atomic::{AtomicUsize, Ordering::Relaxed},
         Arc,
-    },
+    }, time::{Duration, Instant}
 };
 
 use bit_set::BitSet;
 use clap::ValueEnum;
-use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator};
+use rayon::iter::{IntoParallelRefIterator, ParallelIterator, IndexedParallelIterator};
 
 use crate::{
-    bounds::{bound_exceeded, Bound},
-    canonize::{canonize, CanonizeMode, Labeling},
-    enumerate::{enumerate_subgraphs, EnumerateMode},
-    kernels::KernelMode,
-    memoize::{Cache, MemoizeMode},
-    molecule::Molecule,
-    utils::connected_components_under_edges,
+    bounds::Bound, canonize::CanonizeMode, enumerate::EnumerateMode, kernels::KernelMode, matches::Matches, memoize::{MemoizeMode, NewCache}, molecule::Molecule, state::State, utils::connected_components_under_edges
 };
 
 /// Parallelization strategy for the recursive search phase.
@@ -94,58 +87,6 @@ pub fn depth(mol: &Molecule) -> u32 {
     ix
 }
 
-/// Helper function for [`index_search`]; only public for benchmarking.
-///
-/// Return all pairs of non-overlapping, isomorphic subgraphs in the molecule,
-/// sorted to guarantee deterministic iteration.
-pub fn matches(
-    mol: &Molecule,
-    enumerate_mode: EnumerateMode,
-    canonize_mode: CanonizeMode,
-) -> Vec<(BitSet, BitSet)> {
-    // Enumerate all connected, non-induced subgraphs with at most |E|/2 edges
-    // and bin them into isomorphism classes using canonization.
-    let mut isomorphism_classes = HashMap::<Labeling, Vec<BitSet>>::new();
-    for subgraph in enumerate_subgraphs(mol, enumerate_mode) {
-        isomorphism_classes
-            .entry(canonize(mol, &subgraph, canonize_mode))
-            .and_modify(|bucket| bucket.push(subgraph.clone()))
-            .or_insert(vec![subgraph.clone()]);
-    }
-
-    // In each isomorphism class, collect non-overlapping pairs of subgraphs.
-    let mut matches = Vec::new();
-    for bucket in isomorphism_classes.values() {
-        for (i, first) in bucket.iter().enumerate() {
-            for second in &bucket[i..] {
-                if first.is_disjoint(second) {
-                    if first > second {
-                        matches.push((first.clone(), second.clone()));
-                    } else {
-                        matches.push((second.clone(), first.clone()));
-                    }
-                }
-            }
-        }
-    }
-
-    // Sort pairs in a deterministic order and return.
-    matches.sort_by(|e1, e2| {
-        let ord = [
-            e2.0.len().cmp(&e1.0.len()), // Decreasing subgraph size.
-            e1.0.cmp(&e2.0),             // First subgraph lexicographical.
-            e1.1.cmp(&e2.1),             // Second subgraph lexicographical.
-        ];
-        let mut i = 0;
-        while ord[i] == std::cmp::Ordering::Equal {
-            i += 1;
-        }
-        ord[i]
-    });
-
-    matches
-}
-
 /// Determine the fragments produced from the given assembly state by removing
 /// the given pair of non-overlapping, isomorphic subgraphs and then adding one
 /// back; return `None` if not possible.
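The helper deleted above is superseded by a `Matches` struct from the new `matches` module, whose file is not part of this diff. The skeleton below is reconstructed from this file's call sites alone (`Matches::new`, `generate_matches`, `get_frag`, `len`), so the fields and bodies are assumptions rather than the PR's actual definition:

```rust
use bit_set::BitSet;

// Hypothetical skeleton inferred from how assembly.rs uses the type; the real
// implementation (DAG construction, clique handoff, mask bookkeeping) lives in
// the crate's `matches` module.
pub struct Matches {
    frags: Vec<BitSet>,         // distinct subgraphs, addressed by fragment id
    pairs: Vec<(usize, usize)>, // non-overlapping isomorphic pairs of frag ids
}

impl Matches {
    /// Number of non-overlapping isomorphic subgraph pairs.
    pub fn len(&self) -> usize {
        self.pairs.len()
    }

    /// Resolve a fragment id from a match pair to its edge set.
    pub fn get_frag(&self, id: usize) -> &BitSet {
        &self.frags[id]
    }
}
```

At search time, `generate_matches` filters the stored pairs down to those usable from the current state, returning any extra fragmentation (`frags`), the surviving pairs (`valid_matches`), and the clique nodes alive at this state (`clique_subgraph`).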
Expand Down Expand Up @@ -193,21 +134,35 @@ fn fragments(mol: &Molecule, state: &[BitSet], h1: &BitSet, h2: &BitSet) -> Opti
Some(fragments)
}

fn _is_usable(state: &[BitSet], h1: &BitSet, h2: &BitSet, masks: &mut Vec<Vec<BitSet>>) -> bool {
let f1 = state.iter().enumerate().find(|(_, c)| h1.is_subset(c));
let f2 = state.iter().enumerate().find(|(_, c)| h2.is_subset(c));

if let (Some((i1, _)), Some((i2, _))) = (f1, f2) {
masks[h1.len() - 2][i1].union_with(h1);
masks[h1.len() - 2][i2].union_with(h2);
true
} else {
false
}
}

/// Recursive helper for [`index_search`], only public for benchmarking.
///
/// Inputs:
/// - `mol`: The molecule whose assembly index is being calculated.
/// - `matches`: The remaining non-overlapping isomorphic subgraph pairs.
/// - `graph`: If clique is enabled, the graph of compatible isomorphic subgraph paris.
/// - `subgraph`: A bitset with length of matches. Has a 1 for every match to be searched in this state.
/// - `removal_order`: TODO
/// - `state`: The current assembly state, i.e., a list of fragments.
/// - `state_index`: This assembly state's upper bound on the assembly index,
/// i.e., edges(mol) - 1 - [edges(subgraphs removed) - #(subgraphs removed)].
/// - `best_index`: The smallest assembly index for all assembly states so far.
/// - `largest_remove`: An upper bound on the size of fragments that can be
/// removed from this or any descendant assembly state.
/// - `bounds`: The list of bounding strategies to apply.
/// - `cache`: TODO
/// - `parallel_mode`: The parallelism mode for this state's match iteration.
/// - `kernel_mode`: The kernelization mode for this state.
///
/// Returns, from this assembly state and any of its descendents:
/// - `usize`: An updated upper bound on the assembly index. (Note: If this
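The upper-bound bookkeeping documented above is easy to sanity-check: removing one copy of a duplicated h-edge subgraph deletes h edges but costs one join step, so the bound drops by h - 1 per removal, matching the old code's `state_index - h1.len() + 1`. A minimal check:

```rust
/// Upper bound on the assembly index after a sequence of duplicate removals:
/// edges(mol) - 1 - [edges(subgraphs removed) - #(subgraphs removed)].
fn state_index_bound(mol_edges: usize, removed_sizes: &[usize]) -> usize {
    removed_sizes.iter().fold(mol_edges - 1, |bound, &h| bound - h + 1)
}

fn main() {
    // A 10-edge molecule starts at the trivial bound of 9 joins.
    assert_eq!(state_index_bound(10, &[]), 9);
    // Removing a duplicated 3-edge subgraph saves 3 - 1 = 2 steps.
    assert_eq!(state_index_bound(10, &[3]), 7);
    // A further duplicated 2-edge removal saves one more step.
    assert_eq!(state_index_bound(10, &[3, 2]), 6);
}
```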
@@ -217,39 +172,41 @@ fn fragments(mol: &Molecule, state: &[BitSet], h1: &BitSet, h2: &BitSet) -> Opti
 #[allow(clippy::too_many_arguments)]
 pub fn recurse_index_search(
     mol: &Molecule,
-    matches: &[(BitSet, BitSet)],
-    removal_order: Vec<usize>,
-    state: &[BitSet],
-    state_index: usize,
+    matches: &Matches,
+    state: &State,
     best_index: Arc<AtomicUsize>,
-    largest_remove: usize,
     bounds: &[Bound],
-    cache: &mut Cache,
+    cache: &mut NewCache,
     parallel_mode: ParallelMode,
 ) -> (usize, usize) {
-    // If any bounds would prune this assembly state or if memoization is
-    // enabled and this assembly state is preempted by the cached state, halt.
-    if bound_exceeded(
-        mol,
-        state,
-        state_index,
-        best_index.load(Relaxed),
-        largest_remove,
-        bounds,
-    ) || cache.memoize_state(mol, state, state_index, &removal_order)
-    {
-        return (state_index, 1);
+    // If memoization is enabled and this assembly state is preempted by the
+    // cached state, halt.
+    if cache.memoize_state(mol, state) {
+        return (state.index(), 1);
     }
 
+    // Generate the matches available from this state. Generating them may
+    // fragment the state further (returned as `frags`); `valid_matches` lists
+    // the removable matches as pairs of fragment ids, and `clique_subgraph`
+    // holds the clique nodes present at this state.
+    let (frags, valid_matches, clique_subgraph) = matches.generate_matches(mol, state, best_index.load(Relaxed), bounds);
+
     // Keep track of the best assembly index found in any of this assembly
     // state's children and the number of states searched, including this one.
-    let best_child_index = AtomicUsize::from(state_index);
+    let best_child_index = AtomicUsize::from(state.index());
     let states_searched = AtomicUsize::from(1);
 
     // Define a closure that handles recursing to a new assembly state based on
     // the given (enumerated) pair of non-overlapping isomorphic subgraphs.
-    let recurse_on_match = |i: usize, h1: &BitSet, h2: &BitSet| {
-        if let Some(fragments) = fragments(mol, state, h1, h2) {
+    let recurse_on_match = |i: usize, v: (usize, usize)| {
+        // Resolve the pair's fragment ids to their edge sets.
+        let h1 = matches.get_frag(v.0);
+        let h2 = matches.get_frag(v.1);
+
+        // TODO: try keeping track of fragments to eliminate some search.
+        if let Some(fragments) = fragments(mol, &frags, h1, h2) {
             // If using depth-one parallelism, all descendant states should be
             // computed serially.
             let new_parallel = if parallel_mode == ParallelMode::DepthOne {
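`best_index` is shared across the whole search through an `Arc<AtomicUsize>`, while `best_child_index` is local to this state's children; both are read with `Relaxed` ordering. The elided remainder of this closure folds each child's result into those minima. A sketch of that pattern (an assumption, since the exact update lines are not shown in this hunk):

```rust
use std::sync::{
    atomic::{AtomicUsize, Ordering::Relaxed},
    Arc,
};

/// Publish one child's result into the local and global minima. `fetch_min`
/// atomically stores the smaller of the current and given values, so
/// concurrent children cannot overwrite each other's improvements.
fn report_child_result(
    child_index: usize,
    best_child_index: &AtomicUsize,
    best_index: &Arc<AtomicUsize>,
) {
    best_child_index.fetch_min(child_index, Relaxed);
    best_index.fetch_min(child_index, Relaxed);
}

fn main() {
    let best_index = Arc::new(AtomicUsize::new(9));
    let best_child_index = AtomicUsize::new(9);
    report_child_result(7, &best_child_index, &best_index);
    assert_eq!(best_index.load(Relaxed), 7);
    assert_eq!(best_child_index.load(Relaxed), 7);
}
```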
@@ -258,19 +215,14 @@ pub fn recurse_index_search(
                 parallel_mode
             };
 
+            // Apply the chosen match and build the child state.
+            let new_state = state.update(matches, fragments, &v, i, &clique_subgraph);
+
             // Recurse using the remaining matches and updated fragments.
             let (child_index, child_states_searched) = recurse_index_search(
                 mol,
-                &matches[i + 1..],
-                {
-                    let mut clone = removal_order.clone();
-                    clone.push(i);
-                    clone
-                },
-                &fragments,
-                state_index - h1.len() + 1,
+                matches,
+                &new_state,
                 best_index.clone(),
-                h1.len(),
                 bounds,
                 &mut cache.clone(),
                 new_parallel,
@@ -286,15 +238,15 @@
 
     // Use the iterator type corresponding to the specified parallelism mode.
     if parallel_mode == ParallelMode::None {
-        matches
+        valid_matches
             .iter()
             .enumerate()
-            .for_each(|(i, (h1, h2))| recurse_on_match(i, h1, h2));
+            .for_each(|(i, v)| recurse_on_match(i, *v));
     } else {
-        matches
+        valid_matches
             .par_iter()
             .enumerate()
-            .for_each(|(i, (h1, h2))| recurse_on_match(i, h1, h2));
+            .for_each(|(i, v)| recurse_on_match(i, *v));
     }
 
     (
@@ -368,44 +320,37 @@
 /// ```
 pub fn index_search(
     mol: &Molecule,
-    enumerate_mode: EnumerateMode,
+    _enumerate_mode: EnumerateMode,
     canonize_mode: CanonizeMode,
     parallel_mode: ParallelMode,
     memoize_mode: MemoizeMode,
-    kernel_mode: KernelMode,
+    _kernel_mode: KernelMode,
     bounds: &[Bound],
+    clique: bool,
 ) -> (u32, u32, usize) {
-    // Catch not-yet-implemented modes.
-    if kernel_mode != KernelMode::None {
-        panic!("The chosen --kernel mode is not implemented yet!")
-    }
-
     // Enumerate non-overlapping isomorphic subgraph pairs.
-    let matches = matches(mol, enumerate_mode, canonize_mode);
+    let matches = Matches::new(mol, canonize_mode, clique);
 
     // Create memoization cache.
-    let mut cache = Cache::new(memoize_mode, canonize_mode);
+    let mut cache = NewCache::new(memoize_mode);
 
-    // Initialize the first fragment as the entire graph.
-    let mut init = BitSet::new();
-    init.extend(mol.graph().edge_indices().map(|ix| ix.index()));
+    // Initialize the root assembly state.
+    let state = State::new(mol);
 
     // Search for the shortest assembly pathway recursively.
-    let edge_count = mol.graph().edge_count();
-    let best_index = Arc::new(AtomicUsize::from(edge_count - 1));
+    let best_index = Arc::new(AtomicUsize::from(mol.graph().edge_count() - 1));
     let (index, states_searched) = recurse_index_search(
         mol,
         &matches,
-        Vec::new(),
-        &[init],
-        edge_count - 1,
+        &state,
         best_index,
-        edge_count,
         bounds,
         &mut cache,
         parallel_mode,
     );
 
     (index as u32, matches.len() as u32, states_searched)
 }

Expand Down Expand Up @@ -437,6 +382,7 @@ pub fn index(mol: &Molecule) -> u32 {
MemoizeMode::CanonIndex,
KernelMode::None,
&[Bound::Int, Bound::VecSimple, Bound::VecSmallFrags],
false,
)
.0
}