diff --git a/src/assembly.rs b/src/assembly.rs index e8660f44..31b15275 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -28,7 +28,7 @@ use std::{ use bit_set::BitSet; use clap::ValueEnum; -use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator}; +use rayon::iter::{IntoParallelRefIterator, ParallelIterator}; use crate::{ bounds::{bound_exceeded, Bound}, @@ -37,6 +37,7 @@ use crate::{ kernels::KernelMode, molecule::Molecule, utils::connected_components_under_edges, + reductions::CompatGraph, }; /// Parallelization strategy for the search phase. @@ -190,21 +191,32 @@ fn fractures( #[allow(clippy::too_many_arguments)] fn recurse_index_search_serial( mol: &Molecule, - matches: &[(BitSet, BitSet)], fragments: &[BitSet], state_index: usize, - mut best_index: usize, - largest_remove: usize, + best_index: Arc, bounds: &[Bound], + matches_graph: &CompatGraph, + subgraph: BitSet, ) -> (usize, usize) { + let largest_remove = { + if let Some(v) = subgraph.iter().next() { + matches_graph.weight(v) + 1 + } + else { + return (state_index, 1); + } + }; + // If any bounds are exceeded, halt this search branch. if bound_exceeded( mol, fragments, state_index, - best_index, + best_index.load(Relaxed), largest_remove, bounds, + matches_graph, + &subgraph, ) { return (state_index, 1); } @@ -217,30 +229,32 @@ fn recurse_index_search_serial( // For every pair of duplicatable subgraphs compatible with the current set // of fragments, recurse using the fragments obtained by removing this pair // and adding one subgraph back. - for (i, (h1, h2)) in matches.iter().enumerate() { + for v in subgraph.iter() { + let (h1, h2) = matches_graph.matches(v); if let Some(fractures) = fractures(mol, fragments, h1, h2) { + let subgraph_clone = matches_graph.forward_neighbors(v, &subgraph); + // Recurse using the remaining matches and updated fragments. let (child_index, child_states_searched) = recurse_index_search_serial( mol, - &matches[i + 1..], &fractures, state_index - h1.len() + 1, - best_index, - h1.len(), + best_index.clone(), bounds, + matches_graph, + subgraph_clone, ); // Update the best assembly indices (across children states and // the entire search) and the number of descendant states searched. best_child_index = best_child_index.min(child_index); - best_index = best_index.min(best_child_index); + best_index.fetch_min(best_child_index, Relaxed); states_searched += child_states_searched; } } (best_child_index, states_searched) } - /// Recursive helper for the depth-one parallel version of index_search. /// /// Inputs: @@ -256,11 +270,12 @@ fn recurse_index_search_serial( #[allow(clippy::too_many_arguments)] fn recurse_index_search_depthone( mol: &Molecule, - matches: &[(BitSet, BitSet)], fragments: &[BitSet], state_index: usize, best_index: Arc, bounds: &[Bound], + matches_graph: &CompatGraph, + subgraph: BitSet, ) -> (usize, usize) { // Keep track of the number of states searched, including this one. let states_searched = Arc::new(AtomicUsize::from(1)); @@ -268,17 +283,20 @@ fn recurse_index_search_depthone( // For every pair of duplicatable subgraphs compatible with the current set // of fragments, recurse using the fragments obtained by removing this pair // and adding one subgraph back. - matches.par_iter().enumerate().for_each(|(i, (h1, h2))| { + subgraph.iter().collect::>().par_iter().for_each(|v| { + let (h1, h2) = matches_graph.matches(*v); if let Some(fractures) = fractures(mol, fragments, h1, h2) { + let subgraph_clone = matches_graph.forward_neighbors(*v, &subgraph); + // Recurse using the remaining matches and updated fragments. - let (child_index, child_states_searched) = recurse_index_search_depthone_helper( + let (child_index, child_states_searched) = recurse_index_search_serial( mol, - &matches[i + 1..], &fractures, state_index - h1.len() + 1, best_index.clone(), - h1.len(), bounds, + matches_graph, + subgraph_clone, ); // Update the best assembly indices (across children states and @@ -291,74 +309,6 @@ fn recurse_index_search_depthone( (best_index.load(Relaxed), states_searched.load(Relaxed)) } -/// Recursive helper for the depth-one parallel version of index_search. -/// -/// Inputs: -/// - `mol`: The molecule whose assembly index is being calculated. -/// - `matches`: The remaining non-overlapping isomorphic subgraph pairs. -/// - `fragments`: TODO -/// - `state_index`: The assembly index of this assembly state. -/// - `best_index`: The smallest assembly index for all assembly states so far. -/// - `largest_remove`: An upper bound on the size of fragments that can be -/// removed from this or any descendant state. -/// - `bounds`: The list of bounding strategies to apply. -/// -/// Returns, from this assembly state and any of its descendents: -/// - `usize`: The best assembly index found. -/// - `usize`: The number of assembly states searched. -#[allow(clippy::too_many_arguments)] -fn recurse_index_search_depthone_helper( - mol: &Molecule, - matches: &[(BitSet, BitSet)], - fragments: &[BitSet], - state_index: usize, - best_index: Arc, - largest_remove: usize, - bounds: &[Bound], -) -> (usize, usize) { - // If any bounds are exceeded, halt this search branch. - if bound_exceeded( - mol, - fragments, - state_index, - best_index.load(Relaxed), - largest_remove, - bounds, - ) { - return (state_index, 1); - } - - // Keep track of the best assembly index found in any of this assembly - // state's children and the number of states searched, including this one. - let mut best_child_index = state_index; - let mut states_searched = 1; - - // For every pair of duplicatable subgraphs compatible with the current set - // of fragments, recurse using the fragments obtained by removing this pair - // and adding one subgraph back. - for (i, (h1, h2)) in matches.iter().enumerate() { - if let Some(fractures) = fractures(mol, fragments, h1, h2) { - // Recurse using the remaining matches and updated fragments. - let (child_index, child_states_searched) = recurse_index_search_depthone_helper( - mol, - &matches[i + 1..], - &fractures, - state_index - h1.len() + 1, - best_index.clone(), - h1.len(), - bounds, - ); - - // Update the best assembly indices (across children states and - // the entire search) and the number of descendant states searched. - best_child_index = best_child_index.min(child_index); - best_index.fetch_min(best_child_index, Relaxed); - states_searched += child_states_searched; - } - } - - (best_child_index, states_searched) -} /// Recursive helper for the parallel version of index_search. /// @@ -377,12 +327,13 @@ fn recurse_index_search_depthone_helper( #[allow(clippy::too_many_arguments)] fn recurse_index_search_parallel( mol: &Molecule, - matches: &[(BitSet, BitSet)], fragments: &[BitSet], state_index: usize, best_index: Arc, largest_remove: usize, bounds: &[Bound], + matches_graph: &CompatGraph, + subgraph: BitSet, ) -> (usize, usize) { // If any bounds are exceeded, halt this search branch. if bound_exceeded( @@ -392,6 +343,8 @@ fn recurse_index_search_parallel( best_index.load(Relaxed), largest_remove, bounds, + matches_graph, + &subgraph, ) { return (state_index, 1); } @@ -404,17 +357,21 @@ fn recurse_index_search_parallel( // For every pair of duplicatable subgraphs compatible with the current set // of fragments, recurse using the fragments obtained by removing this pair // and adding one subgraph back. - matches.par_iter().enumerate().for_each(|(i, (h1, h2))| { + subgraph.iter().collect::>().par_iter().for_each(|v| { + let (h1, h2) = matches_graph.matches(*v); if let Some(fractures) = fractures(mol, fragments, h1, h2) { + let subgraph_clone = matches_graph.forward_neighbors(*v, &subgraph); + // Recurse using the remaining matches and updated fragments. let (child_index, child_states_searched) = recurse_index_search_parallel( mol, - &matches[i + 1..], &fractures, state_index - h1.len() + 1, best_index.clone(), h1.len(), bounds, + matches_graph, + subgraph_clone, ); // Update the best assembly indices (across children states and @@ -509,6 +466,11 @@ pub fn index_search( // Enumerate non-overlapping isomorphic subgraph pairs. let matches = matches(mol, enumerate_mode, canonize_mode).collect::>(); + let matches_graph = CompatGraph::new(matches); + let mut subgraph = BitSet::with_capacity(matches_graph.len()); + for i in 0..matches_graph.len() { + subgraph.insert(i); + } // Initialize the first fragment as the entire graph. let mut init = BitSet::new(); @@ -518,41 +480,46 @@ pub fn index_search( // Search for the shortest assembly pathway recursively. let (index, states_searched) = match parallel_mode { - ParallelMode::None => recurse_index_search_serial( - mol, - &matches, - &[init], - edge_count - 1, - edge_count - 1, - edge_count, - bounds, - ), + ParallelMode::None => { + let best_index = Arc::new(AtomicUsize::from(edge_count - 1)); + recurse_index_search_serial( + mol, + &[init], + edge_count - 1, + best_index, + bounds, + &matches_graph, + subgraph, + ) + } ParallelMode::DepthOne => { let best_index = Arc::new(AtomicUsize::from(edge_count - 1)); recurse_index_search_depthone( mol, - &matches, &[init], edge_count - 1, best_index, bounds, + &matches_graph, + subgraph, ) } ParallelMode::Always => { let best_index = Arc::new(AtomicUsize::from(edge_count - 1)); recurse_index_search_parallel( mol, - &matches, &[init], edge_count - 1, best_index, edge_count, bounds, + &matches_graph, + subgraph, ) } }; - (index as u32, matches.len() as u32, states_searched) + (index as u32, matches_graph.len() as u32, states_searched) } /// Computes a molecule's assembly index using an efficient default strategy. diff --git a/src/bounds.rs b/src/bounds.rs index c3b516e6..c694ef6e 100644 --- a/src/bounds.rs +++ b/src/bounds.rs @@ -1,10 +1,12 @@ -//! Bounds for identifying assembly states (i.e., collections of fragments) -//! from which no further assembly index improvements can be found. +//! TODO use bit_set::BitSet; use clap::ValueEnum; -use crate::molecule::{Bond, Element, Molecule}; +use crate::{ + molecule::{Bond, Element, Molecule}, + reductions::{CompatGraph}, +}; /// Bounding strategies for the search phase of assembly index calcluation. #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)] @@ -39,24 +41,24 @@ struct EdgeType { pub fn bound_exceeded( mol: &Molecule, fragments: &[BitSet], - state_index: usize, - best_index: usize, + ix: usize, + best: usize, largest_remove: usize, bounds: &[Bound], + matches_graph: &CompatGraph, + subgraph: &BitSet, ) -> bool { for bound_type in bounds { let exceeds = match bound_type { - Bound::Log => state_index - log_bound(fragments) >= best_index, - Bound::Int => state_index - int_bound(fragments, largest_remove) >= best_index, - Bound::VecSimple => { - state_index - vec_simple_bound(fragments, largest_remove, mol) >= best_index - } + Bound::Log => ix - log_bound(fragments) >= best, + Bound::Int => ix - int_bound(fragments, largest_remove) >= best, + Bound::VecSimple => ix - vec_simple_bound(fragments, largest_remove, mol) >= best, Bound::VecSmallFrags => { - state_index - vec_small_frags_bound(fragments, largest_remove, mol) >= best_index - } - _ => { - panic!("One of the chosen bounds is not implemented yet!") + ix - vec_small_frags_bound(fragments, largest_remove, mol) >= best } + Bound::CliqueBudget => ix - clique_budget_bound(matches_graph, subgraph, fragments) >= best, + Bound::CoverNoSort => ix - cover_bound(matches_graph, subgraph, false) >= best, + Bound::CoverSort => ix - cover_bound(matches_graph, subgraph, true) >= best, }; if exceeds { return true; @@ -211,3 +213,194 @@ pub fn vec_small_frags_bound(fragments: &[BitSet], m: usize, mol: &Molecule) -> s - (z + size_two_types.len() + size_two_fragments.len()) - ((sl - z) as f32 / m as f32).ceil() as usize } + +pub fn remaining_weight_bound(graph: &CompatGraph, subgraph: &BitSet) -> usize { + let deg_sum = subgraph.iter().map(|v| graph.degree(v, subgraph)).sum::() as f32; + let max_clique = ((1_f32 + (4_f32 * deg_sum + 1_f32).sqrt()) / 2_f32).floor() as usize; + let mut sum = 0; + let mut iter = subgraph.iter(); + for _ in 0..max_clique { + sum += graph.weight(iter.next().unwrap()); + }; + + sum +} + +pub fn color_bound(graph: &CompatGraph, subgraph: &BitSet) -> usize{ + // Greedy coloring + let mut colors: Vec = vec![-1; graph.len()]; + let mut num_colors = 0; + let mut largest: Vec = Vec::new(); + + + for v in (0..graph.len()).rev() { + if !subgraph.contains(v) { + continue; + } + + let mut used: Vec = vec![0; num_colors]; + + for u in subgraph.intersection(graph.compatible_with(v)) { + if colors[u] != -1 { + used[colors[u] as usize] = 1; + } + } + + let mut max = 0; + let mut max_idx = num_colors; + for i in 0..num_colors { + if used[i] == 0 && largest[i] > max { + max = largest[i]; + max_idx = i; + } + } + + if max_idx == num_colors { + num_colors += 1; + largest.push(0); + } + if graph.weight(v) > largest[max_idx] { + largest[max_idx] = graph.weight(v); + } + + colors[v] = max_idx as i32; + } + + largest.iter().sum::() +} + +pub fn cover_bound(graph: &CompatGraph, subgraph: &BitSet, sort: bool) -> usize { + // Sort vertices + if sort { + let mut vertices: Vec<(usize, usize)> = Vec::with_capacity(subgraph.len()); + for v in subgraph { + vertices.push((v, graph.degree(v, subgraph))); + } + vertices.sort_by(|a, b| b.1.cmp(&a.1)); + cover_bound_helper(graph, subgraph, vertices.iter().map(|(v, _)| *v)) + } + else { + let vertices = (0..graph.len()).rev().filter(|v| subgraph.contains(*v)); + cover_bound_helper(graph, subgraph, vertices) + } +} + +fn cover_bound_helper(graph: &CompatGraph, subgraph: &BitSet, iter: impl Iterator) -> usize { + let mut colors: Vec>> = vec![None; graph.len()]; + let mut col_weights = vec![]; + let mut num_col = 0; + + for v in iter { + let mut v_col = Vec::new(); + let mut used = vec![0; num_col]; + + // Find colors used in neighborhood of v + for u in subgraph.intersection(graph.compatible_with(v)) { + let Some(u_col) = &colors[u] else { + continue; + }; + + for c in u_col { + used[*c] = 1; + } + } + + let mut total_weight = 0; + let v_val = graph.weight(v); + // Find colors to give to v + for c in 0..num_col { + if used[c] == 1 { + continue; + } + + v_col.push(c); + total_weight += col_weights[c]; + + if total_weight >= v_val { + break; + } + } + + if total_weight == 0 { + v_col.push(num_col); + col_weights.push(v_val); + num_col += 1 + } + else if total_weight < v_val { + let mut k = num_col - 1; + while used[k] == 1 { + k -= 1 + } + col_weights[k] += v_val - total_weight + } + + colors[v] = Some(v_col); + }; + + col_weights.iter().sum() +} + +pub fn clique_budget_bound(graph: &CompatGraph, subgraph: &BitSet, fragments: &[BitSet]) -> usize { + let total_bonds = fragments.iter().map(|x| x.len()).sum::(); + let mut bound = 0; + let sizes = { + let mut vec = vec![]; + let mut prev = 0; + for s in subgraph.iter().map(|v| graph.weight(v) + 1) { + if s != prev { + vec.push(s); + prev = s; + } + } + + vec + }; + + for i in sizes { + let mut bound_temp = 0; + let mut has_bonds = fragments.len(); + let mut num_bonds: Vec = fragments.iter().map(|x| x.len()).collect(); + let mut smallest_remove = i; + + for v in subgraph.iter() { + if has_bonds == 0 { + break; + } + if graph.weight(v) + 1 > i { + continue; + } + + + let dup = &graph.matches(v); + let bond = dup.1.iter().next().unwrap(); + let mut j = 0; + while !fragments[j].contains(bond) { + j += 1; + } + + if num_bonds[j] > 0 { + let remove = std::cmp::min(dup.0.len(), num_bonds[j]); + bound_temp += 1; + num_bonds[j] -= remove; + smallest_remove = std::cmp::min(smallest_remove, remove); + + if num_bonds[j] == 0 { + has_bonds -= 1; + } + } + } + + let leftover = num_bonds.iter().sum::(); + let log = { + if leftover > 0 { + 0 + } + else { + (smallest_remove as f32).log2().ceil() as usize + } + }; + bound = std::cmp::max(bound, total_bonds - bound_temp - leftover - log); + } + + bound +} diff --git a/src/lib.rs b/src/lib.rs index 18a3077f..306f8b52 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -55,6 +55,9 @@ pub mod kernels; // Utility functions mod utils; +// Reductions to other problem instances +pub mod reductions; + // Python library #[cfg(feature = "python")] pub mod python; diff --git a/src/main.rs b/src/main.rs index 1f723c31..efecbb99 100644 --- a/src/main.rs +++ b/src/main.rs @@ -94,7 +94,7 @@ fn main() -> Result<()> { // Handle bounding strategy CLI arguments. let boundlist: &[Bound] = match cli.boundsgroup { // By default, use a combination of the integer and vector bounds. - None => &[Bound::Int, Bound::VecSimple, Bound::VecSmallFrags], + None => &[Bound::Int, Bound::VecSimple, Bound::VecSmallFrags, Bound::CliqueBudget, Bound::CoverNoSort], // If --no-bounds is set, do not use any bounds. Some(BoundsGroup { no_bounds: true, .. diff --git a/src/reductions.rs b/src/reductions.rs new file mode 100644 index 00000000..aa4ee875 --- /dev/null +++ b/src/reductions.rs @@ -0,0 +1,121 @@ +use bit_set::BitSet; + +pub struct CompatGraph { + graph: Vec, + weights: Vec, + matches: Vec<(BitSet, BitSet)> +} + +impl CompatGraph { + pub fn new(init_matches: Vec<(BitSet, BitSet)>) -> Self { + let size = init_matches.len(); + + // Initialize weights and empty graph + let mut init_graph: Vec = Vec::with_capacity(size); + let mut init_weights: Vec = Vec::with_capacity(size); + for m in init_matches.iter() { + init_graph.push(BitSet::with_capacity(size)); + init_weights.push(m.0.len() - 1); + } + + // Populate graph + for (idx1, (h1, h2)) in init_matches.iter().enumerate() { + for (idx2, (h1p, h2p)) in init_matches[idx1 + 1..].iter().enumerate() { + let idx2 = idx2 + idx1 + 1; + + let forward_compatible = { + h2.is_disjoint(h1p) && + h2.is_disjoint(h2p) && + (h1.is_disjoint(h1p) || h1.is_superset(h1p)) && + (h1.is_disjoint(h2p) || h1.is_superset(h2p)) + }; + + if forward_compatible { + init_graph[idx1].insert(idx2); + init_graph[idx2].insert(idx1); + } + } + } + + Self { + graph: init_graph, + weights: init_weights, + matches: init_matches, + } + } + + pub fn weight(&self, v: usize) -> usize { + self.weights[v] + } + + pub fn matches(&self, v: usize) -> &(BitSet, BitSet) { + &self.matches[v] + } + + pub fn len(&self) -> usize { + self.graph.len() + } + + pub fn degree(&self, v: usize, subgraph: &BitSet) -> usize { + self.graph[v].intersection(subgraph).count() + } + + pub fn compatible_with(&self, v: usize) -> &BitSet { + &self.graph[v] + } + + pub fn neighbors(&self, v: usize, subgraph: &BitSet) -> BitSet { + let mut neighbors = self.graph[v].clone(); + neighbors.intersect_with(subgraph); + + neighbors + } + + pub fn forward_neighbors(&self, v: usize, subgraph: &BitSet) -> BitSet { + let mut neighbors = self.graph[v].clone(); + neighbors.intersect_with(subgraph); + let mut to_remove = vec![]; + for u in neighbors.iter() { + if u <= v { + to_remove.push(u); + } + if u > v { + break; + } + } + for u in to_remove { + neighbors.remove(u); + } + + neighbors + } + + pub fn are_adjacent(&self, v: usize, u: usize) -> bool { + self.graph[v].contains(u) + } + + pub fn savings_ground_truth(&self, subgraph: &BitSet) -> usize { + self.savings_ground_truth_recurse(0, 0, subgraph) + } + + fn savings_ground_truth_recurse(&self, ix: usize, mut best: usize, subgraph: &BitSet) -> usize { + if subgraph.len() == 0 { + return ix; + } + let mut cx = ix; + + // Search for duplicatable fragment + for v in subgraph.iter() { + let subgraph_clone = self.forward_neighbors(v, &subgraph); + + cx = cx.max(self.savings_ground_truth_recurse( + ix + self.weights[v], + best, + &subgraph_clone, + )); + best = best.max(cx); + } + + cx + } +} \ No newline at end of file