diff --git a/Cargo.lock b/Cargo.lock
index 372210a9..d1c16b66 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -77,7 +77,9 @@ dependencies = [
  "criterion",
  "csv",
  "dashmap",
+ "fxhash",
  "graph-canon",
+ "itertools 0.14.0",
  "petgraph",
  "pyo3",
  "rayon",
@@ -168,6 +170,12 @@ version = "3.19.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43"

+[[package]]
+name = "byteorder"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
+
 [[package]]
 name = "cast"
 version = "0.3.0"
@@ -301,7 +309,7 @@ dependencies = [
  "clap 2.34.0",
  "criterion-plot",
  "csv",
- "itertools",
+ "itertools 0.10.5",
  "lazy_static",
  "num-traits",
  "oorandom",
@@ -323,7 +331,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2673cc8207403546f45f5fd319a974b1e6983ad1a3ee7e6041650013be041876"
 dependencies = [
  "cast",
- "itertools",
+ "itertools 0.10.5",
 ]

 [[package]]
@@ -433,6 +441,15 @@ version = "2.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c"

+[[package]]
+name = "fxhash"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
+dependencies = [
+ "byteorder",
+]
+
 [[package]]
 name = "glob"
 version = "0.3.2"
@@ -544,6 +561,15 @@ dependencies = [
  "either",
 ]

+[[package]]
+name = "itertools"
+version = "0.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285"
+dependencies = [
+ "either",
+]
+
 [[package]]
 name = "itoa"
 version = "1.0.15"
diff --git a/Cargo.toml b/Cargo.toml
index f056c248..c24dc767 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -18,6 +18,8 @@ petgraph = "0.6.5"
 pyo3 = { version = "0.24.1", features = ["abi3-py310", "extension-module"]}
 rayon = "1.10.0"
 dashmap = "6.1.0"
+fxhash = "0.2.1"
+itertools = "0.14.0"

 [dev-dependencies]
 criterion = "0.3"
diff --git a/src/assembly.rs b/src/assembly.rs
index 79aa3f9a..315e84fc 100644
--- a/src/assembly.rs
+++ b/src/assembly.rs
@@ -19,25 +19,18 @@
 //! ```

 use std::{
-    collections::HashMap,
     sync::{
         atomic::{AtomicUsize, Ordering::Relaxed},
         Arc,
-    },
+    },
+    time::{Duration, Instant},
 };

 use bit_set::BitSet;
 use clap::ValueEnum;
 use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator};

 use crate::{
-    bounds::{bound_exceeded, Bound},
-    canonize::{canonize, CanonizeMode, Labeling},
-    enumerate::{enumerate_subgraphs, EnumerateMode},
-    kernels::KernelMode,
-    memoize::{Cache, MemoizeMode},
-    molecule::Molecule,
-    utils::connected_components_under_edges,
+    bounds::Bound, canonize::CanonizeMode, enumerate::EnumerateMode, kernels::KernelMode,
+    matches::Matches, memoize::{MemoizeMode, NewCache}, molecule::Molecule, state::State,
+    utils::connected_components_under_edges,
 };

 /// Parallelization strategy for the recursive search phase.
@@ -94,58 +87,6 @@ pub fn depth(mol: &Molecule) -> u32 {
     ix
 }

-/// Helper function for [`index_search`]; only public for benchmarking.
-///
-/// Return all pairs of non-overlapping, isomorphic subgraphs in the molecule,
-/// sorted to guarantee deterministic iteration.
-pub fn matches(
-    mol: &Molecule,
-    enumerate_mode: EnumerateMode,
-    canonize_mode: CanonizeMode,
-) -> Vec<(BitSet, BitSet)> {
-    // Enumerate all connected, non-induced subgraphs with at most |E|/2 edges
-    // and bin them into isomorphism classes using canonization.
-    let mut isomorphism_classes = HashMap::<Labeling, Vec<BitSet>>::new();
-    for subgraph in enumerate_subgraphs(mol, enumerate_mode) {
-        isomorphism_classes
-            .entry(canonize(mol, &subgraph, canonize_mode))
-            .and_modify(|bucket| bucket.push(subgraph.clone()))
-            .or_insert(vec![subgraph.clone()]);
-    }
-
-    // In each isomorphism class, collect non-overlapping pairs of subgraphs.
-    let mut matches = Vec::new();
-    for bucket in isomorphism_classes.values() {
-        for (i, first) in bucket.iter().enumerate() {
-            for second in &bucket[i..] {
-                if first.is_disjoint(second) {
-                    if first > second {
-                        matches.push((first.clone(), second.clone()));
-                    } else {
-                        matches.push((second.clone(), first.clone()));
-                    }
-                }
-            }
-        }
-    }
-
-    // Sort pairs in a deterministic order and return.
-    matches.sort_by(|e1, e2| {
-        let ord = [
-            e2.0.len().cmp(&e1.0.len()), // Decreasing subgraph size.
-            e1.0.cmp(&e2.0),             // First subgraph lexicographical.
-            e1.1.cmp(&e2.1),             // Second subgraph lexicographical.
-        ];
-        let mut i = 0;
-        while ord[i] == std::cmp::Ordering::Equal {
-            i += 1;
-        }
-        ord[i]
-    });
-
-    matches
-}
-
 /// Determine the fragments produced from the given assembly state by removing
 /// the given pair of non-overlapping, isomorphic subgraphs and then adding one
 /// back; return `None` if not possible.
@@ -193,21 +134,35 @@ fn fragments(mol: &Molecule, state: &[BitSet], h1: &BitSet, h2: &BitSet) -> Opti
     Some(fragments)
 }

+/// Record the usability of the pair (h1, h2) in the edge masks: if both halves
+/// are contained in fragments of the state, their edges are OR-ed into the
+/// masks of their host fragments at this subgraph size.
+fn _is_usable(state: &[BitSet], h1: &BitSet, h2: &BitSet, masks: &mut Vec<Vec<BitSet>>) -> bool {
+    let f1 = state.iter().enumerate().find(|(_, c)| h1.is_subset(c));
+    let f2 = state.iter().enumerate().find(|(_, c)| h2.is_subset(c));
+
+    if let (Some((i1, _)), Some((i2, _))) = (f1, f2) {
+        masks[h1.len() - 2][i1].union_with(h1);
+        masks[h1.len() - 2][i2].union_with(h2);
+        true
+    } else {
+        false
+    }
+}
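// Editor's sketch (illustrative; not part of the patch): how removing one
// non-overlapping isomorphic pair transforms an assembly state. Only
// bit_set::BitSet is assumed; `bits` is a hypothetical helper.

use bit_set::BitSet;

fn bits(ixs: &[usize]) -> BitSet {
    let mut b = BitSet::new();
    b.extend(ixs.iter().copied());
    b
}

fn main() {
    // One fragment covering edges 0..=5 of the molecule.
    let state = vec![bits(&[0, 1, 2, 3, 4, 5])];
    // A candidate pair: h1 = {0, 1} and h2 = {3, 4} must be disjoint.
    let (h1, h2) = (bits(&[0, 1]), bits(&[3, 4]));
    assert!(h1.is_disjoint(&h2) && h1.is_subset(&state[0]) && h2.is_subset(&state[0]));
    // `fragments` removes both copies and adds h1 back, so the successor state
    // holds {0, 1} plus the connected components of the remainder {2, 5}, and
    // the state index drops by |h1| - 1 = 1.
}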
 /// Recursive helper for [`index_search`], only public for benchmarking.
 ///
 /// Inputs:
 /// - `mol`: The molecule whose assembly index is being calculated.
-/// - `matches`: The remaining non-overlapping isomorphic subgraph pairs.
-/// - `removal_order`: TODO
-/// - `state`: The current assembly state, i.e., a list of fragments.
-/// - `state_index`: This assembly state's upper bound on the assembly index,
-///   i.e., edges(mol) - 1 - [edges(subgraphs removed) - #(subgraphs removed)].
+/// - `matches`: The store of non-overlapping isomorphic subgraph pairs and the
+///   DAG of duplicatable fragments they are built from.
+/// - `state`: The current assembly state: its fragments, its upper bound on
+///   the assembly index (i.e., edges(mol) - 1 - [edges(subgraphs removed) -
+///   #(subgraphs removed)]), its removal order, and its clique subgraph, if any.
 /// - `best_index`: The smallest assembly index for all assembly states so far.
-/// - `largest_remove`: An upper bound on the size of fragments that can be
-///   removed from this or any descendant assembly state.
 /// - `bounds`: The list of bounding strategies to apply.
-/// - `cache`: TODO
+/// - `cache`: The memoization cache of previously visited assembly states.
 /// - `parallel_mode`: The parallelism mode for this state's match iteration.
 ///
 /// Returns, from this assembly state and any of its descendants:
 /// - `usize`: An updated upper bound on the assembly index. (Note: If this
@@ -217,39 +172,41 @@ fn fragments(mol: &Molecule, state: &[BitSet], h1: &BitSet, h2: &BitSet) -> Opti
 #[allow(clippy::too_many_arguments)]
 pub fn recurse_index_search(
     mol: &Molecule,
-    matches: &[(BitSet, BitSet)],
-    removal_order: Vec<usize>,
-    state: &[BitSet],
-    state_index: usize,
+    matches: &Matches,
+    state: &State,
     best_index: Arc<AtomicUsize>,
-    largest_remove: usize,
     bounds: &[Bound],
-    cache: &mut Cache,
+    cache: &mut NewCache,
     parallel_mode: ParallelMode,
 ) -> (usize, usize) {
-    // If any bounds would prune this assembly state or if memoization is
-    // enabled and this assembly state is preempted by the cached state, halt.
-    if bound_exceeded(
-        mol,
-        state,
-        state_index,
-        best_index.load(Relaxed),
-        largest_remove,
-        bounds,
-    ) || cache.memoize_state(mol, state, state_index, &removal_order)
-    {
-        return (state_index, 1);
+    // If memoization is enabled and this assembly state is preempted by the
+    // cached state, halt.
+    if cache.memoize_state(mol, state) {
+        return (state.index(), 1);
     }

+    // Generate this state's matches. Generating them may fragment the state
+    // further (returned as `frags`); `valid_matches` lists the removable pairs
+    // as (fragment id, fragment id); `clique_subgraph` holds the compatibility
+    // graph's vertices live in this state, if the clique reduction is enabled.
+    let (frags, valid_matches, clique_subgraph) =
+        matches.generate_matches(mol, state, best_index.load(Relaxed), bounds);
+
     // Keep track of the best assembly index found in any of this assembly
     // state's children and the number of states searched, including this one.
-    let best_child_index = AtomicUsize::from(state_index);
+    let best_child_index = AtomicUsize::from(state.index());
     let states_searched = AtomicUsize::from(1);

     // Define a closure that handles recursing to a new assembly state based on
     // the given (enumerated) pair of non-overlapping isomorphic subgraphs.
-    let recurse_on_match = |i: usize, h1: &BitSet, h2: &BitSet| {
-        if let Some(fragments) = fragments(mol, state, h1, h2) {
+    let recurse_on_match = |i: usize, v: (usize, usize)| {
+        let h1 = matches.get_frag(v.0);
+        let h2 = matches.get_frag(v.1);
+
+        // TODO: try keeping track of the host fragment to eliminate some search.
+        if let Some(fragments) = fragments(mol, &frags, h1, h2) {
             // If using depth-one parallelism, all descendant states should be
             // computed serially.
             let new_parallel = if parallel_mode == ParallelMode::DepthOne {
@@ -258,19 +215,14 @@ pub fn recurse_index_search(
                 parallel_mode
             };

+            let new_state = state.update(matches, fragments, &v, i, &clique_subgraph);
+
             // Recurse using the remaining matches and updated fragments.
             let (child_index, child_states_searched) = recurse_index_search(
                 mol,
-                &matches[i + 1..],
-                {
-                    let mut clone = removal_order.clone();
-                    clone.push(i);
-                    clone
-                },
-                &fragments,
-                state_index - h1.len() + 1,
+                matches,
+                &new_state,
                 best_index.clone(),
-                h1.len(),
                 bounds,
                 &mut cache.clone(),
                 new_parallel,
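// Editor's sketch (illustrative; not part of the patch): the serial/parallel
// dispatch below drives one closure with either a plain iterator or rayon's
// par_iter. Only rayon is assumed; `work` stands in for `recurse_on_match`.
use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator};

fn for_each_match<F>(matches: &[(usize, usize)], parallel: bool, work: F)
where
    F: Fn(usize, (usize, usize)) + Sync,
{
    if parallel {
        matches.par_iter().enumerate().for_each(|(i, v)| work(i, *v));
    } else {
        matches.iter().enumerate().for_each(|(i, v)| work(i, *v));
    }
}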
@@ -286,15 +238,15 @@ pub fn recurse_index_search(
     // Use the iterator type corresponding to the specified parallelism mode.
     if parallel_mode == ParallelMode::None {
-        matches
+        valid_matches
             .iter()
             .enumerate()
-            .for_each(|(i, (h1, h2))| recurse_on_match(i, h1, h2));
+            .for_each(|(i, v)| recurse_on_match(i, *v));
     } else {
-        matches
+        valid_matches
             .par_iter()
             .enumerate()
-            .for_each(|(i, (h1, h2))| recurse_on_match(i, h1, h2));
+            .for_each(|(i, v)| recurse_on_match(i, *v));
     }

     (
@@ -368,44 +320,37 @@
 /// ```
 pub fn index_search(
     mol: &Molecule,
-    enumerate_mode: EnumerateMode,
+    _enumerate_mode: EnumerateMode,
     canonize_mode: CanonizeMode,
     parallel_mode: ParallelMode,
     memoize_mode: MemoizeMode,
-    kernel_mode: KernelMode,
+    _kernel_mode: KernelMode,
     bounds: &[Bound],
+    clique: bool,
 ) -> (u32, u32, usize) {
-    // Catch not-yet-implemented modes.
-    if kernel_mode != KernelMode::None {
-        panic!("The chosen --kernel mode is not implemented yet!")
-    }
-
     // Enumerate non-overlapping isomorphic subgraph pairs.
-    let matches = matches(mol, enumerate_mode, canonize_mode);
+    let matches = Matches::new(mol, canonize_mode, clique);

     // Create memoization cache.
-    let mut cache = Cache::new(memoize_mode, canonize_mode);
+    let mut cache = NewCache::new(memoize_mode);

-    // Initialize the first fragment as the entire graph.
-    let mut init = BitSet::new();
-    init.extend(mol.graph().edge_indices().map(|ix| ix.index()));
+    // Initialize the state with the whole molecule as one fragment.
+    let state = State::new(mol);

     // Search for the shortest assembly pathway recursively.
-    let edge_count = mol.graph().edge_count();
-    let best_index = Arc::new(AtomicUsize::from(edge_count - 1));
+    let best_index = Arc::new(AtomicUsize::from(mol.graph().edge_count() - 1));
     let (index, states_searched) = recurse_index_search(
         mol,
         &matches,
-        Vec::new(),
-        &[init],
-        edge_count - 1,
+        &state,
         best_index,
-        edge_count,
         bounds,
         &mut cache,
         parallel_mode,
     );

     (index as u32, matches.len() as u32, states_searched)
 }
@@ -437,6 +382,7 @@ pub fn index(mol: &Molecule) -> u32 {
         MemoizeMode::CanonIndex,
         KernelMode::None,
         &[Bound::Int, Bound::VecSimple, Bound::VecSmallFrags],
+        false,
     )
     .0
 }
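// Editor's usage sketch (illustrative; not part of the patch): calling the new
// entry point with the clique reduction enabled. Loading `mol` is elided, and
// the bound list is one plausible choice rather than a prescribed default.
//
//     let (index, num_matches, states_searched) = index_search(
//         &mol,
//         enumerate_mode,          // ignored by the Matches-based search
//         CanonizeMode::TreeNauty,
//         ParallelMode::DepthOne,
//         MemoizeMode::CanonIndex,
//         KernelMode::None,
//         &[Bound::Int, Bound::VecSimple, Bound::CliqueBudget],
//         true,                    // enable the clique reduction
//     );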
diff --git a/src/bounds.rs b/src/bounds.rs
index 5cff5ec5..e13b097d 100644
--- a/src/bounds.rs
+++ b/src/bounds.rs
@@ -14,7 +14,10 @@
 use bit_set::BitSet;
 use clap::ValueEnum;

-use crate::molecule::{Bond, Element, Molecule};
+use crate::{
+    molecule::{Bond, Element, Molecule},
+    reductions::CompatGraph,
+};

 /// Type of upper bound on the "savings" possible from an assembly state.
 #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
@@ -66,24 +69,43 @@ struct EdgeType {

 /// Returns `true` iff any of the given bounds would prune this assembly state.
 pub fn bound_exceeded(
     mol: &Molecule,
+    matches: &Vec<(BitSet, BitSet)>,
+    graph: &Option<CompatGraph>,
     fragments: &[BitSet],
+    subgraph: &BitSet,
     state_index: usize,
     best_index: usize,
     largest_remove: usize,
     bounds: &[Bound],
+    masks: &Vec<Vec<BitSet>>,
 ) -> bool {
     for bound_type in bounds {
         let exceeds = match bound_type {
             Bound::Log => state_index - log_bound(fragments) >= best_index,
-            Bound::Int => state_index - int_bound(fragments, largest_remove) >= best_index,
+            Bound::Int => best_index + int_bound_new(masks) <= state_index,
             Bound::VecSimple => {
                 state_index - vec_simple_bound(fragments, largest_remove, mol) >= best_index
             }
             Bound::VecSmallFrags => {
                 state_index - vec_small_frags_bound(fragments, largest_remove, mol) >= best_index
             }
-            _ => {
-                panic!("One of the chosen bounds is not implemented yet!")
+            Bound::CliqueBudget => {
+                best_index + clique_budget_bound(matches, subgraph, fragments) <= state_index
+            }
+            Bound::CoverNoSort => {
+                if let Some(g) = graph {
+                    best_index + cover_bound(matches, g, subgraph, false) <= state_index
+                } else {
+                    false
+                }
+            }
+            Bound::CoverSort => {
+                if let Some(g) = graph {
+                    best_index + cover_bound(matches, g, subgraph, true) <= state_index
+                } else {
+                    false
+                }
             }
         };
         if exceeds {
@@ -104,7 +126,7 @@ fn log_bound(fragments: &[BitSet]) -> usize {
 }

 /// TODO
-fn int_bound(fragments: &[BitSet], m: usize) -> usize {
+fn _int_bound(fragments: &[BitSet], m: usize) -> usize {
     let mut max_s: usize = 0;
     let mut frag_sizes: Vec<usize> = Vec::new();
@@ -129,6 +151,74 @@
     max_s
 }

+/// Integer bound computed from the match masks: `masks[k][j]` is the union of
+/// all still-matchable subgraphs of size k + 2 inside the j-th fragment, so
+/// `masks[0][j]` covers every matchable edge of fragment j.
+fn int_bound_new(masks: &Vec<Vec<BitSet>>) -> usize {
+    let mut max = 0;
+    for (i, list) in masks.iter().enumerate() {
+        let mut val = 0;
+        let i = i + 2; // Removal size represented by this mask level.
+        for (j, frag) in list.iter().enumerate() {
+            let mf = masks[0][j].len(); // Matchable edges in fragment j...
+            let mi = frag.len();        // ...of which this size class covers mi.
+            let x = mf + (mi % i) - mi; // Edges not removable in size-i chunks.
+            val += mf - (mi / i) - x / (i - 1) - (x % (i - 1) != 0) as usize;
+        }
+        val -= (i as f32).log2().ceil() as usize;
+        max = max.max(val);
+    }
+
+    max
+}
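// Editor's worked check of `int_bound_new` (illustrative; not part of the
// patch). With a single fragment and only the size-2 mask level (i = 2),
// suppose masks[0][0] covers mf = 8 matchable edges, so mi = 8 as well. Then
// x = mf + (mi % 2) - mi = 0 and val = 8 - 8/2 - 0/1 - 0 = 4, minus
// ceil(log2(2)) = 1, for a savings bound of 3.
//
//     let mut mask = BitSet::new();
//     mask.extend(0..8);
//     assert_eq!(int_bound_new(&vec![vec![mask]]), 3);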
+fn _int_bound_seet(masks: &Vec<Vec<BitSet>>, largest_remove: usize) -> usize {
+    let mut dup_bonds_2 = 0;
+    let mut dup_bonds_total;
+    let mut max;
+    let mut size_lists = vec![Vec::new(); masks.len()];
+
+    for j in 0..masks.len() {
+        size_lists[j] = vec![0; masks[j].len()];
+        for i in 0..masks[0].len() {
+            size_lists[j][i] = masks[j][i].len();
+        }
+    }
+
+    for i in 0..size_lists[0].len() {
+        dup_bonds_2 += size_lists[0][i] / 2;
+    }
+    dup_bonds_2 -= 1;
+    max = dup_bonds_2;
+
+    for j in 3..largest_remove + 1 {
+        let size_list = &size_lists[j - 2];
+        let size_list2 = &size_lists[0];
+        let mut adjusted_size_list = vec![0; size_list.len()];
+        let mut adjusted_size_list2 = vec![0; size_list.len()];
+        for i in 0..size_list.len() {
+            adjusted_size_list[i] = size_list[i] - size_list[i] % j;
+            adjusted_size_list2[i] = size_list2[i] - adjusted_size_list[i];
+        }
+        dup_bonds_total = 0;
+        for i in 0..size_list.len() {
+            dup_bonds_total += adjusted_size_list[i] - adjusted_size_list[i] / j;
+            dup_bonds_total += adjusted_size_list2[i] - adjusted_size_list2[i] / (j - 1);
+            if adjusted_size_list2[i] % (j - 1) != 0 {
+                dup_bonds_total -= 1;
+            }
+        }
+        dup_bonds_total -= (j as f32).log2().ceil() as usize;
+        max = max.max(dup_bonds_total);
+    }
+
+    max
+}
+
 /// TODO
 // Count number of unique edges in a fragment
 // Helper function for vector bounds
@@ -239,3 +329,139 @@ fn vec_small_frags_bound(fragments: &[BitSet], m: usize, mol: &Molecule) -> usiz
     s - (z + size_two_types.len() + size_two_fragments.len())
         - ((sl - z) as f32 / m as f32).ceil() as usize
 }
+
+pub fn clique_budget_bound(
+    matches: &Vec<(BitSet, BitSet)>,
+    subgraph: &BitSet,
+    fragments: &[BitSet],
+) -> usize {
+    let total_bonds = fragments.iter().map(|x| x.len()).sum::<usize>();
+    let mut bound = 0;
+    // Distinct match sizes in the subgraph, relying on the matches being
+    // ordered by decreasing size.
+    let sizes = {
+        let mut vec = vec![];
+        let mut prev = 0;
+        for s in subgraph.iter().map(|v| matches[v].0.len()) {
+            if s != prev {
+                vec.push(s);
+                prev = s;
+            }
+        }
+
+        vec
+    };
+
+    for i in sizes {
+        let mut bound_temp = 0;
+        let mut has_bonds = fragments.len();
+        let mut num_bonds: Vec<usize> = fragments.iter().map(|x| x.len()).collect();
+        let mut smallest_remove = i;
+
+        for v in subgraph.iter() {
+            if has_bonds == 0 {
+                break;
+            }
+            if matches[v].0.len() > i {
+                continue;
+            }
+
+            // Find the fragment hosting this match's second half.
+            let dup = &matches[v];
+            let bond = dup.1.iter().next().unwrap();
+            let mut j = 0;
+            while !fragments[j].contains(bond) {
+                j += 1;
+            }
+
+            if num_bonds[j] > 0 {
+                let remove = std::cmp::min(dup.0.len(), num_bonds[j]);
+                bound_temp += 1;
+                num_bonds[j] -= remove;
+                smallest_remove = std::cmp::min(smallest_remove, remove);
+
+                if num_bonds[j] == 0 {
+                    has_bonds -= 1;
+                }
+            }
+        }
+
+        let leftover = num_bonds.iter().sum::<usize>();
+        let log = if leftover > 0 {
+            0
+        } else {
+            (smallest_remove as f32).log2().ceil() as usize
+        };
+        bound = std::cmp::max(bound, total_bonds - bound_temp - leftover - log);
+    }
+
+    bound
+}
+
+pub fn cover_bound(
+    matches: &Vec<(BitSet, BitSet)>,
+    graph: &CompatGraph,
+    subgraph: &BitSet,
+    sort: bool,
+) -> usize {
+    // Sort vertices by their degree in the subgraph, highest first.
+    if sort {
+        let mut vertices: Vec<(usize, usize)> = Vec::with_capacity(subgraph.len());
+        for v in subgraph.iter() {
+            vertices.push((v, graph.degree(v, subgraph)));
+        }
+        vertices.sort_by(|a, b| b.1.cmp(&a.1));
+        cover_bound_helper(matches, graph, subgraph, vertices.iter().map(|(v, _)| *v))
+    } else {
+        let vertices = (0..graph.len()).rev().filter(|v| subgraph.contains(*v));
+        cover_bound_helper(matches, graph, subgraph, vertices)
+    }
+}
+
+fn cover_bound_helper(
+    matches: &Vec<(BitSet, BitSet)>,
+    graph: &CompatGraph,
+    subgraph: &BitSet,
+    iter: impl Iterator<Item = usize>,
+) -> usize {
+    let mut colors: Vec<Option<Vec<usize>>> = vec![None; graph.len()];
+    let mut col_weights = vec![];
+    let mut num_col = 0;
+
+    for v in iter {
+        let mut v_col = Vec::new();
+        let mut used = vec![0; num_col];
+
+        // Find the colors used in the neighborhood of v.
+        for u in subgraph.intersection(graph.compatible_with(v)) {
+            let Some(u_col) = &colors[u] else {
+                continue;
+            };
+
+            for c in u_col {
+                used[*c] = 1;
+            }
+        }
+
+        let mut total_weight = 0;
+        let v_val = matches[v].0.len() - 1;
+        // Find the colors to give to v.
+        for c in 0..num_col {
+            if used[c] == 1 {
+                continue;
+            }
+
+            v_col.push(c);
+            total_weight += col_weights[c];
+
+            if total_weight >= v_val {
+                break;
+            }
+        }
+
+        if total_weight == 0 {
+            v_col.push(num_col);
+            col_weights.push(v_val);
+            num_col += 1;
+        } else if total_weight < v_val {
+            let mut k = num_col - 1;
+            while used[k] == 1 {
+                k -= 1;
+            }
+            col_weights[k] += v_val - total_weight;
+        }
+
+        colors[v] = Some(v_col);
+    }
+
+    col_weights.iter().sum()
+}
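// Editor's worked example for `cover_bound_helper` (illustrative; not part of
// the patch). Take vertices a, b, c with weights 3, 2, 2, where a is adjacent
// (compatible) to both b and c, but b and c are not adjacent:
//   - a opens color 0 with weight 3 (no colors exist yet);
//   - b sees color 0 on its neighbor a, so it opens color 1 with weight 2;
//   - c also sees only color 0 used and reuses color 1, covering its weight.
// The bound is the total class weight 3 + 2 = 5, matching the true
// maximum-weight clique here ({a, b} or {a, c}): a clique's vertices receive
// pairwise-disjoint color sets, so the sum of class weights always dominates
// the clique weight.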
diff --git a/src/kernels.rs b/src/kernels.rs
index 5b1f1292..f4574df0 100644
--- a/src/kernels.rs
+++ b/src/kernels.rs
@@ -12,6 +12,8 @@
 //! the equivalent problem of weighted independent set.)

 use clap::ValueEnum;
+use bit_set::BitSet;
+use crate::reductions::CompatGraph;

 /// Graph kernelization strategy when searching using the clique reduction.
 #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
@@ -27,3 +29,85 @@ pub enum KernelMode {
     /// Apply kernels after every fragmentation step.
     Always,
 }
+
+/// Takes a subgraph as input and removes all nodes that will not be part of
+/// any maximum weight clique. Uses the domination strategy described in
+/// Lamm et al.
+pub fn deletion_kernel(
+    matches: &Vec<(BitSet, BitSet)>,
+    g: &CompatGraph,
+    mut subgraph: BitSet,
+) -> BitSet {
+    let subgraph_copy = subgraph.clone();
+
+    for v in subgraph_copy.iter() {
+        let v_val = matches[v].0.len();
+        let neighbors_v = g.neighbors(v, &subgraph);
+
+        let Some(w1) = neighbors_v.iter().next() else {
+            continue;
+        };
+        let Some(w2) = neighbors_v.iter().last() else {
+            continue;
+        };
+
+        // Candidate dominators u must be adjacent to both w1 and w2.
+        let mut s = subgraph.clone();
+        s.intersect_with(g.compatible_with(w1));
+        for u in s.intersection(g.compatible_with(w2)) {
+            if g.are_adjacent(v, u) || v == u {
+                continue;
+            }
+
+            let u_val = matches[u].0.len();
+            if v_val > u_val {
+                continue;
+            }
+
+            let neighbors_u = g.neighbors(u, &subgraph);
+
+            // u dominates v: it is at least as heavy and adjacent to every
+            // neighbor of v, so any clique through v can swap in u instead.
+            if neighbors_v.is_subset(&neighbors_u) {
+                subgraph.remove(v);
+                break;
+            }
+        }
+    }
+
+    subgraph
+}
+
+/// Takes a subgraph as input and returns the first vertex that must be
+/// included in some maximum weight clique, or `matches.len()` if no such
+/// vertex is found. Uses the neighborhood removal and isolated vertex removal
+/// strategies from Lamm et al.
+pub fn inclusion_kernel(
+    matches: &Vec<(BitSet, BitSet)>,
+    g: &CompatGraph,
+    subgraph: &BitSet,
+) -> usize {
+    let tot = subgraph.iter().map(|v| matches[v].0.len() - 1).sum::<usize>();
+
+    'outer: for v in subgraph.iter() {
+        let v_val = matches[v].0.len() - 1;
+
+        // Neighborhood removal: v's weight covers the total weight outside
+        // its neighborhood.
+        let neighbors_val = g
+            .neighbors(v, subgraph)
+            .iter()
+            .map(|u| matches[u].0.len() - 1)
+            .sum::<usize>();
+        if v_val >= tot - neighbors_val - v_val {
+            return v;
+        }
+
+        // Isolated vertex removal: v is forced if its non-neighborhood is
+        // independent and contains no vertex heavier than v.
+        let mut non_neighbors: Vec<usize> = vec![];
+        for u in subgraph.difference(g.compatible_with(v)) {
+            if u == v {
+                continue;
+            }
+            if matches[u].0.len() - 1 > v_val {
+                continue 'outer;
+            }
+
+            for w in non_neighbors.iter() {
+                if g.are_adjacent(u, *w) {
+                    continue 'outer;
+                }
+            }
+
+            non_neighbors.push(u);
+        }
+
+        return v;
+    }
+
+    matches.len()
+}
\ No newline at end of file
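// Editor's note (illustrative; not part of the patch): how a caller might
// combine the two kernels on a compatibility subgraph, with weights
// w(x) = |h1(x)| - 1. `deletion_kernel` drops dominated vertices; then, while
// `inclusion_kernel` finds a forced vertex v (returning `matches.len()` when
// none exists), v can be fixed into the clique and the search restricted to
// its neighborhood.
//
//     let mut sub = deletion_kernel(&matches, &graph, sub);
//     let v = inclusion_kernel(&matches, &graph, &sub);
//     if v < matches.len() {
//         sub = graph.neighbors(v, &sub); // continue with v in the clique
//     }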
diff --git a/src/lib.rs b/src/lib.rs
index a8f8c8c2..80529e9d 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -52,6 +52,9 @@ pub mod canonize;
 pub mod enumerate;
 pub mod kernels;
 pub mod memoize;
+pub mod matches;
+pub mod reductions;
+pub mod state;
 mod vf3;

 // Python wrapper.
diff --git a/src/main.rs b/src/main.rs
index 6df6f4e1..f526425c 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -55,6 +55,10 @@ struct Cli {
     /// Strategy for performing graph kernelization during the search phase.
     #[arg(long, value_enum, default_value_t = KernelMode::None)]
     kernel: KernelMode,
+
+    /// Use the maximum-weight-clique reduction during the search phase.
+    #[arg(long)]
+    clique: bool,
 }

 #[derive(Args, Debug)]
@@ -93,20 +96,36 @@ fn main() -> Result<()> {
     }

     // Handle bounding strategy CLI arguments.
-    let boundlist: &[Bound] = match cli.boundsgroup {
-        // By default, use a combination of the integer and vector bounds.
-        None => &[Bound::Int, Bound::VecSimple, Bound::VecSmallFrags],
+    let boundlist: &[Bound] = match (cli.boundsgroup, cli.clique) {
+        // If clique is not enabled, default to the integer and vector bounds.
+        (None, false) => &[Bound::Int, Bound::VecSimple, /*Bound::VecSmallFrags*/],
+        // If clique is enabled, default to the same integer and vector bounds.
+        (None, true) => &[Bound::Int, Bound::VecSimple],
         // If --no-bounds is set, do not use any bounds.
-        Some(BoundsGroup {
+        (Some(BoundsGroup {
             no_bounds: true, ..
-        }) => &[],
+        }), _) => &[],
         // Otherwise, use the bounds that were specified.
-        Some(BoundsGroup {
+        (Some(BoundsGroup {
             no_bounds: false,
             bounds,
-        }) => &bounds.clone(),
+        }), _) => &bounds.clone(),
     };

+    // The clique-specific bounds and the kernels require --clique.
+    let clique_bounds = [Bound::CliqueBudget, Bound::CoverNoSort, Bound::CoverSort];
+    for bound in clique_bounds {
+        if !cli.clique && boundlist.contains(&bound) {
+            panic!("Bound {:?} cannot be used without clique enabled", bound);
+        }
+    }
+    if !cli.clique && cli.kernel != KernelMode::None {
+        panic!("Cannot use kernels without clique enabled");
+    }
+
     // Call index calculation with all the various options.
     let (index, num_matches, states_searched) = index_search(
         &mol,
@@ -116,6 +135,7 @@
         cli.memoize,
         cli.kernel,
         boundlist,
+        cli.clique,
     );

     // Print final output, depending on --verbose.
diff --git a/src/matches.rs b/src/matches.rs
new file mode 100644
index 00000000..8fbd5a55
--- /dev/null
+++ b/src/matches.rs
@@ -0,0 +1,592 @@
+use std::collections::{BTreeMap, HashMap, HashSet};
+
+use bit_set::BitSet;
+use fxhash::FxHashMap;
+use itertools::Itertools;
+use petgraph::graph::EdgeIndex;
+
+use crate::{
+    bounds::Bound,
+    canonize::{canonize, CanonizeMode, Labeling},
+    molecule::{Bond, Element, Molecule},
+    reductions::CompatGraph,
+    state::State,
+    utils::{connected_components_under_edges, edge_neighbors},
+};
+
+/// A node of the subgraph DAG: a fragment, its isomorphism class, and the
+/// one-edge extensions it was grown into.
+pub struct DagNode {
+    fragment: BitSet,
+    canon_id: usize,
+    children: Vec<usize>,
+}
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
+struct EdgeType {
+    bond: Bond,
+    ends: (Element, Element),
+}
+
+pub struct Matches {
+    id_to_match: Vec<(usize, usize)>,
+    match_to_id: FxHashMap<(usize, usize), usize>,
+    dag: Vec<DagNode>,
+    clique: Option<CompatGraph>,
+    edge_types: Vec<usize>,
+}
+
+impl DagNode {
+    pub fn new(fragment: BitSet, canon_id: usize) -> Self {
+        Self {
+            fragment,
+            canon_id,
+            children: Vec::new(),
+        }
+    }
+
+    pub fn fragment(&self) -> &BitSet {
+        &self.fragment
+    }
+
+    pub fn len(&self) -> usize {
+        self.fragment.len()
+    }
+}
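// Editor's sketch (illustrative; not part of the patch): the shape of the
// fragment DAG that `Matches::new` builds below, for a path molecule with
// edges {0, 1, 2}. Level k holds connected subgraphs of k + 1 edges; a child
// extends its parent by one neighboring edge, and a node is only materialized
// once its isomorphism class yields a disjoint pair.
//
//     let mut dag: Vec<DagNode> = Vec::new();
//     for e in 0..3 {
//         let mut f = BitSet::with_capacity(3);
//         f.insert(e);
//         dag.push(DagNode::new(f, 0)); // level 0: single edges, one class
//     }
//     // Level 1 buckets {0, 1} and {1, 2} into one isomorphism class, but
//     // they overlap at edge 1, so no match (and no DAG node) is created.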
+impl Matches {
+    // Generate the DAG, the clique graph, and other info from the base molecule.
+    // TODO: generate and return the state framework instead of a triplet.
+    pub fn new(mol: &Molecule, canonize_mode: CanonizeMode, use_clique: bool) -> Self {
+        let num_edges = mol.graph().edge_count();
+
+        let mut subgraphs: Vec<usize> = Vec::new();
+        let mut constructed_frags: HashSet<BitSet> = HashSet::new();
+        let mut frag_to_id: HashMap<BitSet, usize> = HashMap::new();
+
+        let mut dag: Vec<DagNode> = Vec::with_capacity(num_edges);
+        let mut matches: Vec<(usize, usize)> = Vec::new();
+
+        let mut next_canon_id = 0;
+
+        // Generate the subgraphs with one edge.
+        for i in 0..num_edges {
+            let mut new_bitset = BitSet::with_capacity(num_edges);
+            new_bitset.insert(i);
+
+            let node = DagNode::new(new_bitset, 0);
+
+            dag.push(node);
+            subgraphs.push(i);
+        }
+
+        next_canon_id += 1;
+
+        while !subgraphs.is_empty() {
+            // Initialize new buckets and a new subgraph list.
+            let mut buckets: BTreeMap<Labeling, Vec<(BitSet, usize)>> = BTreeMap::new();
+            let mut child_subgraphs: Vec<usize> = Vec::new();
+
+            // Extend and bucket the new subgraphs.
+            for parent_idx in subgraphs.iter() {
+                // Get the fragment to be extended.
+                let fragment = &dag[*parent_idx].fragment;
+
+                // Construct the edge neighborhood of the fragment.
+                let mut neighborhood = BitSet::with_capacity(num_edges);
+                for e in fragment.iter() {
+                    neighborhood
+                        .extend(edge_neighbors(mol.graph(), EdgeIndex::new(e)).map(|x| x.index()));
+                }
+                neighborhood.difference_with(fragment);
+
+                // Extend the fragment with each edge from the neighborhood.
+                for e in neighborhood.iter() {
+                    // Create the extended fragment.
+                    let mut new_fragment = fragment.clone();
+                    new_fragment.insert(e);
+
+                    // Skip fragments that have already been constructed.
+                    if constructed_frags.contains(&new_fragment) {
+                        continue;
+                    }
+
+                    // Bucket the subgraph by isomorphism class; this gives
+                    // ownership of new_fragment's clones to the buckets.
+                    let canon = canonize(mol, &new_fragment, canonize_mode);
+                    buckets
+                        .entry(canon)
+                        .and_modify(|bucket| bucket.push((new_fragment.clone(), *parent_idx)))
+                        .or_insert(vec![(new_fragment.clone(), *parent_idx)]);
+
+                    // Remember that this fragment has been constructed.
+                    constructed_frags.insert(new_fragment);
+                }
+            }
+
+            // Go through the buckets to create matches.
+            for fragments in buckets.values() {
+                // Track which fragments already have a match.
+                let mut frag_has_match = BitSet::with_capacity(fragments.len());
+
+                // Loop through pairs of fragments to find matches.
+                for pair in fragments.iter().enumerate().combinations(2) {
+                    // Extract the pair of fragments.
+                    let frag1 = &pair[0].1.0;
+                    let frag2 = &pair[1].1.0;
+
+                    // If the fragments are disjoint, they form a match.
+                    if frag1.is_disjoint(frag2) {
+                        // Extract the bucket indices of the fragments.
+                        let idx1 = pair[0].0;
+                        let idx2 = pair[1].0;
+
+                        // On a fragment's first match, create a DAG node for it.
+                        if !frag_has_match.contains(idx1) {
+                            // Create a new DAG node.
+                            let new_dag_node = DagNode::new(frag1.clone(), next_canon_id);
+                            let child_idx = dag.len();
+
+                            // Add the child to the DAG list.
+                            dag.push(new_dag_node);
+
+                            // Add its index to the parent's child list.
+                            let parent_idx = pair[0].1.1;
+                            let parent = &mut dag[parent_idx];
+                            parent.children.push(child_idx);
+
+                            // Add the DAG node to the next subgraphs list.
+                            child_subgraphs.push(child_idx);
+
+                            // Add it to the frag_to_id map.
+                            frag_to_id.insert(frag1.clone(), child_idx);
+                        }
+                        if !frag_has_match.contains(idx2) {
+                            // Create a new DAG node.
+                            let new_dag_node = DagNode::new(frag2.clone(), next_canon_id);
+                            let child_idx = dag.len();
+
+                            // Add the child to the DAG list.
+                            dag.push(new_dag_node);
+
+                            // Add its index to the parent's child list.
+                            let parent_idx = pair[1].1.1;
+                            let parent = &mut dag[parent_idx];
+                            parent.children.push(child_idx);
+
+                            // Add the DAG node to the next subgraphs list.
+                            child_subgraphs.push(child_idx);
+
+                            // Add it to the frag_to_id map.
+                            frag_to_id.insert(frag2.clone(), child_idx);
+                        }
+
+                        // Get the fragment ids and record the match.
+                        let frag1_id = frag_to_id.get(frag1).unwrap();
+                        let frag2_id = frag_to_id.get(frag2).unwrap();
+
+                        if frag1_id > frag2_id {
+                            matches.push((*frag1_id, *frag2_id));
+                        } else {
+                            matches.push((*frag2_id, *frag1_id));
+                        }
+
+                        // Mark that these fragments have matches.
+                        frag_has_match.insert(idx1);
+                        frag_has_match.insert(idx2);
+                    }
+                }
+
+                // If there was a match, move on to a fresh isomorphism-class id.
+                if !frag_has_match.is_empty() {
+                    next_canon_id += 1;
+                }
+            }
+
+            // Update to the new subgraphs.
+            subgraphs = child_subgraphs;
+        }
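// Editor's note (illustrative; not part of the patch): the sort that follows
// orders pairs by descending fragment id, so pairs of larger fragments get
// the smallest match ids and `last_removed` comparisons prune big pairs first.
//
//     let mut ms = vec![(7, 6), (11, 10), (9, 6)];
//     ms.sort();
//     ms.reverse();
//     assert_eq!(ms, vec![(11, 10), (9, 6), (7, 6)]);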
+        // Sort the matches so that larger fragment ids, and thus fragments
+        // with many edges, get smaller match ids.
+        matches.sort();
+        matches.reverse();
+
+        // Give ids to the matches.
+        let mut next_match_id = 0;
+        let mut id_to_match = Vec::new();
+        let mut match_to_id: FxHashMap<(usize, usize), usize> = FxHashMap::default();
+
+        for m in matches {
+            id_to_match.push(m);
+            match_to_id.insert(m, next_match_id);
+            next_match_id += 1;
+        }
+
+        // Create the edge type ids.
+        let graph = mol.graph();
+        let mut edgetype_to_id: HashMap<EdgeType, usize> = HashMap::new();
+        let mut edge_types: Vec<usize> = vec![0; num_edges];
+        let mut next_id = 0;
+
+        // Create a list of the elements of the nodes, indexable by NodeIndex.
+        let mut node_elements: Vec<Element> = Vec::new();
+        for v in graph.node_weights() {
+            node_elements.push(v.element());
+        }
+
+        // Extract the bond types.
+        let weights: Vec<Bond> = graph.edge_weights().copied().collect();
+
+        // Assign every edge an id shared by all edges with the same bond type
+        // and the same (unordered) pair of endpoint elements.
+        for e in graph.edge_indices() {
+            // Extract the bond type and endpoint atom types.
+            let bond = weights[e.index()];
+            let (end1, end2) = graph.edge_endpoints(e).expect("Edge Endpoint Error");
+
+            let atom1 = node_elements[end1.index()];
+            let atom2 = node_elements[end2.index()];
+
+            // Create the edge type.
+            let ends = if atom1 < atom2 { (atom1, atom2) } else { (atom2, atom1) };
+            let edgetype = EdgeType { bond, ends };
+
+            // Find or create the id for this edge type and store it for this edge.
+            if let Some(id) = edgetype_to_id.get(&edgetype) {
+                edge_types[e.index()] = *id;
+            } else {
+                edge_types[e.index()] = next_id;
+                edgetype_to_id.insert(edgetype, next_id);
+                next_id += 1;
+            }
+        }
+
+        // Build the compatibility graph if the clique reduction is enabled.
+        let clique = if use_clique {
+            Some(CompatGraph::new(&dag, &id_to_match, 2))
+        } else {
+            None
+        };
+
+        Self {
+            id_to_match,
+            match_to_id,
+            dag,
+            clique,
+            edge_types,
+        }
+    }
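// Editor's note (illustrative; not part of the patch): the edge-type ids
// assigned above treat endpoint elements as unordered, so a C-C single bond
// read from either direction maps to one id, while a C=O double bond (a
// different `Bond`/`Element` combination) gets a fresh id. Fragments can only
// hold matching subgraphs if their edge-type multisets agree, which is what
// the vector bound in `Matches::bound` below exploits.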
+    pub fn generate_matches(
+        &self,
+        mol: &Molecule,
+        state: &State,
+        best: usize,
+        bounds: &[Bound],
+    ) -> (Vec<BitSet>, Vec<(usize, usize)>, Option<BitSet>) {
+        self.valid_matches_dag(mol, state, best, bounds)
+
+        // TODO: walk the state's clique subgraph directly when it has one:
+        // if state.use_subgraph() {
+        //     self.valid_matches_subgraph()
+        // } else {
+        //     self.valid_matches_dag(mol, state, best, bounds)
+        // }
+    }
+
+    fn valid_matches_dag(
+        &self,
+        mol: &Molecule,
+        state: &State,
+        best: usize,
+        bounds: &[Bound],
+    ) -> (Vec<BitSet>, Vec<(usize, usize)>, Option<BitSet>) {
+        let num_edges = mol.graph().edge_count();
+        let state_frags = state.fragments();
+        let state_index = state.index();
+        let last_removed = state.last_removed();
+
+        let mut valid_matches: Vec<(usize, usize)> = Vec::new();
+        let mut subgraphs: Vec<(usize, usize)> = Vec::new();
+        let mut masks: Vec<Vec<BitSet>> = Vec::new();
+        let mut buckets_by_len: Vec<HashMap<usize, Vec<(usize, usize)>>> = Vec::new();
+
+        // Create the subgraphs of size 1 as (DAG id, state fragment id) pairs.
+        for (state_id, fragment) in state_frags.iter().enumerate() {
+            for edge_idx in fragment.iter() {
+                subgraphs.push((edge_idx, state_id));
+            }
+        }
+
+        while !subgraphs.is_empty() {
+            // Subgraphs to extend in the next loop.
+            let mut buckets: HashMap<usize, Vec<(usize, usize)>> = HashMap::new();
+            let mut new_subgraphs = Vec::new();
+            let mut used_edge_mask = vec![BitSet::with_capacity(num_edges); state_frags.len()];
+
+            // Extend each subgraph and bucket the results.
+            for (frag_id, state_id) in subgraphs {
+                // Get the state fragment that this subgraph is contained in.
+                let state_frag = &state_frags[state_id];
+
+                // Get the ids of the extensions of this subgraph.
+                let children_ids = &self.dag[frag_id].children;
+
+                for child_id in children_ids {
+                    // An extension is valid iff the extended subgraph is still
+                    // contained in a fragment of the state.
+                    let child_frag = &self.dag[*child_id].fragment;
+                    let valid = child_frag.is_subset(state_frag);
+
+                    if valid {
+                        let canon_id = &self.dag[*child_id].canon_id;
+
+                        // Add the fragment to its bucket.
+                        buckets
+                            .entry(*canon_id)
+                            .and_modify(|bucket| bucket.push((*child_id, state_id)))
+                            .or_insert(vec![(*child_id, state_id)]);
+                    }
+                }
+            }
+
+            // Search through the buckets and record which fragments match.
+            for fragments in buckets.values_mut() {
+                let mut has_match = BitSet::with_capacity(fragments.len());
+                // Loop over pairs of fragments.
+                for i in 0..fragments.len() {
+                    for j in i + 1..fragments.len() {
+                        if has_match.contains(i) && has_match.contains(j) {
+                            continue;
+                        }
+
+                        let mut frag1 = (i, fragments[i]);
+                        let mut frag2 = (j, fragments[j]);
+
+                        // Order the fragments by descending DAG id.
+                        if frag1.1.0 < frag2.1.0 {
+                            std::mem::swap(&mut frag1, &mut frag2);
+                        }
+
+                        let frag1_bucket_idx = frag1.0;
+                        let frag1_id = frag1.1.0;
+                        let frag1_state_id = frag1.1.1;
+
+                        let frag2_bucket_idx = frag2.0;
+                        let frag2_id = frag2.1.0;
+                        let frag2_state_id = frag2.1.1;
+
+                        // Check for a valid match.
+                        if let Some(x) = self.match_to_id.get(&(frag1_id, frag2_id)) {
+                            // Only keep a match if it occurs no earlier in the
+                            // ordering than the last removed match.
+                            if *x >= last_removed {
+                                // On a fragment's first match, add it to the
+                                // new_subgraphs list and update the mask.
+                                if !has_match.contains(frag1_bucket_idx) {
+                                    new_subgraphs.push((frag1_id, frag1_state_id));
+                                    used_edge_mask[frag1_state_id]
+                                        .union_with(&self.dag[frag1_id].fragment);
+                                    has_match.insert(frag1_bucket_idx);
+                                }
+                                if !has_match.contains(frag2_bucket_idx) {
+                                    new_subgraphs.push((frag2_id, frag2_state_id));
+                                    used_edge_mask[frag2_state_id]
+                                        .union_with(&self.dag[frag2_id].fragment);
+                                    has_match.insert(frag2_bucket_idx);
+                                }
+                            }
+                        }
+                    }
+                }
+
+                // Remove any matchless fragments from the bucket.
+                let len = fragments.len();
+                for i in (0..len).rev() {
+                    if !has_match.contains(i) {
+                        fragments.remove(i);
+                    }
+                }
+            }
+
+            // Record this size level's buckets and mask.
+            buckets_by_len.push(buckets);
+            masks.push(used_edge_mask);
+
+            // Update to the new subgraphs.
+            subgraphs = new_subgraphs;
+        }
+
+        // Let the new state be only the edges that can still be matched.
+        let mut state = Vec::new();
+
+        for frag in masks[0].iter() {
+            // Add the connected components to the new state.
+            state.extend(connected_components_under_edges(mol.graph(), frag));
+        }
+
+        // Remove fragments with fewer than two edges.
+        state.retain(|frag| frag.len() >= 2);
+
+        // Use the bounding strategies to find the smallest match length still
+        // worth removing from this state.
+        let largest_length = self.bound(state_index, best, &masks, bounds);
+        // Create the clique-reduction subgraph for this state; `None` means
+        // no subgraph is needed.
+        let mut clique_subgraph = if let Some(clique) = &self.clique {
+            if clique.max_len() >= largest_length {
+                Some(BitSet::with_capacity(clique.len()))
+            } else {
+                None
+            }
+        } else {
+            None
+        };
+
+        // Create the valid matches, and the subgraph if needed.
+        for (bucket_idx, bucket) in buckets_by_len.iter().enumerate().rev() {
+            // If we do not have to create the subgraph and buckets of this
+            // size have been bounded away, stop generating valid matches.
+            let match_len = bucket_idx + 2;
+            if clique_subgraph.is_none() && match_len < largest_length {
+                break;
+            }
+
+            for fragments in bucket.values() {
+                for i in 0..fragments.len() {
+                    for j in i + 1..fragments.len() {
+                        let mut frag1_id = fragments[i].0;
+                        let mut frag2_id = fragments[j].0;
+
+                        if frag1_id < frag2_id {
+                            std::mem::swap(&mut frag1_id, &mut frag2_id);
+                        }
+
+                        if let Some(match_id) = self.match_to_id.get(&(frag1_id, frag2_id)) {
+                            if *match_id >= last_removed {
+                                valid_matches.push((frag1_id, frag2_id));
+
+                                // Add the match to the subgraph if it has a
+                                // clique vertex.
+                                if let Some(sub) = clique_subgraph.as_mut() {
+                                    let clique_id =
+                                        self.clique.as_ref().unwrap().clique_id(*match_id);
+                                    if clique_id != -1 {
+                                        sub.insert(clique_id as usize);
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        // Sort the matches, largest pair first.
+        valid_matches.sort();
+        valid_matches.reverse();
+
+        (state, valid_matches, clique_subgraph)
+    }
+
+    // TODO: generate valid matches by walking the state's clique subgraph.
+    fn valid_matches_subgraph(&self) {}
+
+    // Return the smallest removal size that can possibly result in savings,
+    // i.e., the size below which every bound prunes this state.
+    pub fn bound(
+        &self,
+        state_index: usize,
+        best: usize,
+        masks: &Vec<Vec<BitSet>>,
+        bounds: &[Bound],
+    ) -> usize {
+        let mut largest_length = 2;
+
+        for (i, list) in masks.iter().enumerate() {
+            let mut stop = false;
+            for bound_type in bounds {
+                if stop {
+                    break;
+                }
+
+                match bound_type {
+                    Bound::Int => {
+                        let mut bound = 0;
+                        let i = i + 2;
+                        for (j, frag) in list.iter().enumerate() {
+                            let mf = masks[0][j].len();
+                            let mi = frag.len();
+                            let x = mf + (mi % i) - mi;
+                            bound += mf - (mi / i) - (x + i - 2) / (i - 1);
+                        }
+                        bound -= (i as f32).log2().ceil() as usize;
+
+                        if state_index - bound >= best {
+                            stop = true;
+                        }
+                    }
+                    Bound::VecSimple => {
+                        let size_two_list = &masks[0];
+                        let mut total_set = HashSet::new();
+                        let mut small_set = HashSet::new();
+
+                        for frag in size_two_list.iter() {
+                            for edge in frag.iter() {
+                                total_set.insert(self.edge_types[edge]);
+                            }
+                        }
+                        for frag in list {
+                            for edge in frag.iter() {
+                                small_set.insert(self.edge_types[edge]);
+                            }
+                        }
+
+                        let z = total_set.len();
+                        let zi = small_set.len();
+                        let i = i + 2;
+                        let s: usize = size_two_list.iter().map(|frag| frag.len()).sum();
+                        let mi: usize = list.iter().map(|frag| frag.len()).sum();
+                        let mf = s - mi;
+
+                        let bound = s - {
+                            if z < i {
+                                z - 1
+                                    + (i as f32 / z as f32).log2().ceil() as usize
+                                    + (mi / i)
+                                    + ((mi % i) + mf - (z - zi) + i - 2) / (i - 1)
+                            } else {
+                                z + (mi - zi) / i
+                                    + ((mi - zi) % i + mf - (z - zi) + i - 2) / (i - 1)
+                            }
+                        };
+
+                        if mi == 0 {
+                            break;
+                        }
+
+                        if state_index - bound >= best {
+                            stop = true;
+                        }
+                    }
+                    _ => (),
+                }
+            }
+
+            if stop {
+                largest_length += 1;
+            } else {
+                break;
+            }
+        }
+
+        largest_length
+    }
+
+    pub fn len(&self) -> usize {
+        self.id_to_match.len()
+    }
+
+    pub fn get_frag(&self, frag_id: usize) -> &BitSet {
+        &self.dag[frag_id].fragment
+    }
+
+    pub fn match_id(&self, mat: &(usize, usize)) -> Option<usize> {
+        self.match_to_id.get(mat).copied()
+    }
+
+    pub fn clique_max_len(&self) -> usize {
+        if let Some(clique) = &self.clique {
+            clique.max_len()
+        } else {
+            0
+        }
+    }
+}
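// Editor's note (illustrative; not part of the patch): `Matches::bound` walks
// the mask levels from removal size 2 upward and bumps the threshold while
// every removal of the current size is provably unable to beat `best`. For
// example, if the size-2 and size-3 levels are both bounded away, it returns
// 4, and `valid_matches_dag` emits only matches of four or more edges (unless
// a clique subgraph must still be populated).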
diff --git a/src/memoize.rs b/src/memoize.rs
index 6ae2060e..3d4e5572 100644
--- a/src/memoize.rs
+++ b/src/memoize.rs
@@ -5,10 +5,11 @@ use std::sync::Arc;
 use bit_set::BitSet;
 use clap::ValueEnum;
 use dashmap::DashMap;
+use std::sync::atomic::{AtomicUsize, Ordering::Relaxed};

 use crate::{
     canonize::{canonize, CanonizeMode, Labeling},
-    molecule::Molecule,
+    molecule::Molecule,
+    state::State,
 };

 /// Strategy for memoizing assembly states in the search phase.
@@ -47,6 +48,8 @@ pub struct Cache {
     /// A parallel-aware map from fragments to their canonical labelings; only
     /// used with [`MemoizeMode::CanonIndex`].
     fragment_labels: Arc<DashMap<BitSet, Labeling>>,
+
+    counter: Arc<AtomicUsize>,
 }

 impl Cache {
@@ -57,6 +60,7 @@ impl Cache {
             canonize_mode,
             cache: Arc::new(DashMap::<Labeling, (usize, Vec<usize>)>::new()),
             fragment_labels: Arc::new(DashMap::<BitSet, Labeling>::new()),
+            counter: Arc::new(AtomicUsize::from(0)),
         }
     }
@@ -95,7 +99,7 @@ impl Cache {
     /// preempted by the cached assembly state.
     /// See https://github.com/DaymudeLab/assembly-theory/pull/95 for details.
     pub fn memoize_state(
-        &self,
+        &mut self,
         mol: &Molecule,
         state: &[BitSet],
         state_index: usize,
@@ -121,9 +125,104 @@ impl Cache {
             .value()
             .clone();
         if cached_index <= state_index && cached_order < *removal_order {
+            self.counter.fetch_add(1, Relaxed);
             return true;
         }
     }
     false
     }
+
+    pub fn count(&self) -> usize {
+        self.counter.load(Relaxed)
+    }
+}
+
+#[derive(Clone)]
+pub struct NewCache {
+    memoize_mode: MemoizeMode,
+    cache: Arc<DashMap<Vec<usize>, (usize, Vec<usize>)>>,
+    label_to_canon_id: Arc<DashMap<Labeling, usize>>,
+    frag_to_canon_id: Arc<DashMap<BitSet, usize>>,
+    next_id: Arc<AtomicUsize>,
+    counter: Arc<AtomicUsize>,
+}
+
+impl NewCache {
+    pub fn new(mode: MemoizeMode) -> Self {
+        Self {
+            memoize_mode: mode,
+            cache: Arc::new(DashMap::<Vec<usize>, (usize, Vec<usize>)>::new()),
+            label_to_canon_id: Arc::new(DashMap::<Labeling, usize>::new()),
+            frag_to_canon_id: Arc::new(DashMap::<BitSet, usize>::new()),
+            next_id: Arc::new(AtomicUsize::from(0)),
+            counter: Arc::new(AtomicUsize::from(0)),
+        }
+    }
+
+    /// Returns `true` iff this assembly state is preempted by a cached state,
+    /// i.e., a state over the same fragments with an index at least as good
+    /// and an earlier removal order.
+    pub fn memoize_state(&mut self, mol: &Molecule, state: &State) -> bool {
+        let fragments = state.fragments();
+        let state_index = state.index();
+        let removal_order = state.removal_order();
+
+        if self.memoize_mode == MemoizeMode::None {
+            return false;
+        }
+
+        // Key the state by the sorted canonical ids of its fragments.
+        let mut frag_ids = Vec::new();
+        for frag in fragments {
+            let id = self.get_canon_id(mol, frag);
+            frag_ids.push(id);
+        }
+        frag_ids.sort();
+
+        let mut stop = false;
+
+        self.cache
+            .entry(frag_ids)
+            .and_modify(|val| {
+                if val.0 > state_index || val.1 > *removal_order {
+                    val.0 = state_index;
+                    val.1 = removal_order.clone();
+                } else {
+                    self.counter.fetch_add(1, Relaxed);
+                    stop = true;
+                }
+            })
+            .or_insert((state_index, removal_order.clone()));
+
+        stop
+    }
+
+    fn get_canon_id(&mut self, mol: &Molecule, frag: &BitSet) -> usize {
+        // If the fragment already has an id, use it.
+        if let Some(x) = self.frag_to_canon_id.get(frag) {
+            *x
+        }
+        // Otherwise, canonize the fragment to get its labeling.
+        else {
+            let canon = canonize(mol, frag, CanonizeMode::TreeNauty);
+            // If the labeling already has an id, reuse it.
+            if let Some(x) = self.label_to_canon_id.get(&canon) {
+                let id = *x;
+                self.frag_to_canon_id.insert(frag.clone(), id);
+                id
+            }
+            // Otherwise, assign a new id.
+            else {
+                let id = self.next_id.fetch_add(1, Relaxed);
+                self.label_to_canon_id.insert(canon, id);
+                self.frag_to_canon_id.insert(frag.clone(), id);
+                id
+            }
+        }
+    }
+
+    pub fn add_count(&mut self) {
+        self.counter.fetch_add(1, Relaxed);
+    }
+
+    pub fn count(&self) -> usize {
+        self.counter.load(Relaxed)
+    }
+}
\ No newline at end of file
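// Editor's sketch (illustrative; not part of the patch): keying `NewCache` on
// the sorted canonical ids of a state's fragments makes memoization
// order-insensitive, since two states with pairwise-isomorphic fragments
// collapse to the same key.
//
//     let (mut a, mut b) = (vec![0, 2, 2], vec![2, 0, 2]);
//     a.sort();
//     b.sort();
//     assert_eq!(a, b); // both states hit the same DashMap entry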
diff --git a/src/python.rs b/src/python.rs
index df74f5f2..e224e0c0 100644
--- a/src/python.rs
+++ b/src/python.rs
@@ -470,6 +470,8 @@ pub fn _index_search(
         memoize_mode,
         kernel_mode,
         &boundlist,
+        // TODO: implement a clique parameter for the Python interface.
+        false,
     ))
 }
diff --git a/src/reductions.rs b/src/reductions.rs
new file mode 100644
index 00000000..5b60771b
--- /dev/null
+++ b/src/reductions.rs
@@ -0,0 +1,146 @@
+//! Reduces the optimal molecular assembly index problem to maximum weighted clique.
+//!
+//! Removing a duplicatable subgraph pair (h1, h2) gives a savings of |h1| - 1.
+//! Finding the optimal molecular assembly index can thus be thought of as finding
+//! a set of duplicatable subgraph removals that are all compatible with each other
+//! (i.e., can be removed at the same time) and have the largest possible savings.
+//! This is close to the problem of finding a maximum clique in a graph whose
+//! vertices are duplicatable subgraph pairs with weights |h1| - 1 and whose edges
+//! represent compatibility. However, the order of duplicatable subgraph removals
+//! can affect compatibility, so edges must be directed. By ordering the vertices
+//! carefully, we can ignore edge directionality.
+
+use bit_set::BitSet;
+use std::time::Instant;
+
+use crate::matches::DagNode;
+
+/// Graph representation of the compatibility of duplicatable subgraph pairs.
+pub struct CompatGraph {
+    /// The graph is implemented as an adjacency matrix: the i-th bitset has a
+    /// 1 at position j iff {i, j} is an edge.
+    graph: Vec<BitSet>,
+    max_len: usize,
+    offset: usize,
+}
+
+impl CompatGraph {
+    /// Constructs a compatibility graph over all matches whose fragments have
+    /// at most `max_len` edges.
+    pub fn new(dag: &Vec<DagNode>, matches: &Vec<(usize, usize)>, max_len: usize) -> Self {
+        let start = Instant::now();
+
+        // Find the index of the first match in the clique graph.
+        // TODO: what if max_len < the maximum fragment length in the DAG?
+        let mut offset = 0;
+        while dag[matches[offset].0].len() > max_len {
+            offset += 1;
+        }
+
+        // Compute the number of nodes in the clique graph.
+        let size = matches.len() - offset;
+
+        // Initialize an empty graph.
+        let mut init_graph: Vec<BitSet> = Vec::with_capacity(size);
+        for _ in 0..size {
+            init_graph.push(BitSet::with_capacity(size));
+        }
+
+        // Populate the graph.
+        for (idx1, match1) in matches[offset..].iter().enumerate() {
+            for (idx2, match2) in matches[offset + idx1 + 1..].iter().enumerate() {
+                let idx2 = idx2 + idx1 + 1;
+                let h1 = dag[match1.0].fragment();
+                let h2 = dag[match1.1].fragment();
+                let h1p = dag[match2.0].fragment();
+                let h2p = dag[match2.1].fragment();
+
+                // (h1p, h2p) can still be removed after (h1, h2) iff neither
+                // of its halves collides with h2 and each half is either
+                // disjoint from h1 or nested inside it.
+                let forward_compatible = h2.is_disjoint(h1p)
+                    && h2.is_disjoint(h2p)
+                    && (h1.is_disjoint(h1p) || h1.is_superset(h1p))
+                    && (h1.is_disjoint(h2p) || h1.is_superset(h2p));
+
+                if forward_compatible {
+                    init_graph[idx1].insert(idx2);
+                    init_graph[idx2].insert(idx1);
+                }
+            }
+        }
+
+        println!("Graph Time: {:?}", start.elapsed());
+
+        Self {
+            graph: init_graph,
+            max_len,
+            offset,
+        }
+    }
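    // Editor's check of the forward-compatibility rule (illustrative; not part
    // of the patch), on plain bitsets; `bits` collects edge ids into a BitSet.
    //
    //     let (h1, h2) = (bits(&[0, 1, 2, 3]), bits(&[4, 5, 6, 7]));
    //     let (h1p, h2p) = (bits(&[0, 1]), bits(&[2, 3])); // nested in h1: ok
    //     assert!(h2.is_disjoint(&h1p) && h2.is_disjoint(&h2p));
    //     assert!(h1.is_superset(&h1p) && h1.is_superset(&h2p));
    //     let bad = bits(&[3, 4]); // straddles h1 and h2: incompatible
    //     assert!(!(h1.is_disjoint(&bad) || h1.is_superset(&bad)));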
+    /// Returns the number of vertices in the graph.
+    pub fn len(&self) -> usize {
+        self.graph.len()
+    }
+
+    pub fn max_len(&self) -> usize {
+        self.max_len
+    }
+
+    /// Takes a match id and returns its id in the clique graph, or -1 if the
+    /// match is not part of the clique graph.
+    pub fn clique_id(&self, match_id: usize) -> isize {
+        if match_id < self.offset {
+            -1
+        } else {
+            (match_id - self.offset) as isize
+        }
+    }
+
+    /// Returns the degree of vertex v in the subgraph.
+    pub fn degree(&self, v: usize, subgraph: &BitSet) -> usize {
+        self.graph[v].intersection(subgraph).count()
+    }
+
+    /// Returns the set of matches compatible with v, i.e., the neighbors of v.
+    pub fn compatible_with(&self, v: usize) -> &BitSet {
+        &self.graph[v]
+    }
+
+    /// Returns the neighbors of v in the given subgraph.
+    /// The returned BitSet is a clone and can safely be modified by the
+    /// calling function without changing the graph.
+    pub fn neighbors(&self, v: usize, subgraph: &BitSet) -> BitSet {
+        let mut neighbors = self.graph[v].clone();
+        neighbors.intersect_with(subgraph);
+
+        neighbors
+    }
+
+    /// Returns the neighbors of v that occur after v in the matches list.
+    /// The returned BitSet is a clone and can safely be modified by the
+    /// calling function without changing the graph.
+    pub fn forward_neighbors(&self, v: usize, subgraph: &BitSet) -> BitSet {
+        let mut neighbors = self.graph[v].clone();
+        neighbors.intersect_with(subgraph);
+
+        // Iteration is in ascending order, so drop everything up to v.
+        let mut to_remove = vec![];
+        for u in neighbors.iter() {
+            if u <= v {
+                to_remove.push(u);
+            } else {
+                break;
+            }
+        }
+        for u in to_remove {
+            neighbors.remove(u);
+        }
+
+        neighbors
+    }
+
+    /// Returns true if vertices v and u are adjacent.
+    pub fn are_adjacent(&self, v: usize, u: usize) -> bool {
+        self.graph[v].contains(u)
+    }
+}
\ No newline at end of file
diff --git a/src/state.rs b/src/state.rs
new file mode 100644
index 00000000..4ad1a82f
--- /dev/null
+++ b/src/state.rs
@@ -0,0 +1,70 @@
+use bit_set::BitSet;
+
+use crate::{matches::Matches, molecule::Molecule};
+
+pub struct State {
+    fragments: Vec<BitSet>,
+    clique_subgraph: Option<BitSet>,
+    index: usize,
+    removal_order: Vec<usize>,
+    last_removed: usize,
+}
+
+impl State {
+    pub fn new(mol: &Molecule) -> Self {
+        Self {
+            fragments: {
+                let mut init = BitSet::new();
+                init.extend(mol.graph().edge_indices().map(|ix| ix.index()));
+                vec![init]
+            },
+            clique_subgraph: None,
+            index: mol.graph().edge_count() - 1,
+            removal_order: Vec::new(),
+            last_removed: 0,
+        }
+    }
+
+    pub fn update(
+        &self,
+        matches: &Matches,
+        fragments: Vec<BitSet>,
+        mat: &(usize, usize),
+        order_idx: usize,
+        subgraph: &Option<BitSet>,
+    ) -> Self {
+        let match_len = matches.get_frag(mat.0).len();
+
+        Self {
+            fragments,
+            index: self.index - match_len + 1,
+            removal_order: {
+                let mut clone = self.removal_order.clone();
+                clone.push(order_idx);
+                clone
+            },
+            last_removed: matches.match_id(mat).unwrap(),
+            clique_subgraph: {
+                if subgraph.is_some() && match_len <= matches.clique_max_len() {
+                    subgraph.clone()
+                } else {
+                    None
+                }
+            },
+        }
+    }
+
+    pub fn fragments(&self) -> &[BitSet] {
+        &self.fragments
+    }
+
+    pub fn index(&self) -> usize {
+        self.index
+    }
+
+    pub fn removal_order(&self) -> &Vec<usize> {
+        &self.removal_order
+    }
+
+    pub fn last_removed(&self) -> usize {
+        self.last_removed
+    }
+
+    pub fn use_subgraph(&self) -> bool {
+        self.clique_subgraph.is_some()
+    }
+}
\ No newline at end of file
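// Editor's walkthrough of `State::update` (illustrative; not part of the
// patch). For a molecule with 12 edges, the initial state has index
// 12 - 1 = 11. Removing a match whose fragments have 4 edges gives
//     index' = 11 - 4 + 1 = 8,
// a savings of |h1| - 1 = 3, while `removal_order` gains one entry and
// `last_removed` advances to the removed match's id, so earlier-ordered
// matches are never retried along this branch.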