From f79cad746a70fa94b85d074a055691b6d3e36bd5 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Mon, 9 Jun 2025 13:28:30 -0700 Subject: [PATCH 01/32] Create Branch --- src/assembly.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/assembly.rs b/src/assembly.rs index dbb04c60..2cf3eda0 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -230,6 +230,14 @@ fn recurse_index_search( cx } +fn recurese_clique_index_search() { + +} + +pub fn clique_index_search() { + +} + #[allow(clippy::too_many_arguments)] fn parallel_recurse_index_search( mol: &Molecule, From c44096b2d1d792b96710aa4e506f23d464f6c4d2 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Mon, 9 Jun 2025 13:54:46 -0700 Subject: [PATCH 02/32] Add matches_graph generation --- src/assembly.rs | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 2cf3eda0..6439d50f 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -24,6 +24,7 @@ use std::{ }; use bit_set::BitSet; +use petgraph::{graph::NodeIndex, Graph, Undirected}; use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator}; use crate::{ @@ -230,12 +231,39 @@ fn recurse_index_search( cx } -fn recurese_clique_index_search() { +fn recurse_clique_index_search() { } -pub fn clique_index_search() { +pub fn clique_index_search(mol: &Molecule) { + let mut matches: Vec<(BitSet, BitSet)> = mol.matches().collect(); + matches.sort_by(|e1, e2| e2.0.len().cmp(&e1.0.len())); + + let mut matches_graph = Graph::<(usize, usize), _, Undirected>::new_undirected(); + let mut nodes: Vec = Vec::new(); + + // Create vertices of matches_graph + for (idx, (h1, _)) in matches.iter().enumerate() { + let v = matches_graph.add_node((h1.len() - 1, idx)); + nodes.push(v); + } + // Create edges of matches_graph + // two matches are connected if they are compatible with each other + for (idx1, (h1, h2)) in matches.iter().enumerate() { + for (idx2, (h1p, h2p)) in matches[idx1 + 1..].iter().enumerate() { + let compatible = { + h2.is_disjoint(h2p) && + (h1.is_disjoint(h1p) || h1.is_subset(h1p) || h1.is_superset(h1p)) && + (h1.is_disjoint(h2p) || h1.is_superset(h2p)) && + (h1p.is_disjoint(h2) || h1p.is_superset(h2)) + }; + + if compatible { + matches_graph.add_edge(nodes[idx1], nodes[idx1 + idx2 + 1], ()); + } + } + } } #[allow(clippy::too_many_arguments)] From e79f665449e0e6b565eee3fc29896f8506071bb0 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Mon, 9 Jun 2025 14:55:38 -0700 Subject: [PATCH 03/32] Add search method using matches graph --- src/assembly.rs | 107 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 104 insertions(+), 3 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 6439d50f..3a5a2efe 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -231,21 +231,103 @@ fn recurse_index_search( cx } -fn recurse_clique_index_search() { - +fn recurse_clique_index_search(mol: &Molecule, + matches: &Vec<(BitSet, BitSet)>, + fragments: &[BitSet], + ix: usize, + largest_remove: usize, + mut best: usize, + states_searched: &mut usize, + subgraph: BitSet, + nodes: &Vec, + matches_graph: &Graph<(usize, usize), (), Undirected>, +) -> usize { + let mut cx = ix; + + *states_searched += 1; + + // Search for duplicatable fragment + for x in subgraph.iter() { + let v = nodes[x]; + let (h1, h2) = &matches[x]; + + let mut fractures = fragments.to_owned(); + let f1 = fragments.iter().enumerate().find(|(_, c)| h1.is_subset(c)); + let f2 = fragments.iter().enumerate().find(|(_, c)| h2.is_subset(c)); + + let largest_remove = h1.len(); + + let (Some((i1, f1)), Some((i2, f2))) = (f1, f2) else { + continue; + }; + + // All of these clones are on bitsets and cheap enough + if i1 == i2 { + let mut union = h1.clone(); + union.union_with(h2); + let mut difference = f1.clone(); + difference.difference_with(&union); + let c = connected_components_under_edges(mol.graph(), &difference); + fractures.extend(c); + fractures.swap_remove(i1); + } else { + let mut f1r = f1.clone(); + f1r.difference_with(h1); + let mut f2r = f2.clone(); + f2r.difference_with(h2); + + let c1 = connected_components_under_edges(mol.graph(), &f1r); + let c2 = connected_components_under_edges(mol.graph(), &f2r); + + fractures.extend(c1); + fractures.extend(c2); + + fractures.swap_remove(i1.max(i2)); + fractures.swap_remove(i1.min(i2)); + } + + fractures.retain(|i| i.len() > 1); + fractures.push(h1.clone()); + + let mut subgraph_clone = subgraph.clone(); + let mut neighbors = BitSet::new(); + for u in matches_graph.neighbors(v) { + neighbors.insert(matches_graph.node_weight(u).unwrap().1); + } + subgraph_clone.intersect_with(&neighbors); + + cx = cx.min(recurse_clique_index_search( + mol, + matches, + &fractures, + ix - h1.len() + 1, + largest_remove, + best, + states_searched, + subgraph_clone, + nodes, + matches_graph, + )); + best = best.min(cx); + } + + cx } -pub fn clique_index_search(mol: &Molecule) { +pub fn clique_index_search(mol: &Molecule) -> (u32, u32, usize) { let mut matches: Vec<(BitSet, BitSet)> = mol.matches().collect(); matches.sort_by(|e1, e2| e2.0.len().cmp(&e1.0.len())); let mut matches_graph = Graph::<(usize, usize), _, Undirected>::new_undirected(); let mut nodes: Vec = Vec::new(); + let mut subgraph = BitSet::new(); + // Create vertices of matches_graph for (idx, (h1, _)) in matches.iter().enumerate() { let v = matches_graph.add_node((h1.len() - 1, idx)); nodes.push(v); + subgraph.insert(idx); } // Create edges of matches_graph @@ -264,6 +346,25 @@ pub fn clique_index_search(mol: &Molecule) { } } } + + let mut init = BitSet::new(); + init.extend(mol.graph().edge_indices().map(|ix| ix.index())); + let edge_count = mol.graph().edge_count(); + + let mut total_search = 0; + let index = recurse_clique_index_search( + mol, + &matches, + &[init], + edge_count - 1, + edge_count, + edge_count - 1, + &mut total_search, + subgraph, + &nodes, + &matches_graph); + + (index as u32, matches.len() as u32, total_search) } #[allow(clippy::too_many_arguments)] From d70e84e89e0eb276463801e0c3428c588959bc35 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Mon, 9 Jun 2025 15:17:02 -0700 Subject: [PATCH 04/32] Create Branch --- src/assembly.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/assembly.rs b/src/assembly.rs index dbb04c60..2cf3eda0 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -230,6 +230,14 @@ fn recurse_index_search( cx } +fn recurese_clique_index_search() { + +} + +pub fn clique_index_search() { + +} + #[allow(clippy::too_many_arguments)] fn parallel_recurse_index_search( mol: &Molecule, From 72649cca46c027628b6167650d8227071371e902 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Mon, 9 Jun 2025 15:17:03 -0700 Subject: [PATCH 05/32] Add matches_graph generation --- src/assembly.rs | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 2cf3eda0..6439d50f 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -24,6 +24,7 @@ use std::{ }; use bit_set::BitSet; +use petgraph::{graph::NodeIndex, Graph, Undirected}; use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator}; use crate::{ @@ -230,12 +231,39 @@ fn recurse_index_search( cx } -fn recurese_clique_index_search() { +fn recurse_clique_index_search() { } -pub fn clique_index_search() { +pub fn clique_index_search(mol: &Molecule) { + let mut matches: Vec<(BitSet, BitSet)> = mol.matches().collect(); + matches.sort_by(|e1, e2| e2.0.len().cmp(&e1.0.len())); + + let mut matches_graph = Graph::<(usize, usize), _, Undirected>::new_undirected(); + let mut nodes: Vec = Vec::new(); + + // Create vertices of matches_graph + for (idx, (h1, _)) in matches.iter().enumerate() { + let v = matches_graph.add_node((h1.len() - 1, idx)); + nodes.push(v); + } + // Create edges of matches_graph + // two matches are connected if they are compatible with each other + for (idx1, (h1, h2)) in matches.iter().enumerate() { + for (idx2, (h1p, h2p)) in matches[idx1 + 1..].iter().enumerate() { + let compatible = { + h2.is_disjoint(h2p) && + (h1.is_disjoint(h1p) || h1.is_subset(h1p) || h1.is_superset(h1p)) && + (h1.is_disjoint(h2p) || h1.is_superset(h2p)) && + (h1p.is_disjoint(h2) || h1p.is_superset(h2)) + }; + + if compatible { + matches_graph.add_edge(nodes[idx1], nodes[idx1 + idx2 + 1], ()); + } + } + } } #[allow(clippy::too_many_arguments)] From d38b457c7423612e60d433674c9a041b5eca5d11 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Mon, 9 Jun 2025 15:17:04 -0700 Subject: [PATCH 06/32] Add search method using matches graph --- src/assembly.rs | 107 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 104 insertions(+), 3 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 6439d50f..3a5a2efe 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -231,21 +231,103 @@ fn recurse_index_search( cx } -fn recurse_clique_index_search() { - +fn recurse_clique_index_search(mol: &Molecule, + matches: &Vec<(BitSet, BitSet)>, + fragments: &[BitSet], + ix: usize, + largest_remove: usize, + mut best: usize, + states_searched: &mut usize, + subgraph: BitSet, + nodes: &Vec, + matches_graph: &Graph<(usize, usize), (), Undirected>, +) -> usize { + let mut cx = ix; + + *states_searched += 1; + + // Search for duplicatable fragment + for x in subgraph.iter() { + let v = nodes[x]; + let (h1, h2) = &matches[x]; + + let mut fractures = fragments.to_owned(); + let f1 = fragments.iter().enumerate().find(|(_, c)| h1.is_subset(c)); + let f2 = fragments.iter().enumerate().find(|(_, c)| h2.is_subset(c)); + + let largest_remove = h1.len(); + + let (Some((i1, f1)), Some((i2, f2))) = (f1, f2) else { + continue; + }; + + // All of these clones are on bitsets and cheap enough + if i1 == i2 { + let mut union = h1.clone(); + union.union_with(h2); + let mut difference = f1.clone(); + difference.difference_with(&union); + let c = connected_components_under_edges(mol.graph(), &difference); + fractures.extend(c); + fractures.swap_remove(i1); + } else { + let mut f1r = f1.clone(); + f1r.difference_with(h1); + let mut f2r = f2.clone(); + f2r.difference_with(h2); + + let c1 = connected_components_under_edges(mol.graph(), &f1r); + let c2 = connected_components_under_edges(mol.graph(), &f2r); + + fractures.extend(c1); + fractures.extend(c2); + + fractures.swap_remove(i1.max(i2)); + fractures.swap_remove(i1.min(i2)); + } + + fractures.retain(|i| i.len() > 1); + fractures.push(h1.clone()); + + let mut subgraph_clone = subgraph.clone(); + let mut neighbors = BitSet::new(); + for u in matches_graph.neighbors(v) { + neighbors.insert(matches_graph.node_weight(u).unwrap().1); + } + subgraph_clone.intersect_with(&neighbors); + + cx = cx.min(recurse_clique_index_search( + mol, + matches, + &fractures, + ix - h1.len() + 1, + largest_remove, + best, + states_searched, + subgraph_clone, + nodes, + matches_graph, + )); + best = best.min(cx); + } + + cx } -pub fn clique_index_search(mol: &Molecule) { +pub fn clique_index_search(mol: &Molecule) -> (u32, u32, usize) { let mut matches: Vec<(BitSet, BitSet)> = mol.matches().collect(); matches.sort_by(|e1, e2| e2.0.len().cmp(&e1.0.len())); let mut matches_graph = Graph::<(usize, usize), _, Undirected>::new_undirected(); let mut nodes: Vec = Vec::new(); + let mut subgraph = BitSet::new(); + // Create vertices of matches_graph for (idx, (h1, _)) in matches.iter().enumerate() { let v = matches_graph.add_node((h1.len() - 1, idx)); nodes.push(v); + subgraph.insert(idx); } // Create edges of matches_graph @@ -264,6 +346,25 @@ pub fn clique_index_search(mol: &Molecule) { } } } + + let mut init = BitSet::new(); + init.extend(mol.graph().edge_indices().map(|ix| ix.index())); + let edge_count = mol.graph().edge_count(); + + let mut total_search = 0; + let index = recurse_clique_index_search( + mol, + &matches, + &[init], + edge_count - 1, + edge_count, + edge_count - 1, + &mut total_search, + subgraph, + &nodes, + &matches_graph); + + (index as u32, matches.len() as u32, total_search) } #[allow(clippy::too_many_arguments)] From 0eba9e59bf94e899b485f23e93ffebbda7bbe008 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Tue, 10 Jun 2025 15:16:29 -0700 Subject: [PATCH 07/32] Add reduction rule --- src/assembly.rs | 107 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 98 insertions(+), 9 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 3a5a2efe..9ba440fb 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -16,15 +16,14 @@ //! # } //! ``` use std::{ - collections::BTreeSet, - sync::{ + char::UNICODE_VERSION, collections::BTreeSet, sync::{ atomic::{AtomicUsize, Ordering::Relaxed}, Arc, - }, + } }; use bit_set::BitSet; -use petgraph::{graph::NodeIndex, Graph, Undirected}; +use petgraph::{graph::NodeIndex, prelude::StableGraph, Graph, Undirected}; use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator}; use crate::{ @@ -237,15 +236,31 @@ fn recurse_clique_index_search(mol: &Molecule, ix: usize, largest_remove: usize, mut best: usize, + bounds: &[Bound], states_searched: &mut usize, subgraph: BitSet, nodes: &Vec, - matches_graph: &Graph<(usize, usize), (), Undirected>, + matches_graph: &StableGraph<(usize, usize), (), Undirected>, ) -> usize { let mut cx = ix; *states_searched += 1; + // Branch and Bound + for bound_type in bounds { + let exceeds = match bound_type { + Bound::Log => ix - log_bound(fragments) >= best, + Bound::IntChain => ix - addition_bound(fragments, largest_remove) >= best, + Bound::VecChainSimple => ix - vec_bound_simple(fragments, largest_remove, mol) >= best, + Bound::VecChainSmallFrags => { + ix - vec_bound_small_frags(fragments, largest_remove, mol) >= best + } + }; + if exceeds { + return ix; + } + } + // Search for duplicatable fragment for x in subgraph.iter() { let v = nodes[x]; @@ -291,8 +306,12 @@ fn recurse_clique_index_search(mol: &Molecule, let mut subgraph_clone = subgraph.clone(); let mut neighbors = BitSet::new(); + let weight_v = matches_graph.node_weight(v).unwrap().1; for u in matches_graph.neighbors(v) { - neighbors.insert(matches_graph.node_weight(u).unwrap().1); + let weight_u = matches_graph.node_weight(u).unwrap().1; + if weight_u >= weight_v { + neighbors.insert(matches_graph.node_weight(u).unwrap().1); + } } subgraph_clone.intersect_with(&neighbors); @@ -303,6 +322,7 @@ fn recurse_clique_index_search(mol: &Molecule, ix - h1.len() + 1, largest_remove, best, + bounds, states_searched, subgraph_clone, nodes, @@ -314,11 +334,56 @@ fn recurse_clique_index_search(mol: &Molecule, cx } -pub fn clique_index_search(mol: &Molecule) -> (u32, u32, usize) { +fn kernelize(mut g: StableGraph<(usize, usize), (), Undirected>, mut subgraph: BitSet) + -> (StableGraph<(usize, usize), (), Undirected>, BitSet) { + let nodes: Vec = g.node_indices().collect(); + let mut count = 0; + + for (idx, v) in nodes.iter().enumerate() { + let v_weight = g.node_weight(*v); + if v_weight == None { + continue; + } + let v_weight = v_weight.unwrap(); + let v_val = v_weight.0; + let v_rank = v_weight.1; + let neighbors_v: BTreeSet = g.neighbors(*v).map(|x| g.node_weight(x).unwrap().1).collect(); + + for u in &nodes[idx+1..] { + let u_weight = g.node_weight(*u); + if u_weight == None { + continue; + } + let u_weight = u_weight.unwrap(); + let u_val = u_weight.0; + let u_rank = u_weight.1; + let neighbors_u: BTreeSet = g.neighbors(*u).map(|x| g.node_weight(x).unwrap().1).collect(); + + if neighbors_u.is_subset(&neighbors_v) && u_val <= v_val { + count += 1; + + g.remove_node(*u); + subgraph.remove(u_rank); + + } + else if neighbors_v.is_subset(&neighbors_u) && v_val <= u_val { + count += 1; + + g.remove_node(*v); + subgraph.remove(v_rank); + } + } + } + + // println!("Reduce count: {}", count); + (g, subgraph) +} + +pub fn clique_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize) { let mut matches: Vec<(BitSet, BitSet)> = mol.matches().collect(); matches.sort_by(|e1, e2| e2.0.len().cmp(&e1.0.len())); - let mut matches_graph = Graph::<(usize, usize), _, Undirected>::new_undirected(); + let mut matches_graph = StableGraph::<(usize, usize), _, Undirected>::with_capacity(matches.len(), matches.len()); let mut nodes: Vec = Vec::new(); let mut subgraph = BitSet::new(); @@ -351,19 +416,35 @@ pub fn clique_index_search(mol: &Molecule) -> (u32, u32, usize) { init.extend(mol.graph().edge_indices().map(|ix| ix.index())); let edge_count = mol.graph().edge_count(); + (matches_graph, subgraph) = kernelize(matches_graph, subgraph); + + for x in &subgraph { + if matches_graph.node_weight(nodes[x]) == None { + println!("!: {}", x); + } + } + let mut total_search = 0; + + use std::time::Instant; + let start = Instant::now(); + let index = recurse_clique_index_search( mol, &matches, &[init], edge_count - 1, edge_count, - edge_count - 1, + edge_count - 1, + bounds, &mut total_search, subgraph, &nodes, &matches_graph); + let dur = start.elapsed(); + println!("Time: {:?}", dur); + (index as u32, matches.len() as u32, total_search) } @@ -569,6 +650,10 @@ pub fn serial_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize let edge_count = mol.graph().edge_count(); let mut total_search = 0; + + use std::time::Instant; + let start = Instant::now(); + let index = recurse_index_search( mol, &matches, @@ -579,6 +664,10 @@ pub fn serial_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize bounds, &mut total_search, ); + + let dur = start.elapsed(); + println!("Time: {:?}", dur); + (index as u32, matches.len() as u32, total_search) } From c169da21ea6b40aa37bd461ce2d304859a49f6d2 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Tue, 10 Jun 2025 16:05:41 -0700 Subject: [PATCH 08/32] Modify kernelization data structure --- src/assembly.rs | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 9ba440fb..2a2a0173 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -23,7 +23,7 @@ use std::{ }; use bit_set::BitSet; -use petgraph::{graph::NodeIndex, prelude::StableGraph, Graph, Undirected}; +use petgraph::{graph::NodeIndex, Graph, Undirected}; use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator}; use crate::{ @@ -240,7 +240,7 @@ fn recurse_clique_index_search(mol: &Molecule, states_searched: &mut usize, subgraph: BitSet, nodes: &Vec, - matches_graph: &StableGraph<(usize, usize), (), Undirected>, + matches_graph: &Graph<(usize, usize), (), Undirected>, ) -> usize { let mut cx = ix; @@ -314,6 +314,7 @@ fn recurse_clique_index_search(mol: &Molecule, } } subgraph_clone.intersect_with(&neighbors); + // subgraph_clone = kernelize(&matches_graph, subgraph_clone); cx = cx.min(recurse_clique_index_search( mol, @@ -334,56 +335,59 @@ fn recurse_clique_index_search(mol: &Molecule, cx } -fn kernelize(mut g: StableGraph<(usize, usize), (), Undirected>, mut subgraph: BitSet) - -> (StableGraph<(usize, usize), (), Undirected>, BitSet) { +fn kernelize(g: &Graph<(usize, usize), (), Undirected>, mut subgraph: BitSet) -> BitSet { let nodes: Vec = g.node_indices().collect(); let mut count = 0; for (idx, v) in nodes.iter().enumerate() { let v_weight = g.node_weight(*v); - if v_weight == None { - continue; - } let v_weight = v_weight.unwrap(); let v_val = v_weight.0; let v_rank = v_weight.1; - let neighbors_v: BTreeSet = g.neighbors(*v).map(|x| g.node_weight(x).unwrap().1).collect(); + if !subgraph.contains(v_rank) { + continue; + } + let neighbors_v: BTreeSet = g.neighbors(*v) + .map(|x| g.node_weight(x).unwrap().1) + .filter(|x| subgraph.contains(*x)) + .collect(); for u in &nodes[idx+1..] { let u_weight = g.node_weight(*u); - if u_weight == None { - continue; - } let u_weight = u_weight.unwrap(); let u_val = u_weight.0; let u_rank = u_weight.1; - let neighbors_u: BTreeSet = g.neighbors(*u).map(|x| g.node_weight(x).unwrap().1).collect(); + if !subgraph.contains(u_rank) { + continue; + } + let neighbors_u: BTreeSet = g.neighbors(*u) + .map(|x| g.node_weight(x).unwrap().1) + .filter(|x| subgraph.contains(*x)) + .collect(); if neighbors_u.is_subset(&neighbors_v) && u_val <= v_val { count += 1; - g.remove_node(*u); subgraph.remove(u_rank); } else if neighbors_v.is_subset(&neighbors_u) && v_val <= u_val { count += 1; - g.remove_node(*v); subgraph.remove(v_rank); } } } // println!("Reduce count: {}", count); - (g, subgraph) + subgraph } pub fn clique_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize) { let mut matches: Vec<(BitSet, BitSet)> = mol.matches().collect(); matches.sort_by(|e1, e2| e2.0.len().cmp(&e1.0.len())); - let mut matches_graph = StableGraph::<(usize, usize), _, Undirected>::with_capacity(matches.len(), matches.len()); + let mut matches_graph = Graph::<(usize, usize), _, Undirected>::with_capacity(matches.len(), matches.len()); let mut nodes: Vec = Vec::new(); let mut subgraph = BitSet::new(); @@ -416,7 +420,7 @@ pub fn clique_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize init.extend(mol.graph().edge_indices().map(|ix| ix.index())); let edge_count = mol.graph().edge_count(); - (matches_graph, subgraph) = kernelize(matches_graph, subgraph); + subgraph = kernelize(&matches_graph, subgraph); for x in &subgraph { if matches_graph.node_weight(nodes[x]) == None { From c823d98cb351a10e97bd3e33b31c94a5c19be301 Mon Sep 17 00:00:00 2001 From: Garrett Parzych Date: Wed, 11 Jun 2025 09:11:19 -0700 Subject: [PATCH 09/32] Add some optimizations --- src/assembly.rs | 43 +++++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 2a2a0173..ea999ca8 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -314,7 +314,7 @@ fn recurse_clique_index_search(mol: &Molecule, } } subgraph_clone.intersect_with(&neighbors); - // subgraph_clone = kernelize(&matches_graph, subgraph_clone); + subgraph_clone = kernelize(&matches_graph, subgraph_clone, nodes); cx = cx.min(recurse_clique_index_search( mol, @@ -335,51 +335,48 @@ fn recurse_clique_index_search(mol: &Molecule, cx } -fn kernelize(g: &Graph<(usize, usize), (), Undirected>, mut subgraph: BitSet) -> BitSet { - let nodes: Vec = g.node_indices().collect(); - let mut count = 0; +fn kernelize(g: &Graph<(usize, usize), (), Undirected>, mut subgraph: BitSet, nodes: &Vec) -> BitSet { + //let mut count = 0; - for (idx, v) in nodes.iter().enumerate() { - let v_weight = g.node_weight(*v); - let v_weight = v_weight.unwrap(); - let v_val = v_weight.0; - let v_rank = v_weight.1; + let subgraph_copy = subgraph.clone(); + for v_rank in subgraph_copy.iter() { if !subgraph.contains(v_rank) { continue; } - let neighbors_v: BTreeSet = g.neighbors(*v) + let v_index = nodes[v_rank]; + let v_val = g.node_weight(v_index).unwrap().0; + let neighbors_v: BTreeSet = g.neighbors(v_index) .map(|x| g.node_weight(x).unwrap().1) .filter(|x| subgraph.contains(*x)) .collect(); - for u in &nodes[idx+1..] { - let u_weight = g.node_weight(*u); - let u_weight = u_weight.unwrap(); - let u_val = u_weight.0; - let u_rank = u_weight.1; + for u_rank in subgraph_copy.iter().filter(|x| *x > v_rank) { if !subgraph.contains(u_rank) { continue; } - let neighbors_u: BTreeSet = g.neighbors(*u) + let u_index = nodes[u_rank]; + let u_val = g.node_weight(u_index).unwrap().0; + let neighbors_u: BTreeSet = g.neighbors(u_index) .map(|x| g.node_weight(x).unwrap().1) .filter(|x| subgraph.contains(*x)) .collect(); if neighbors_u.is_subset(&neighbors_v) && u_val <= v_val { - count += 1; + //count += 1; subgraph.remove(u_rank); } else if neighbors_v.is_subset(&neighbors_u) && v_val <= u_val { - count += 1; + //count += 1; subgraph.remove(v_rank); + break; } } } - // println!("Reduce count: {}", count); + //println!("Reduce count: {}", count); subgraph } @@ -420,13 +417,7 @@ pub fn clique_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize init.extend(mol.graph().edge_indices().map(|ix| ix.index())); let edge_count = mol.graph().edge_count(); - subgraph = kernelize(&matches_graph, subgraph); - - for x in &subgraph { - if matches_graph.node_weight(nodes[x]) == None { - println!("!: {}", x); - } - } + subgraph = kernelize(&matches_graph, subgraph, &nodes); let mut total_search = 0; From 8e248584b4245e5eb0de2eaf85be977905950a13 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Wed, 11 Jun 2025 18:05:54 -0700 Subject: [PATCH 10/32] Add new bound --- src/assembly.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index ea999ca8..d364c78f 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -259,6 +259,12 @@ fn recurse_clique_index_search(mol: &Molecule, if exceeds { return ix; } + if subgraph.len() <= ix - best { + let sum: usize = subgraph.iter().map(|rank| matches_graph.node_weight(nodes[rank]).unwrap().0).sum(); + if ix as i32 - sum as i32 >= best as i32 { + return ix; + } + } } // Search for duplicatable fragment @@ -314,7 +320,7 @@ fn recurse_clique_index_search(mol: &Molecule, } } subgraph_clone.intersect_with(&neighbors); - subgraph_clone = kernelize(&matches_graph, subgraph_clone, nodes); + //subgraph_clone = kernelize(&matches_graph, subgraph_clone, nodes); cx = cx.min(recurse_clique_index_search( mol, @@ -403,8 +409,8 @@ pub fn clique_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize let compatible = { h2.is_disjoint(h2p) && (h1.is_disjoint(h1p) || h1.is_subset(h1p) || h1.is_superset(h1p)) && - (h1.is_disjoint(h2p) || h1.is_superset(h2p)) && - (h1p.is_disjoint(h2) || h1p.is_superset(h2)) + (h1p.is_disjoint(h2) || h1p.is_superset(h2)) && + (h1.is_disjoint(h2p) || h1.is_superset(h2p)) }; if compatible { From 63f6cfe8fd1ef7ed04f9260f2c8f71d1fbf794ff Mon Sep 17 00:00:00 2001 From: Garrett Parzych Date: Fri, 13 Jun 2025 15:51:41 -0700 Subject: [PATCH 11/32] Change graph data structure --- src/assembly.rs | 363 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 251 insertions(+), 112 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index d364c78f..e443d5ab 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -16,12 +16,14 @@ //! # } //! ``` use std::{ - char::UNICODE_VERSION, collections::BTreeSet, sync::{ + char::UNICODE_VERSION, collections::BTreeSet, ops::BitAnd, sync::{ atomic::{AtomicUsize, Ordering::Relaxed}, Arc, } }; +use std::time::Instant; + use bit_set::BitSet; use petgraph::{graph::NodeIndex, Graph, Undirected}; use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator}; @@ -56,6 +58,66 @@ pub enum Bound { VecChainSmallFrags, } +#[derive(Debug)] +struct CGraph { + graph: Vec, + weights: Vec, + matches: Vec<(BitSet, BitSet)>, +} + +impl CGraph { + pub fn new(mut init_matches: Vec<(BitSet, BitSet)>) -> Self{ + let size = init_matches.len(); + init_matches.sort_by(|e1, e2| e2.0.len().cmp(&e1.0.len())); + + // Initialize weights and empty graph + let mut init_graph: Vec = Vec::with_capacity(size); + let mut init_weights: Vec = Vec::with_capacity(size); + for m in init_matches.iter() { + init_graph.push(BitSet::with_capacity(size)); + init_weights.push(m.0.len()); + } + + // Populate graph + for (idx1, (h1, h2)) in init_matches.iter().enumerate() { + for (idx2, (h1p, h2p)) in init_matches[idx1 + 1..].iter().enumerate() { + let idx2 = idx2 + idx1 + 1; + + let compatible = { + h2.is_disjoint(h2p) && + (h1.is_disjoint(h1p) || h1.is_subset(h1p) || h1.is_superset(h1p)) && + (h1p.is_disjoint(h2) || h1p.is_superset(h2)) && + (h1.is_disjoint(h2p) || h1.is_superset(h2p)) + }; + + if compatible { + init_graph[idx1].insert(idx2); + init_graph[idx2].insert(idx1); + } + } + } + + Self { + graph: init_graph, + weights: init_weights, + matches: init_matches, + } + } + + // Should probably return Option<> instead + pub fn adjacent_to(&self, v: usize) -> &BitSet { + &self.graph[v] + } + + pub fn remaining_weight_bound(&self, subgraph: &BitSet) -> usize { + subgraph.iter().map(|i| self.weights[i]).sum() + } + + pub fn get_match(&self, v: usize) -> &(BitSet, BitSet) { + &self.matches[v] + } +} + pub fn naive_assembly_depth(mol: &Molecule) -> u32 { let mut ix = u32::MAX; for (left, right) in mol.partitions().unwrap() { @@ -154,6 +216,7 @@ fn recurse_index_search( mut best: usize, bounds: &[Bound], states_searched: &mut usize, + last_removed: i32 ) -> usize { let mut cx = ix; @@ -176,6 +239,8 @@ fn recurse_index_search( // Search for duplicatable fragment for (i, (h1, h2)) in matches.iter().enumerate() { + let i = i as i32; + if i <= last_removed {continue;} let mut fractures = fragments.to_owned(); let f1 = fragments.iter().enumerate().find(|(_, c)| h1.is_subset(c)); let f2 = fragments.iter().enumerate().find(|(_, c)| h2.is_subset(c)); @@ -216,13 +281,14 @@ fn recurse_index_search( cx = cx.min(recurse_index_search( mol, - &matches[i + 1..], + &matches, &fractures, ix - h1.len() + 1, largest_remove, best, bounds, states_searched, + i )); best = best.min(cx); } @@ -230,8 +296,107 @@ fn recurse_index_search( cx } + +fn recurse_clique_index_search_with_start(mol: &Molecule, + fragments: &[BitSet], + ix: usize, + largest_remove: usize, + mut best: usize, + bounds: &[Bound], + states_searched: &mut usize, + mut subgraph: BitSet, + matches_graph: &CGraph, + depth: usize +) -> usize { + let mut cx = ix; + + // Search for duplicatable fragment + for v in subgraph.clone().iter() { + if !subgraph.contains(v) { + continue; + } + + let (h1, h2) = matches_graph.get_match(v); + + let mut fractures = fragments.to_owned(); + let f1 = fragments.iter().enumerate().find(|(_, c)| h1.is_subset(c)); + let f2 = fragments.iter().enumerate().find(|(_, c)| h2.is_subset(c)); + + let largest_remove = h1.len(); + + let (Some((i1, f1)), Some((i2, f2))) = (f1, f2) else { + continue; + }; + + // All of these clones are on bitsets and cheap enough + if i1 == i2 { + let mut union = h1.clone(); + union.union_with(h2); + let mut difference = f1.clone(); + difference.difference_with(&union); + let c = connected_components_under_edges(mol.graph(), &difference); + fractures.extend(c); + fractures.swap_remove(i1); + } else { + let mut f1r = f1.clone(); + f1r.difference_with(h1); + let mut f2r = f2.clone(); + f2r.difference_with(h2); + + let c1 = connected_components_under_edges(mol.graph(), &f1r); + let c2 = connected_components_under_edges(mol.graph(), &f2r); + + fractures.extend(c1); + fractures.extend(c2); + + fractures.swap_remove(i1.max(i2)); + fractures.swap_remove(i1.min(i2)); + } + + fractures.retain(|i| i.len() > 1); + fractures.push(h1.clone()); + + let mut subgraph_clone = subgraph.clone(); + let neighbors = matches_graph.adjacent_to(v); + subgraph_clone.intersect_with(&neighbors); + + cx = cx.min(recurse_clique_index_search( + mol, + &fractures, + ix - h1.len() + 1, + largest_remove, + best, + bounds, + states_searched, + subgraph_clone, + matches_graph, + depth + 1, + )); + if cx < best { + best = cx; + // kernelize + for v in subgraph.clone().iter() { + let mut neighbor_sum = 0; + for u in matches_graph.adjacent_to(v) { + if subgraph.contains(u) { + neighbor_sum += matches_graph.weights[u]; + } + } + + if ix >= best + neighbor_sum { + subgraph.remove(v); + } + } + } + + subgraph.remove(v); + } + + cx +} + + fn recurse_clique_index_search(mol: &Molecule, - matches: &Vec<(BitSet, BitSet)>, fragments: &[BitSet], ix: usize, largest_remove: usize, @@ -239,8 +404,8 @@ fn recurse_clique_index_search(mol: &Molecule, bounds: &[Bound], states_searched: &mut usize, subgraph: BitSet, - nodes: &Vec, - matches_graph: &Graph<(usize, usize), (), Undirected>, + matches_graph: &CGraph, + depth: usize, ) -> usize { let mut cx = ix; @@ -259,18 +424,18 @@ fn recurse_clique_index_search(mol: &Molecule, if exceeds { return ix; } - if subgraph.len() <= ix - best { - let sum: usize = subgraph.iter().map(|rank| matches_graph.node_weight(nodes[rank]).unwrap().0).sum(); - if ix as i32 - sum as i32 >= best as i32 { + if subgraph.iter().count() <= ix - best { + if ix >= best + matches_graph.remaining_weight_bound(&subgraph) { return ix; } } } + let mut to_remove = BitSet::with_capacity(subgraph.capacity()); + // Search for duplicatable fragment - for x in subgraph.iter() { - let v = nodes[x]; - let (h1, h2) = &matches[x]; + for v in subgraph.iter() { + let (h1, h2) = matches_graph.get_match(v); let mut fractures = fragments.to_owned(); let f1 = fragments.iter().enumerate().find(|(_, c)| h1.is_subset(c)); @@ -311,20 +476,16 @@ fn recurse_clique_index_search(mol: &Molecule, fractures.push(h1.clone()); let mut subgraph_clone = subgraph.clone(); - let mut neighbors = BitSet::new(); - let weight_v = matches_graph.node_weight(v).unwrap().1; - for u in matches_graph.neighbors(v) { - let weight_u = matches_graph.node_weight(u).unwrap().1; - if weight_u >= weight_v { - neighbors.insert(matches_graph.node_weight(u).unwrap().1); - } - } + let neighbors = matches_graph.adjacent_to(v).clone(); subgraph_clone.intersect_with(&neighbors); - //subgraph_clone = kernelize(&matches_graph, subgraph_clone, nodes); + subgraph_clone.difference_with(&to_remove); + //print!("Depth: {} ", depth); + if depth == 1 { + subgraph_clone = kernelize(matches_graph, subgraph_clone); + } cx = cx.min(recurse_clique_index_search( mol, - matches, &fractures, ix - h1.len() + 1, largest_remove, @@ -332,51 +493,91 @@ fn recurse_clique_index_search(mol: &Molecule, bounds, states_searched, subgraph_clone, - nodes, matches_graph, + depth + 1, )); best = best.min(cx); + + to_remove.insert(v); } cx } -fn kernelize(g: &Graph<(usize, usize), (), Undirected>, mut subgraph: BitSet, nodes: &Vec) -> BitSet { +pub fn clique_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize) { + // Graph Initialization + let matches: Vec<(BitSet, BitSet)> = mol.matches().collect(); + let num_matches = matches.len(); + let matches_graph = CGraph::new(matches); + let mut subgraph = BitSet::with_capacity(num_matches); + for i in 0..num_matches { + subgraph.insert(i); + } + + // Kernelization + //let start = Instant::now(); + subgraph = kernelize(&matches_graph, subgraph); + //let dur = start.elapsed(); + //println!("Kernel Time: {:?}", dur); + + // Search + let mut total_search = 0; + let mut init = BitSet::new(); + init.extend(mol.graph().edge_indices().map(|ix| ix.index())); + let edge_count = mol.graph().edge_count(); + + //let start = Instant::now(); + + let index = recurse_clique_index_search( + mol, + &[init], + edge_count - 1, + edge_count, + edge_count - 1, + bounds, + &mut total_search, + subgraph.clone(), + &matches_graph, + 1); + + //let dur = start.elapsed(); + //println!("Search Time: {:?}", dur); + + (index as u32, num_matches as u32, total_search) +} + +fn kernelize(g: &CGraph, mut subgraph: BitSet) -> BitSet { //let mut count = 0; let subgraph_copy = subgraph.clone(); - for v_rank in subgraph_copy.iter() { - if !subgraph.contains(v_rank) { + for v in subgraph_copy.iter() { + if !subgraph.contains(v) { continue; } - let v_index = nodes[v_rank]; - let v_val = g.node_weight(v_index).unwrap().0; - let neighbors_v: BTreeSet = g.neighbors(v_index) - .map(|x| g.node_weight(x).unwrap().1) - .filter(|x| subgraph.contains(*x)) - .collect(); - - for u_rank in subgraph_copy.iter().filter(|x| *x > v_rank) { - if !subgraph.contains(u_rank) { + + let v_val = g.weights[v]; + let mut neighbors_v= g.adjacent_to(v).clone(); + neighbors_v.intersect_with(&subgraph); + + for u in subgraph_copy.iter().filter(|x| x > &v) { + if !subgraph.contains(u) { continue; } - let u_index = nodes[u_rank]; - let u_val = g.node_weight(u_index).unwrap().0; - let neighbors_u: BTreeSet = g.neighbors(u_index) - .map(|x| g.node_weight(x).unwrap().1) - .filter(|x| subgraph.contains(*x)) - .collect(); - - if neighbors_u.is_subset(&neighbors_v) && u_val <= v_val { + + let u_val = g.weights[u]; + let mut neighbors_u = g.adjacent_to(u).clone(); + neighbors_u.intersect_with(&subgraph); + + if u_val <= v_val && neighbors_u.is_subset(&neighbors_v) { //count += 1; - subgraph.remove(u_rank); + subgraph.remove(u); } - else if neighbors_v.is_subset(&neighbors_u) && v_val <= u_val { + else if v_val <= u_val && neighbors_v.is_subset(&neighbors_u) { //count += 1; - subgraph.remove(v_rank); + subgraph.remove(v); break; } } @@ -386,69 +587,6 @@ fn kernelize(g: &Graph<(usize, usize), (), Undirected>, mut subgraph: BitSet, no subgraph } -pub fn clique_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize) { - let mut matches: Vec<(BitSet, BitSet)> = mol.matches().collect(); - matches.sort_by(|e1, e2| e2.0.len().cmp(&e1.0.len())); - - let mut matches_graph = Graph::<(usize, usize), _, Undirected>::with_capacity(matches.len(), matches.len()); - let mut nodes: Vec = Vec::new(); - - let mut subgraph = BitSet::new(); - - // Create vertices of matches_graph - for (idx, (h1, _)) in matches.iter().enumerate() { - let v = matches_graph.add_node((h1.len() - 1, idx)); - nodes.push(v); - subgraph.insert(idx); - } - - // Create edges of matches_graph - // two matches are connected if they are compatible with each other - for (idx1, (h1, h2)) in matches.iter().enumerate() { - for (idx2, (h1p, h2p)) in matches[idx1 + 1..].iter().enumerate() { - let compatible = { - h2.is_disjoint(h2p) && - (h1.is_disjoint(h1p) || h1.is_subset(h1p) || h1.is_superset(h1p)) && - (h1p.is_disjoint(h2) || h1p.is_superset(h2)) && - (h1.is_disjoint(h2p) || h1.is_superset(h2p)) - }; - - if compatible { - matches_graph.add_edge(nodes[idx1], nodes[idx1 + idx2 + 1], ()); - } - } - } - - let mut init = BitSet::new(); - init.extend(mol.graph().edge_indices().map(|ix| ix.index())); - let edge_count = mol.graph().edge_count(); - - subgraph = kernelize(&matches_graph, subgraph, &nodes); - - let mut total_search = 0; - - use std::time::Instant; - let start = Instant::now(); - - let index = recurse_clique_index_search( - mol, - &matches, - &[init], - edge_count - 1, - edge_count, - edge_count - 1, - bounds, - &mut total_search, - subgraph, - &nodes, - &matches_graph); - - let dur = start.elapsed(); - println!("Time: {:?}", dur); - - (index as u32, matches.len() as u32, total_search) -} - #[allow(clippy::too_many_arguments)] fn parallel_recurse_index_search( mol: &Molecule, @@ -609,6 +747,7 @@ pub fn index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize) { edge_count - 1, bounds, &mut total_search, + -1 ); (index as u32, total_search) }; @@ -652,8 +791,7 @@ pub fn serial_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize let edge_count = mol.graph().edge_count(); let mut total_search = 0; - use std::time::Instant; - let start = Instant::now(); + //let start = Instant::now(); let index = recurse_index_search( mol, @@ -664,10 +802,11 @@ pub fn serial_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize edge_count - 1, bounds, &mut total_search, + -1 ); - let dur = start.elapsed(); - println!("Time: {:?}", dur); + //let dur = start.elapsed(); + //println!("Time: {:?}", dur); (index as u32, matches.len() as u32, total_search) } From f517c6a10191df1710f710d2289d29ce7da1ff98 Mon Sep 17 00:00:00 2001 From: Garrett Parzych Date: Sat, 14 Jun 2025 00:34:28 -0700 Subject: [PATCH 12/32] Add color bound --- src/assembly.rs | 54 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 49 insertions(+), 5 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index e443d5ab..c2ad685d 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -63,6 +63,7 @@ struct CGraph { graph: Vec, weights: Vec, matches: Vec<(BitSet, BitSet)>, + colors: Option>>, } impl CGraph { @@ -101,6 +102,7 @@ impl CGraph { graph: init_graph, weights: init_weights, matches: init_matches, + colors: None, } } @@ -116,6 +118,47 @@ impl CGraph { pub fn get_match(&self, v: usize) -> &(BitSet, BitSet) { &self.matches[v] } + + pub fn color_bound(&self, subgraph: &BitSet) -> usize{ + // Greedy coloring + let mut colors: Vec = vec![0; self.matches.len()]; + let mut num_colors = 0; + let mut largest: Vec = Vec::new(); + + + for v in (0..self.matches.len()).rev() { + if !subgraph.contains(v) { + continue; + } + + let mut used: Vec = vec![0; num_colors + 1]; + + for u in subgraph.intersection(&self.graph[v]) { + if colors[u] != 0 { + used[colors[u] - 1] = 1; + } + } + + let mut k = 0; + while used[k] != 0 { + k += 1; + } + + if k == num_colors { + num_colors += 1; + largest.push(0); + } + if self.weights[v] > largest[k] { + largest[k] = self.weights[v] + } + + colors[v] = k + 1; + } + + largest.iter().sum::() + //println!("{:?}", self.graph.iter().map(|x| x.len()).collect::>()); + //println!("{:?}", colors); + } } pub fn naive_assembly_depth(mol: &Molecule) -> u32 { @@ -424,11 +467,12 @@ fn recurse_clique_index_search(mol: &Molecule, if exceeds { return ix; } - if subgraph.iter().count() <= ix - best { - if ix >= best + matches_graph.remaining_weight_bound(&subgraph) { - return ix; - } - } + } + if ix >= best + subgraph.iter().count() && ix >= best + matches_graph.remaining_weight_bound(&subgraph) { + return ix; + } + if ix >= best + matches_graph.color_bound(&subgraph) { + return ix; } let mut to_remove = BitSet::with_capacity(subgraph.capacity()); From b1e03160d6f81527d6387e0dbe9750ad5c340f47 Mon Sep 17 00:00:00 2001 From: Garrett Parzych Date: Mon, 16 Jun 2025 22:17:05 -0700 Subject: [PATCH 13/32] Add faster kernelization --- src/assembly.rs | 69 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 59 insertions(+), 10 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index cfac47ae..90d766b1 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -63,7 +63,6 @@ struct CGraph { graph: Vec, weights: Vec, matches: Vec<(BitSet, BitSet)>, - colors: Option>>, } impl CGraph { @@ -76,7 +75,7 @@ impl CGraph { let mut init_weights: Vec = Vec::with_capacity(size); for m in init_matches.iter() { init_graph.push(BitSet::with_capacity(size)); - init_weights.push(m.0.len()); + init_weights.push(m.0.len() - 1); } // Populate graph @@ -102,10 +101,21 @@ impl CGraph { graph: init_graph, weights: init_weights, matches: init_matches, - colors: None, } } + pub fn len(&self) -> usize { + self.matches.len() + } + + pub fn degree(&self, v: usize, subgraph: &BitSet) -> usize { + self.graph[v].intersection(subgraph).count() + } + + pub fn degree_dist(&self) -> Vec { + (0..self.len()).map(|v| self.graph[v].iter().count()).collect() + } + // Should probably return Option<> instead pub fn adjacent_to(&self, v: usize) -> &BitSet { &self.graph[v] @@ -117,7 +127,7 @@ impl CGraph { pub fn get_match(&self, v: usize) -> &(BitSet, BitSet) { &self.matches[v] - } + } pub fn color_bound(&self, subgraph: &BitSet) -> usize{ // Greedy coloring @@ -475,7 +485,7 @@ fn recurse_clique_index_search(mol: &Molecule, return ix; } - let mut to_remove = BitSet::with_capacity(subgraph.capacity()); + let mut to_remove = BitSet::with_capacity(matches_graph.len()); // Search for duplicatable fragment for v in subgraph.iter() { @@ -591,9 +601,9 @@ pub fn clique_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize } fn kernelize(g: &CGraph, mut subgraph: BitSet) -> BitSet { - //let mut count = 0; - + let mut count = 0; let subgraph_copy = subgraph.clone(); + for v in subgraph_copy.iter() { if !subgraph.contains(v) { continue; @@ -613,21 +623,60 @@ fn kernelize(g: &CGraph, mut subgraph: BitSet) -> BitSet { neighbors_u.intersect_with(&subgraph); if u_val <= v_val && neighbors_u.is_subset(&neighbors_v) { - //count += 1; + count += 1; subgraph.remove(u); } else if v_val <= u_val && neighbors_v.is_subset(&neighbors_u) { - //count += 1; + count += 1; + + subgraph.remove(v); + break; + } + } + } + + println!("Reduce count: {}", count); + subgraph +} + +fn kernelize_fast(g: &CGraph, mut subgraph: BitSet) -> BitSet { + let mut count = 0; + let subgraph_copy = subgraph.clone(); + + for v in subgraph_copy.iter() { + let v_val = g.weights[v]; + let mut neighbors_v = g.adjacent_to(v).clone(); + neighbors_v.intersect_with(&subgraph); + + let Some(w) = neighbors_v.iter().next() else { + count += 1; + subgraph.remove(v); + continue; + }; + + for u in g.adjacent_to(w).intersection(&subgraph) { + if g.adjacent_to(v).contains(u) || u == v { + continue; + } + + let u_val = g.weights[u]; + if v_val > u_val { + continue; + } + let mut neighbors_u = g.adjacent_to(u).clone(); + neighbors_u.intersect_with(&subgraph); + if neighbors_v.is_subset(&neighbors_u) { + count += 1; subgraph.remove(v); break; } } } - //println!("Reduce count: {}", count); + println!("Reduce count: {}", count); subgraph } From 2f7667cdae1e19255149fa498d367504fff4a8a9 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Tue, 17 Jun 2025 12:17:09 -0700 Subject: [PATCH 14/32] Fix matches_graph is now directed --- src/assembly.rs | 48 +++++++++++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 90d766b1..bc50394d 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -83,16 +83,25 @@ impl CGraph { for (idx2, (h1p, h2p)) in init_matches[idx1 + 1..].iter().enumerate() { let idx2 = idx2 + idx1 + 1; - let compatible = { - h2.is_disjoint(h2p) && - (h1.is_disjoint(h1p) || h1.is_subset(h1p) || h1.is_superset(h1p)) && - (h1p.is_disjoint(h2) || h1p.is_superset(h2)) && + let forward_compatible = { + h2.is_disjoint(h1p) && + h2.is_disjoint(h2p) && + (h1.is_disjoint(h1p) || h1.is_superset(h1p)) && (h1.is_disjoint(h2p) || h1.is_superset(h2p)) }; - if compatible { + if forward_compatible { init_graph[idx1].insert(idx2); - init_graph[idx2].insert(idx1); + + let backward_compatible = { + h2p.is_disjoint(h1) && + (h1p.is_disjoint(h1) || h1p.is_superset(h1)) && + (h1p.is_disjoint(h2) || h1p.is_superset(h2)) + }; + + if backward_compatible { + init_graph[idx2].insert(idx1); + } } } } @@ -534,9 +543,9 @@ fn recurse_clique_index_search(mol: &Molecule, subgraph_clone.intersect_with(&neighbors); subgraph_clone.difference_with(&to_remove); //print!("Depth: {} ", depth); - if depth == 1 { + /*if depth == 1 { subgraph_clone = kernelize(matches_graph, subgraph_clone); - } + }*/ cx = cx.min(recurse_clique_index_search( mol, @@ -562,17 +571,22 @@ pub fn clique_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize // Graph Initialization let matches: Vec<(BitSet, BitSet)> = mol.matches().collect(); let num_matches = matches.len(); + + let start = Instant::now(); let matches_graph = CGraph::new(matches); + let dur = start.elapsed(); + println!("Graph Time: {:?}", dur); + let mut subgraph = BitSet::with_capacity(num_matches); for i in 0..num_matches { subgraph.insert(i); } // Kernelization - //let start = Instant::now(); + let start = Instant::now(); subgraph = kernelize(&matches_graph, subgraph); - //let dur = start.elapsed(); - //println!("Kernel Time: {:?}", dur); + let dur = start.elapsed(); + println!("Kernel Time: {:?}", dur); // Search let mut total_search = 0; @@ -580,7 +594,7 @@ pub fn clique_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize init.extend(mol.graph().edge_indices().map(|ix| ix.index())); let edge_count = mol.graph().edge_count(); - //let start = Instant::now(); + let start = Instant::now(); let index = recurse_clique_index_search( mol, @@ -594,8 +608,8 @@ pub fn clique_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize &matches_graph, 1); - //let dur = start.elapsed(); - //println!("Search Time: {:?}", dur); + let dur = start.elapsed(); + println!("Search Time: {:?}", dur); (index as u32, num_matches as u32, total_search) } @@ -884,7 +898,7 @@ pub fn serial_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize let edge_count = mol.graph().edge_count(); let mut total_search = 0; - //let start = Instant::now(); + let start = Instant::now(); let index = recurse_index_search( mol, @@ -898,8 +912,8 @@ pub fn serial_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize -1 ); - //let dur = start.elapsed(); - //println!("Time: {:?}", dur); + let dur = start.elapsed(); + println!("Search Time: {:?}", dur); (index as u32, matches.len() as u32, total_search) } From b964a37cbfe23d981662c71780ff735684b6cd06 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Tue, 17 Jun 2025 13:29:09 -0700 Subject: [PATCH 15/32] Fix fast kernelization and make it default --- src/assembly.rs | 75 ++++++++++++------------------------------------- 1 file changed, 18 insertions(+), 57 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index bc50394d..6608ca36 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -125,9 +125,16 @@ impl CGraph { (0..self.len()).map(|v| self.graph[v].iter().count()).collect() } - // Should probably return Option<> instead - pub fn adjacent_to(&self, v: usize) -> &BitSet { - &self.graph[v] + // Returns a safely mutable set of out neighbors of v in subgraph. + pub fn out_neighbors(&self, v: usize, subgraph: &BitSet) -> BitSet { + let mut neighbors = self.graph[v].clone(); + neighbors.intersect_with(subgraph); + + neighbors + } + + pub fn are_adjacent(&self, v: usize, u: usize) -> bool { + self.graph[v].contains(u) } pub fn remaining_weight_bound(&self, subgraph: &BitSet) -> usize { @@ -418,9 +425,7 @@ fn recurse_clique_index_search_with_start(mol: &Molecule, fractures.retain(|i| i.len() > 1); fractures.push(h1.clone()); - let mut subgraph_clone = subgraph.clone(); - let neighbors = matches_graph.adjacent_to(v); - subgraph_clone.intersect_with(&neighbors); + let mut subgraph_clone = matches_graph.out_neighbors(v, &subgraph); cx = cx.min(recurse_clique_index_search( mol, @@ -439,7 +444,7 @@ fn recurse_clique_index_search_with_start(mol: &Molecule, // kernelize for v in subgraph.clone().iter() { let mut neighbor_sum = 0; - for u in matches_graph.adjacent_to(v) { + for u in matches_graph.graph[v].iter() { if subgraph.contains(u) { neighbor_sum += matches_graph.weights[u]; } @@ -538,9 +543,7 @@ fn recurse_clique_index_search(mol: &Molecule, fractures.retain(|i| i.len() > 1); fractures.push(h1.clone()); - let mut subgraph_clone = subgraph.clone(); - let neighbors = matches_graph.adjacent_to(v).clone(); - subgraph_clone.intersect_with(&neighbors); + let mut subgraph_clone = matches_graph.out_neighbors(v, &subgraph); subgraph_clone.difference_with(&to_remove); //print!("Depth: {} ", depth); /*if depth == 1 { @@ -619,50 +622,8 @@ fn kernelize(g: &CGraph, mut subgraph: BitSet) -> BitSet { let subgraph_copy = subgraph.clone(); for v in subgraph_copy.iter() { - if !subgraph.contains(v) { - continue; - } - let v_val = g.weights[v]; - let mut neighbors_v= g.adjacent_to(v).clone(); - neighbors_v.intersect_with(&subgraph); - - for u in subgraph_copy.iter().filter(|x| x > &v) { - if !subgraph.contains(u) { - continue; - } - - let u_val = g.weights[u]; - let mut neighbors_u = g.adjacent_to(u).clone(); - neighbors_u.intersect_with(&subgraph); - - if u_val <= v_val && neighbors_u.is_subset(&neighbors_v) { - count += 1; - - subgraph.remove(u); - - } - else if v_val <= u_val && neighbors_v.is_subset(&neighbors_u) { - count += 1; - - subgraph.remove(v); - break; - } - } - } - - println!("Reduce count: {}", count); - subgraph -} - -fn kernelize_fast(g: &CGraph, mut subgraph: BitSet) -> BitSet { - let mut count = 0; - let subgraph_copy = subgraph.clone(); - - for v in subgraph_copy.iter() { - let v_val = g.weights[v]; - let mut neighbors_v = g.adjacent_to(v).clone(); - neighbors_v.intersect_with(&subgraph); + let neighbors_v = g.out_neighbors(v, &subgraph); let Some(w) = neighbors_v.iter().next() else { count += 1; @@ -670,8 +631,8 @@ fn kernelize_fast(g: &CGraph, mut subgraph: BitSet) -> BitSet { continue; }; - for u in g.adjacent_to(w).intersection(&subgraph) { - if g.adjacent_to(v).contains(u) || u == v { + for u in subgraph.iter() { + if !g.are_adjacent(u, w) || !subgraph.contains(u) || g.are_adjacent(v, u) || g.are_adjacent(u, v) || u == v { continue; } @@ -679,8 +640,8 @@ fn kernelize_fast(g: &CGraph, mut subgraph: BitSet) -> BitSet { if v_val > u_val { continue; } - let mut neighbors_u = g.adjacent_to(u).clone(); - neighbors_u.intersect_with(&subgraph); + + let neighbors_u = g.out_neighbors(u, &subgraph); if neighbors_v.is_subset(&neighbors_u) { count += 1; From f77da544d351b938f877c344091157bfb90bdd48 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Wed, 18 Jun 2025 14:11:48 -0700 Subject: [PATCH 16/32] Add new color bound --- src/assembly.rs | 82 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 70 insertions(+), 12 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 6608ca36..a10c6717 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -133,6 +133,25 @@ impl CGraph { neighbors } + pub fn forward_neighbors(&self, v: usize, subgraph: &BitSet) -> BitSet { + let mut neighbors = self.graph[v].clone(); + neighbors.intersect_with(subgraph); + let mut to_remove = vec![]; + for u in neighbors.iter() { + if u <= v { + to_remove.push(u); + } + if u > v { + break; + } + } + for u in to_remove { + neighbors.remove(u); + } + + neighbors + } + pub fn are_adjacent(&self, v: usize, u: usize) -> bool { self.graph[v].contains(u) } @@ -180,10 +199,57 @@ impl CGraph { colors[v] = k + 1; } + //print!("Bounds: {}, ", largest.iter().sum::()); + //println!("{:?}, ", self.graph.iter().map(|x| x.len()).collect::>()); + //println!("{:?}", colors); largest.iter().sum::() + } + + pub fn color_bound_improved(&self, subgraph: &BitSet) -> usize{ + // Greedy coloring + let mut colors: Vec = vec![0; self.matches.len()]; + let mut num_colors = 0; + let mut largest: Vec = Vec::new(); + + + for v in (0..self.matches.len()).rev() { + if !subgraph.contains(v) { + continue; + } + + let mut used: Vec = vec![0; num_colors]; + + for u in subgraph.intersection(&self.graph[v]) { + if colors[u] != 0 { + used[colors[u] - 1] = 1; + } + } + + let mut max = 0; + let mut max_idx = num_colors; + for i in 0..num_colors { + if used[i] == 0 && largest[i] > max { + max = largest[i]; + max_idx = i; + } + } + + if max_idx == num_colors { + num_colors += 1; + largest.push(0); + } + if self.weights[v] > largest[max_idx] { + largest[max_idx] = self.weights[v] + } + + colors[v] = max_idx + 1; + } + //println!("{} ", largest.iter().sum::()); //println!("{:?}", self.graph.iter().map(|x| x.len()).collect::>()); //println!("{:?}", colors); + + largest.iter().sum::() } } @@ -425,7 +491,7 @@ fn recurse_clique_index_search_with_start(mol: &Molecule, fractures.retain(|i| i.len() > 1); fractures.push(h1.clone()); - let mut subgraph_clone = matches_graph.out_neighbors(v, &subgraph); + let subgraph_clone = matches_graph.out_neighbors(v, &subgraph); cx = cx.min(recurse_clique_index_search( mol, @@ -499,8 +565,6 @@ fn recurse_clique_index_search(mol: &Molecule, return ix; } - let mut to_remove = BitSet::with_capacity(matches_graph.len()); - // Search for duplicatable fragment for v in subgraph.iter() { let (h1, h2) = matches_graph.get_match(v); @@ -543,9 +607,7 @@ fn recurse_clique_index_search(mol: &Molecule, fractures.retain(|i| i.len() > 1); fractures.push(h1.clone()); - let mut subgraph_clone = matches_graph.out_neighbors(v, &subgraph); - subgraph_clone.difference_with(&to_remove); - //print!("Depth: {} ", depth); + let mut subgraph_clone = matches_graph.forward_neighbors(v, &subgraph); /*if depth == 1 { subgraph_clone = kernelize(matches_graph, subgraph_clone); }*/ @@ -563,8 +625,6 @@ fn recurse_clique_index_search(mol: &Molecule, depth + 1, )); best = best.min(cx); - - to_remove.insert(v); } cx @@ -626,13 +686,11 @@ fn kernelize(g: &CGraph, mut subgraph: BitSet) -> BitSet { let neighbors_v = g.out_neighbors(v, &subgraph); let Some(w) = neighbors_v.iter().next() else { - count += 1; - subgraph.remove(v); continue; }; for u in subgraph.iter() { - if !g.are_adjacent(u, w) || !subgraph.contains(u) || g.are_adjacent(v, u) || g.are_adjacent(u, v) || u == v { + if !subgraph.contains(u) || !g.are_adjacent(u, w) || g.are_adjacent(v, u) || g.are_adjacent(u, v) || v == u { continue; } @@ -651,7 +709,7 @@ fn kernelize(g: &CGraph, mut subgraph: BitSet) -> BitSet { } } - println!("Reduce count: {}", count); + //println!("Reduce count: {}", count); subgraph } From b032ef1bd39dd6c414db3709aa68274d4989bee4 Mon Sep 17 00:00:00 2001 From: Garrett Parzych Date: Wed, 18 Jun 2025 18:10:50 -0700 Subject: [PATCH 17/32] Add cover bound --- src/assembly.rs | 86 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 74 insertions(+), 12 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index a10c6717..883eba55 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -16,7 +16,7 @@ //! # } //! ``` use std::{ - char::UNICODE_VERSION, collections::BTreeSet, ops::BitAnd, sync::{ + collections::BTreeSet, sync::{ atomic::{AtomicUsize, Ordering::Relaxed}, Arc, } @@ -25,7 +25,6 @@ use std::{ use std::time::Instant; use bit_set::BitSet; -use petgraph::{graph::NodeIndex, Graph, Undirected}; use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator}; use crate::{ @@ -166,7 +165,7 @@ impl CGraph { pub fn color_bound(&self, subgraph: &BitSet) -> usize{ // Greedy coloring - let mut colors: Vec = vec![0; self.matches.len()]; + let mut colors: Vec = vec![-1; self.len()]; let mut num_colors = 0; let mut largest: Vec = Vec::new(); @@ -179,8 +178,8 @@ impl CGraph { let mut used: Vec = vec![0; num_colors + 1]; for u in subgraph.intersection(&self.graph[v]) { - if colors[u] != 0 { - used[colors[u] - 1] = 1; + if colors[u] != -1 { + used[colors[u] as usize] = 1; } } @@ -194,10 +193,10 @@ impl CGraph { largest.push(0); } if self.weights[v] > largest[k] { - largest[k] = self.weights[v] + largest[k] = self.weights[v]; } - colors[v] = k + 1; + colors[v] = k as i32; } //print!("Bounds: {}, ", largest.iter().sum::()); //println!("{:?}, ", self.graph.iter().map(|x| x.len()).collect::>()); @@ -208,7 +207,7 @@ impl CGraph { pub fn color_bound_improved(&self, subgraph: &BitSet) -> usize{ // Greedy coloring - let mut colors: Vec = vec![0; self.matches.len()]; + let mut colors: Vec = vec![-1; self.len()]; let mut num_colors = 0; let mut largest: Vec = Vec::new(); @@ -221,8 +220,8 @@ impl CGraph { let mut used: Vec = vec![0; num_colors]; for u in subgraph.intersection(&self.graph[v]) { - if colors[u] != 0 { - used[colors[u] - 1] = 1; + if colors[u] != -1 { + used[colors[u] as usize] = 1; } } @@ -243,7 +242,7 @@ impl CGraph { largest[max_idx] = self.weights[v] } - colors[v] = max_idx + 1; + colors[v] = max_idx as i32; } //println!("{} ", largest.iter().sum::()); //println!("{:?}", self.graph.iter().map(|x| x.len()).collect::>()); @@ -251,6 +250,66 @@ impl CGraph { largest.iter().sum::() } + + pub fn cover_bound(&self, subgraph: &BitSet) -> usize { + // Greedy coloring + let mut colors: Vec>> = vec![None; self.len()]; + let mut col_weights = vec![]; + let mut num_col = 0; + + for v in (0..self.matches.len()).rev() { + if !subgraph.contains(v) { + continue; + } + + let mut v_col = Vec::new(); + let mut used = vec![0; num_col]; + + // Find colors used in neighborhood of v + for u in subgraph.intersection(&self.graph[v]) { + let Some(u_col) = &colors[u] else { + continue; + }; + + for c in u_col { + used[*c] = 1; + } + } + + let mut total_weight = 0; + let v_val = self.weights[v]; + // Find colors to give to v + for c in 0..num_col { + if used[c] == 1 { + continue; + } + + v_col.push(c); + total_weight += col_weights[c]; + + if total_weight >= v_val { + break; + } + } + + if total_weight == 0 { + v_col.push(num_col); + col_weights.push(self.weights[v]); + num_col += 1 + } + else if total_weight < self.weights[v] { + let mut k = num_col - 1; + while used[k] == 1 { + k -= 1 + } + col_weights[k] += self.weights[v] - total_weight + } + + colors[v] = Some(v_col); + }; + + col_weights.iter().sum() + } } pub fn naive_assembly_depth(mol: &Molecule) -> u32 { @@ -561,7 +620,10 @@ fn recurse_clique_index_search(mol: &Molecule, if ix >= best + subgraph.iter().count() && ix >= best + matches_graph.remaining_weight_bound(&subgraph) { return ix; } - if ix >= best + matches_graph.color_bound(&subgraph) { + /*if ix >= best + matches_graph.color_bound_improved(&subgraph) { + return ix; + }*/ + if ix >= best + matches_graph.cover_bound(&subgraph) { return ix; } From d8d8c7e9ed6c95a55c6e1c042ca5ea5617b7445a Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Thu, 19 Jun 2025 13:08:18 -0700 Subject: [PATCH 18/32] Fix kernelize bug --- src/assembly.rs | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 883eba55..1416562f 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -91,16 +91,7 @@ impl CGraph { if forward_compatible { init_graph[idx1].insert(idx2); - - let backward_compatible = { - h2p.is_disjoint(h1) && - (h1p.is_disjoint(h1) || h1p.is_superset(h1)) && - (h1p.is_disjoint(h2) || h1p.is_superset(h2)) - }; - - if backward_compatible { - init_graph[idx2].insert(idx1); - } + init_graph[idx2].insert(idx1); } } } @@ -124,8 +115,8 @@ impl CGraph { (0..self.len()).map(|v| self.graph[v].iter().count()).collect() } - // Returns a safely mutable set of out neighbors of v in subgraph. - pub fn out_neighbors(&self, v: usize, subgraph: &BitSet) -> BitSet { + // Returns a safely mutable set of neighbors of v in subgraph. + pub fn neighbors(&self, v: usize, subgraph: &BitSet) -> BitSet { let mut neighbors = self.graph[v].clone(); neighbors.intersect_with(subgraph); @@ -550,7 +541,7 @@ fn recurse_clique_index_search_with_start(mol: &Molecule, fractures.retain(|i| i.len() > 1); fractures.push(h1.clone()); - let subgraph_clone = matches_graph.out_neighbors(v, &subgraph); + let subgraph_clone = matches_graph.forward_neighbors(v, &subgraph); cx = cx.min(recurse_clique_index_search( mol, @@ -745,14 +736,14 @@ fn kernelize(g: &CGraph, mut subgraph: BitSet) -> BitSet { for v in subgraph_copy.iter() { let v_val = g.weights[v]; - let neighbors_v = g.out_neighbors(v, &subgraph); + let neighbors_v = g.neighbors(v, &subgraph); let Some(w) = neighbors_v.iter().next() else { continue; }; - for u in subgraph.iter() { - if !subgraph.contains(u) || !g.are_adjacent(u, w) || g.are_adjacent(v, u) || g.are_adjacent(u, v) || v == u { + for u in g.graph[w].intersection(&subgraph) { + if g.are_adjacent(v, u) || v == u { continue; } @@ -761,7 +752,7 @@ fn kernelize(g: &CGraph, mut subgraph: BitSet) -> BitSet { continue; } - let neighbors_u = g.out_neighbors(u, &subgraph); + let neighbors_u = g.neighbors(u, &subgraph); if neighbors_v.is_subset(&neighbors_u) { count += 1; From 60187cc0b456c855180abd38ddbb1c284d457553 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Mon, 23 Jun 2025 13:24:35 -0700 Subject: [PATCH 19/32] Add new kernelize and weight bound methods --- src/assembly.rs | 70 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 63 insertions(+), 7 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 1416562f..c80906d3 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -147,7 +147,19 @@ impl CGraph { } pub fn remaining_weight_bound(&self, subgraph: &BitSet) -> usize { - subgraph.iter().map(|i| self.weights[i]).sum() + subgraph.iter().map(|v| self.weights[v]).sum() + } + + pub fn remaining_weight_bound_new(&self, subgraph: &BitSet) -> usize { + let deg_sum = subgraph.iter().map(|v| self.degree(v, subgraph)).sum::() as f32; + let max_clique = ((1_f32 + (4_f32 * deg_sum + 1_f32).sqrt()) / 2_f32).floor() as usize; + let mut sum = 0; + let mut iter = subgraph.iter(); + for _ in 0..max_clique { + sum += self.weights[iter.next().unwrap()]; + }; + + sum } pub fn get_match(&self, v: usize) -> &(BitSet, BitSet) { @@ -285,15 +297,15 @@ impl CGraph { if total_weight == 0 { v_col.push(num_col); - col_weights.push(self.weights[v]); + col_weights.push(v_val); num_col += 1 } - else if total_weight < self.weights[v] { + else if total_weight < v_val { let mut k = num_col - 1; while used[k] == 1 { k -= 1 } - col_weights[k] += self.weights[v] - total_weight + col_weights[k] += v_val - total_weight } colors[v] = Some(v_col); @@ -590,6 +602,9 @@ fn recurse_clique_index_search(mol: &Molecule, matches_graph: &CGraph, depth: usize, ) -> usize { + if subgraph.len() == 0 { + return ix; + } let mut cx = ix; *states_searched += 1; @@ -608,9 +623,9 @@ fn recurse_clique_index_search(mol: &Molecule, return ix; } } - if ix >= best + subgraph.iter().count() && ix >= best + matches_graph.remaining_weight_bound(&subgraph) { + /*if ix >= best + subgraph.iter().count() && ix >= best + matches_graph.remaining_weight_bound_new(&subgraph) { return ix; - } + }*/ /*if ix >= best + matches_graph.color_bound_improved(&subgraph) { return ix; }*/ @@ -700,7 +715,7 @@ pub fn clique_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize // Kernelization let start = Instant::now(); - subgraph = kernelize(&matches_graph, subgraph); + subgraph = kernelize_fast(&matches_graph, subgraph); let dur = start.elapsed(); println!("Kernel Time: {:?}", dur); @@ -766,6 +781,47 @@ fn kernelize(g: &CGraph, mut subgraph: BitSet) -> BitSet { subgraph } +fn kernelize_fast(g: &CGraph, mut subgraph: BitSet) -> BitSet { + let mut count = 0; + let subgraph_copy = subgraph.clone(); + + for v in subgraph_copy.iter() { + let v_val = g.weights[v]; + let neighbors_v = g.neighbors(v, &subgraph); + + let Some(w1) = neighbors_v.iter().next() else { + continue; + }; + let Some(w2) = neighbors_v.iter().last() else { + continue; + }; + + let mut s = subgraph.clone(); + s.intersect_with(&g.graph[w1]); + for u in s.intersection(&&g.graph[w2]) { + if g.are_adjacent(v, u) || v == u { + continue; + } + + let u_val = g.weights[u]; + if v_val > u_val { + continue; + } + + let neighbors_u = g.neighbors(u, &subgraph); + + if neighbors_v.is_subset(&neighbors_u) { + count += 1; + subgraph.remove(v); + break; + } + } + } + + //println!("Reduce count: {}", count); + subgraph +} + #[allow(clippy::too_many_arguments)] fn parallel_recurse_index_search( mol: &Molecule, From 19ffb9749f668399454ad468fc43910a36ef1e3a Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Mon, 23 Jun 2025 13:26:07 -0700 Subject: [PATCH 20/32] Add benchmark for search --- Cargo.toml | 4 +++ benches/search_benchmark.rs | 66 +++++++++++++++++++++++++++++++++++++ src/assembly.rs | 38 +++++++++++++++++++++ 3 files changed, 108 insertions(+) create mode 100644 benches/search_benchmark.rs diff --git a/Cargo.toml b/Cargo.toml index d509466a..dd1430e1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,3 +30,7 @@ crate-type = ["rlib","cdylib"] name = "benchmark" harness = false +[[bench]] +name = "search_benchmark" +harness = false + diff --git a/benches/search_benchmark.rs b/benches/search_benchmark.rs new file mode 100644 index 00000000..d534104b --- /dev/null +++ b/benches/search_benchmark.rs @@ -0,0 +1,66 @@ +use bit_set::BitSet; +use criterion::{criterion_group, criterion_main, Criterion}; +use std::time::Duration; +use std::{ffi::OsStr, time::Instant}; +use std::fs; +use std::path::Path; + +use assembly_theory::{ + assembly::{clique_index_search_bench}, + loader, + molecule::Molecule, +}; + +pub fn reference_datasets(c: &mut Criterion) { + // Define a new criterion benchmark group of dataset benchmarks. + let mut group = c.benchmark_group("reference_datasets"); + + // Define datasets, bounds, and labels. + let datasets = ["gdb13_1201", "gdb17_200", "coconut_55"]; + + // Loop over all datasets of interest. + for dataset in datasets.iter() { + // Load all molecules from the given dataset. + let paths = fs::read_dir(Path::new("data").join(dataset)).unwrap(); + let mut mol_list: Vec = Vec::new(); + for path in paths { + let name = path.unwrap().path(); + if name.extension().and_then(OsStr::to_str) != Some("mol") { + continue; + } + mol_list.push( + loader::parse_molfile_str( + &fs::read_to_string(name.clone()) + .expect(&format!("Could not read file {name:?}")), + ) + .expect(&format!("Failed to parse {name:?}")), + ); + } + + group.bench_function(*dataset, move |b| { + b.iter_custom(|iters| { + let mut total = Duration::new(0, 0); + for _ in 0..iters { + for mol in mol_list.iter() { + let matches: Vec<(BitSet, BitSet)> = mol.matches().collect(); + let start = Instant::now(); + clique_index_search_bench(mol, matches); + total += start.elapsed() + } + } + + total + }, + ); + }); + } + + group.finish(); +} + +criterion_group! { + name = benchmark; + config = Criterion::default().sample_size(20); + targets = reference_datasets +} +criterion_main!(benchmark); diff --git a/src/assembly.rs b/src/assembly.rs index c80906d3..29fb5485 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -745,6 +745,44 @@ pub fn clique_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize (index as u32, num_matches as u32, total_search) } +pub fn clique_index_search_bench(mol: &Molecule, matches: Vec<(BitSet, BitSet)>) -> (u32, u32, usize) { + // Graph Initialization + let num_matches = matches.len(); + let matches_graph = CGraph::new(matches); + + // Kernelization + let mut subgraph = BitSet::with_capacity(num_matches); + for i in 0..num_matches { + subgraph.insert(i); + } + //subgraph = kernelize_fast(&matches_graph, subgraph); + + // Search + let mut total_search = 0; + let mut init = BitSet::new(); + init.extend(mol.graph().edge_indices().map(|ix| ix.index())); + let edge_count = mol.graph().edge_count(); + let bounds = vec![ + Bound::IntChain, + Bound::VecChainSimple, + Bound::VecChainSmallFrags, + ]; + + let index = recurse_clique_index_search( + mol, + &[init], + edge_count - 1, + edge_count, + edge_count - 1, + &bounds, + &mut total_search, + subgraph.clone(), + &matches_graph, + 1); + + (index as u32, num_matches as u32, total_search) +} + fn kernelize(g: &CGraph, mut subgraph: BitSet) -> BitSet { let mut count = 0; let subgraph_copy = subgraph.clone(); From e5c6d4ca02153b35023e421cf6d6c50fe8c8adbe Mon Sep 17 00:00:00 2001 From: Garrett Parzych Date: Tue, 24 Jun 2025 18:00:13 -0700 Subject: [PATCH 21/32] Add fragment bound --- src/assembly.rs | 217 +++++++++++++++--------------------------------- 1 file changed, 67 insertions(+), 150 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 29fb5485..6928e760 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -115,6 +115,13 @@ impl CGraph { (0..self.len()).map(|v| self.graph[v].iter().count()).collect() } + pub fn density(&self, subgraph: &BitSet) -> f32 { + let n = subgraph.len() as f32; + let deg_sum = subgraph.iter().map(|v| self.graph[v].intersection(subgraph).count()).sum::() as f32; + + deg_sum / (n * (n + 1_f32)) + } + // Returns a safely mutable set of neighbors of v in subgraph. pub fn neighbors(&self, v: usize, subgraph: &BitSet) -> BitSet { let mut neighbors = self.graph[v].clone(); @@ -147,10 +154,6 @@ impl CGraph { } pub fn remaining_weight_bound(&self, subgraph: &BitSet) -> usize { - subgraph.iter().map(|v| self.weights[v]).sum() - } - - pub fn remaining_weight_bound_new(&self, subgraph: &BitSet) -> usize { let deg_sum = subgraph.iter().map(|v| self.degree(v, subgraph)).sum::() as f32; let max_clique = ((1_f32 + (4_f32 * deg_sum + 1_f32).sqrt()) / 2_f32).floor() as usize; let mut sum = 0; @@ -173,48 +176,6 @@ impl CGraph { let mut largest: Vec = Vec::new(); - for v in (0..self.matches.len()).rev() { - if !subgraph.contains(v) { - continue; - } - - let mut used: Vec = vec![0; num_colors + 1]; - - for u in subgraph.intersection(&self.graph[v]) { - if colors[u] != -1 { - used[colors[u] as usize] = 1; - } - } - - let mut k = 0; - while used[k] != 0 { - k += 1; - } - - if k == num_colors { - num_colors += 1; - largest.push(0); - } - if self.weights[v] > largest[k] { - largest[k] = self.weights[v]; - } - - colors[v] = k as i32; - } - //print!("Bounds: {}, ", largest.iter().sum::()); - //println!("{:?}, ", self.graph.iter().map(|x| x.len()).collect::>()); - //println!("{:?}", colors); - - largest.iter().sum::() - } - - pub fn color_bound_improved(&self, subgraph: &BitSet) -> usize{ - // Greedy coloring - let mut colors: Vec = vec![-1; self.len()]; - let mut num_colors = 0; - let mut largest: Vec = Vec::new(); - - for v in (0..self.matches.len()).rev() { if !subgraph.contains(v) { continue; @@ -313,6 +274,37 @@ impl CGraph { col_weights.iter().sum() } + + pub fn frag_bound(&self, subgraph: &BitSet, fragments: &[BitSet]) -> usize { + let mut num_bonds: Vec = fragments.iter().map(|x| x.len() as isize).collect(); + let mut has_bonds = fragments.len(); + let mut bound = 0; + + for v in subgraph.iter() { + if has_bonds == 0 { + break; + } + + let m = &self.matches[v]; + let b = m.1.iter().next().unwrap(); + let mut i = 0; + while !fragments[i].contains(b) { + i += 1; + } + + if num_bonds[i] > 0 { + let m_len = m.0.len(); + num_bonds[i] -= (m_len + 1) as isize; + bound += m_len; + + if num_bonds[i] <= 0 { + has_bonds -= 1; + } + } + } + + bound + } } pub fn naive_assembly_depth(mol: &Molecule) -> u32 { @@ -494,103 +486,6 @@ fn recurse_index_search( } -fn recurse_clique_index_search_with_start(mol: &Molecule, - fragments: &[BitSet], - ix: usize, - largest_remove: usize, - mut best: usize, - bounds: &[Bound], - states_searched: &mut usize, - mut subgraph: BitSet, - matches_graph: &CGraph, - depth: usize -) -> usize { - let mut cx = ix; - - // Search for duplicatable fragment - for v in subgraph.clone().iter() { - if !subgraph.contains(v) { - continue; - } - - let (h1, h2) = matches_graph.get_match(v); - - let mut fractures = fragments.to_owned(); - let f1 = fragments.iter().enumerate().find(|(_, c)| h1.is_subset(c)); - let f2 = fragments.iter().enumerate().find(|(_, c)| h2.is_subset(c)); - - let largest_remove = h1.len(); - - let (Some((i1, f1)), Some((i2, f2))) = (f1, f2) else { - continue; - }; - - // All of these clones are on bitsets and cheap enough - if i1 == i2 { - let mut union = h1.clone(); - union.union_with(h2); - let mut difference = f1.clone(); - difference.difference_with(&union); - let c = connected_components_under_edges(mol.graph(), &difference); - fractures.extend(c); - fractures.swap_remove(i1); - } else { - let mut f1r = f1.clone(); - f1r.difference_with(h1); - let mut f2r = f2.clone(); - f2r.difference_with(h2); - - let c1 = connected_components_under_edges(mol.graph(), &f1r); - let c2 = connected_components_under_edges(mol.graph(), &f2r); - - fractures.extend(c1); - fractures.extend(c2); - - fractures.swap_remove(i1.max(i2)); - fractures.swap_remove(i1.min(i2)); - } - - fractures.retain(|i| i.len() > 1); - fractures.push(h1.clone()); - - let subgraph_clone = matches_graph.forward_neighbors(v, &subgraph); - - cx = cx.min(recurse_clique_index_search( - mol, - &fractures, - ix - h1.len() + 1, - largest_remove, - best, - bounds, - states_searched, - subgraph_clone, - matches_graph, - depth + 1, - )); - if cx < best { - best = cx; - // kernelize - for v in subgraph.clone().iter() { - let mut neighbor_sum = 0; - for u in matches_graph.graph[v].iter() { - if subgraph.contains(u) { - neighbor_sum += matches_graph.weights[u]; - } - } - - if ix >= best + neighbor_sum { - subgraph.remove(v); - } - } - } - - subgraph.remove(v); - } - - cx -} - - fn recurse_clique_index_search(mol: &Molecule, fragments: &[BitSet], ix: usize, @@ -623,10 +518,13 @@ fn recurse_clique_index_search(mol: &Molecule, return ix; } } - /*if ix >= best + subgraph.iter().count() && ix >= best + matches_graph.remaining_weight_bound_new(&subgraph) { + /*if ix >= best + subgraph.iter().count() && ix >= best + matches_graph.remaining_weight_bound(&subgraph) { + return ix; + }*/ + /*if ix >= best + matches_graph.color_bound(&subgraph) { return ix; }*/ - /*if ix >= best + matches_graph.color_bound_improved(&subgraph) { + /*if ix >= best + matches_graph.frag_bound(&subgraph, fragments) { return ix; }*/ if ix >= best + matches_graph.cover_bound(&subgraph) { @@ -677,13 +575,32 @@ fn recurse_clique_index_search(mol: &Molecule, let mut subgraph_clone = matches_graph.forward_neighbors(v, &subgraph); /*if depth == 1 { - subgraph_clone = kernelize(matches_graph, subgraph_clone); + /*if subgraph_clone.len() >= 260 { + let start = Instant::now(); + subgraph_clone = kernelize(matches_graph, subgraph_clone); + let dur = start.elapsed(); + println!("Kernel Time: {:?}", dur); + }*/ + //println!("{}, {}", subgraph_clone.len(), matches_graph.density(&subgraph_clone)); + if subgraph_clone.len() >= 10 && matches_graph.density(&subgraph_clone) >= 0.6 { + println!("{}", subgraph_clone.len()); + println!("{:?}", subgraph_clone); + println!("{:?}", subgraph_clone.iter().map(|v| matches_graph.weights[v]).collect::>()); + for (edges, v) in subgraph_clone.iter().map(|v| &matches_graph.graph[v]).zip(subgraph_clone.iter()) { + let mut edges_clone = edges.clone(); + edges_clone.intersect_with(&subgraph_clone); + edges_clone.insert(v); + println!("{:?}", edges_clone.symmetric_difference(&subgraph_clone).collect::>()); + } + + std::process::exit(1); + } }*/ cx = cx.min(recurse_clique_index_search( mol, &fractures, - ix - h1.len() + 1, + ix - matches_graph.weights[v], largest_remove, best, bounds, @@ -706,7 +623,7 @@ pub fn clique_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize let start = Instant::now(); let matches_graph = CGraph::new(matches); let dur = start.elapsed(); - println!("Graph Time: {:?}", dur); + //println!("Graph Time: {:?}", dur); let mut subgraph = BitSet::with_capacity(num_matches); for i in 0..num_matches { @@ -717,7 +634,7 @@ pub fn clique_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize let start = Instant::now(); subgraph = kernelize_fast(&matches_graph, subgraph); let dur = start.elapsed(); - println!("Kernel Time: {:?}", dur); + //println!("Kernel Time: {:?}", dur); // Search let mut total_search = 0; @@ -755,7 +672,7 @@ pub fn clique_index_search_bench(mol: &Molecule, matches: Vec<(BitSet, BitSet)>) for i in 0..num_matches { subgraph.insert(i); } - //subgraph = kernelize_fast(&matches_graph, subgraph); + subgraph = kernelize_fast(&matches_graph, subgraph); // Search let mut total_search = 0; From a48282b2f31f1d417c05d2018448555038632bc7 Mon Sep 17 00:00:00 2001 From: Garrett Parzych Date: Tue, 24 Jun 2025 22:57:24 -0700 Subject: [PATCH 22/32] Improve fragment bound --- src/assembly.rs | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 6928e760..695705fd 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -96,6 +96,8 @@ impl CGraph { } } + // Sort by degree + Self { graph: init_graph, weights: init_weights, @@ -276,7 +278,7 @@ impl CGraph { } pub fn frag_bound(&self, subgraph: &BitSet, fragments: &[BitSet]) -> usize { - let mut num_bonds: Vec = fragments.iter().map(|x| x.len() as isize).collect(); + let mut num_bonds: Vec = fragments.iter().map(|x| x.len()).collect(); let mut has_bonds = fragments.len(); let mut bound = 0; @@ -294,16 +296,22 @@ impl CGraph { if num_bonds[i] > 0 { let m_len = m.0.len(); - num_bonds[i] -= (m_len + 1) as isize; - bound += m_len; + if m_len + 1 > num_bonds[i] { + bound += num_bonds[i] - 1; + num_bonds[i] = 0; + } + else { + bound += m_len; + num_bonds[i] -= m_len + 1; + } - if num_bonds[i] <= 0 { + if num_bonds[i] == 0 { has_bonds -= 1; } } } - bound + bound + 1 } } @@ -508,7 +516,7 @@ fn recurse_clique_index_search(mol: &Molecule, for bound_type in bounds { let exceeds = match bound_type { Bound::Log => ix - log_bound(fragments) >= best, - Bound::IntChain => ix - addition_bound(fragments, largest_remove) >= best, + Bound::IntChain => false, // ix - addition_bound(fragments, largest_remove) >= best, Bound::VecChainSimple => ix - vec_bound_simple(fragments, largest_remove, mol) >= best, Bound::VecChainSmallFrags => { ix - vec_bound_small_frags(fragments, largest_remove, mol) >= best @@ -530,6 +538,15 @@ fn recurse_clique_index_search(mol: &Molecule, if ix >= best + matches_graph.cover_bound(&subgraph) { return ix; } + + /*println!("Neccesary: {}", ix - best); + println!("Weight Sum: {}", matches_graph.remaining_weight_bound(&subgraph)); + println!("Add: {}", addition_bound(fragments, largest_remove)); + println!("Frag: {}", matches_graph.frag_bound(&subgraph, fragments)); + println!("Vec: {}", vec_bound_simple(fragments, largest_remove, mol)); + println!("Small Vec: {}", vec_bound_small_frags(fragments, largest_remove, mol)); + println!("Color: {}", matches_graph.color_bound(&subgraph)); + println!("Cover: {}\n", matches_graph.cover_bound(&subgraph));*/ // Search for duplicatable fragment for v in subgraph.iter() { From f10ae5ef50386a66b5daa28bbd839eeb80e3bf60 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Wed, 25 Jun 2025 14:02:58 -0700 Subject: [PATCH 23/32] Fix fragment bound --- src/assembly.rs | 123 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 97 insertions(+), 26 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 695705fd..7acd0127 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -278,40 +278,63 @@ impl CGraph { } pub fn frag_bound(&self, subgraph: &BitSet, fragments: &[BitSet]) -> usize { - let mut num_bonds: Vec = fragments.iter().map(|x| x.len()).collect(); - let mut has_bonds = fragments.len(); + let total_bonds = fragments.iter().map(|x| x.len()).sum::(); let mut bound = 0; + let m = self.weights[subgraph.iter().next().unwrap()] + 1; - for v in subgraph.iter() { - if has_bonds == 0 { - break; - } - + /*let mut removes: Vec> = vec![Vec::new(); fragments.len()]; + for v in subgraph { let m = &self.matches[v]; let b = m.1.iter().next().unwrap(); - let mut i = 0; - while !fragments[i].contains(b) { - i += 1; + let mut j = 0; + while !fragments[j].contains(b) { + j += 1; } - if num_bonds[i] > 0 { - let m_len = m.0.len(); - if m_len + 1 > num_bonds[i] { - bound += num_bonds[i] - 1; - num_bonds[i] = 0; + removes[j].push(self.weights[v] + 1); + } + let num_bonds: Vec = fragments.iter().map(|x| x.len()).collect(); + println!("{:?}", num_bonds); + println!("{:?}", removes);*/ + + for i in 2..m + 1 { + let mut bound_temp = 0; + let mut largest = 0; + let mut has_bonds = fragments.len(); + let mut num_bonds: Vec = fragments.iter().map(|x| x.len()).collect(); + for v in subgraph.iter() { + if has_bonds == 0 { + break; } - else { - bound += m_len; - num_bonds[i] -= m_len + 1; + if self.weights[v] + 1 > i { + continue; + } + + let dup = &self.matches[v]; + let bond = dup.1.iter().next().unwrap(); + let mut j = 0; + while !fragments[j].contains(bond) { + j += 1; } - if num_bonds[i] == 0 { - has_bonds -= 1; + if num_bonds[j] > 0 { + let remove = std::cmp::min(dup.0.len(), num_bonds[j]); + largest = std::cmp::max(largest, remove); + bound_temp += 1; + num_bonds[j] -= remove; + + if num_bonds[j] == 0 { + has_bonds -= 1; + } } } + + let log = (largest as f32).log2().ceil() as usize; + let leftover = num_bonds.iter().map(|x| (x / largest) + (x % largest != 0) as usize).sum::(); + bound = std::cmp::max(bound, total_bonds - bound_temp - log - leftover); } - bound + 1 + bound } } @@ -515,8 +538,8 @@ fn recurse_clique_index_search(mol: &Molecule, // Branch and Bound for bound_type in bounds { let exceeds = match bound_type { - Bound::Log => ix - log_bound(fragments) >= best, - Bound::IntChain => false, // ix - addition_bound(fragments, largest_remove) >= best, + Bound::Log => false, //ix - log_bound(fragments) >= best, + Bound::IntChain => false, //ix - addition_bound(fragments, largest_remove) >= best, Bound::VecChainSimple => ix - vec_bound_simple(fragments, largest_remove, mol) >= best, Bound::VecChainSmallFrags => { ix - vec_bound_small_frags(fragments, largest_remove, mol) >= best @@ -532,14 +555,15 @@ fn recurse_clique_index_search(mol: &Molecule, /*if ix >= best + matches_graph.color_bound(&subgraph) { return ix; }*/ - /*if ix >= best + matches_graph.frag_bound(&subgraph, fragments) { + if ix >= best + matches_graph.frag_bound(&subgraph, fragments) { return ix; - }*/ + } if ix >= best + matches_graph.cover_bound(&subgraph) { return ix; } - /*println!("Neccesary: {}", ix - best); + /*println!("Neccesary: {}", {if ix >= best {ix - best} else { 0 }}); + println!("Ground Truth: {}", savings_ground_truth(0, 0, &subgraph, matches_graph)); println!("Weight Sum: {}", matches_graph.remaining_weight_bound(&subgraph)); println!("Add: {}", addition_bound(fragments, largest_remove)); println!("Frag: {}", matches_graph.frag_bound(&subgraph, fragments)); @@ -548,6 +572,17 @@ fn recurse_clique_index_search(mol: &Molecule, println!("Color: {}", matches_graph.color_bound(&subgraph)); println!("Cover: {}\n", matches_graph.cover_bound(&subgraph));*/ + /*if addition_bound(fragments, largest_remove) < matches_graph.frag_bound(&subgraph, fragments) && ix > best { + println!("Largest Remove: {}", largest_remove); + println!("Add: {}", addition_bound(fragments, largest_remove)); + println!("Frag: {}", matches_graph.frag_bound(&subgraph, fragments)); + }*/ + + /*if savings_ground_truth(0, 0, &subgraph, matches_graph) > matches_graph.frag_bound(&subgraph, fragments) { + println!("Ground Truth: {}", savings_ground_truth(0, 0, &subgraph, matches_graph)); + println!("Frag: {}", matches_graph.frag_bound(&subgraph, fragments)); + }*/ + // Search for duplicatable fragment for v in subgraph.iter() { let (h1, h2) = matches_graph.get_match(v); @@ -632,6 +667,42 @@ fn recurse_clique_index_search(mol: &Molecule, cx } +fn savings_ground_truth (ix: usize, + mut best: usize, + subgraph: &BitSet, + matches_graph: &CGraph, +) -> usize { + if subgraph.len() == 0 { + return ix; + } + let mut cx = ix; + + /*if ix >= best + subgraph.iter().count() && ix >= best + matches_graph.remaining_weight_bound(&subgraph) { + return ix; + }*/ + /*if ix >= best + matches_graph.color_bound(&subgraph) { + return ix; + }*/ + if ix + matches_graph.cover_bound(&subgraph) <= best{ + return ix; + } + + // Search for duplicatable fragment + for v in subgraph.iter() { + let subgraph_clone = matches_graph.forward_neighbors(v, &subgraph); + + cx = cx.max(savings_ground_truth( + matches_graph.weights[v], + best, + &subgraph_clone, + matches_graph, + )); + best = best.max(cx); + } + + cx +} + pub fn clique_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize) { // Graph Initialization let matches: Vec<(BitSet, BitSet)> = mol.matches().collect(); From 8bda5723ec8c1f2adbae9086be0dcf8c8bbd6b8c Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Wed, 25 Jun 2025 14:52:13 -0700 Subject: [PATCH 24/32] Change how largest_remove is calculated and move ground truth --- src/assembly.rs | 81 ++++++++++++++++++++++--------------------------- 1 file changed, 37 insertions(+), 44 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 7acd0127..a38bf0f8 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -105,6 +105,41 @@ impl CGraph { } } + pub fn savings_ground_truth(&self, subgraph: &BitSet) -> usize { + self.savings_ground_truth_recurse(0, 0, subgraph) + } + + fn savings_ground_truth_recurse(&self, ix: usize, mut best: usize, subgraph: &BitSet) -> usize { + if subgraph.len() == 0 { + return ix; + } + let mut cx = ix; + + /*if ix + subgraph.iter().count() <= best && ix + self.remaining_weight_bound(&subgraph) <= best { + return ix; + }*/ + /*if ix + self.color_bound(&subgraph) <= best { + return ix; + }*/ + if ix + self.cover_bound(&subgraph) <= best{ + return ix; + } + + // Search for duplicatable fragment + for v in subgraph.iter() { + let subgraph_clone = self.forward_neighbors(v, &subgraph); + + cx = cx.max(self.savings_ground_truth_recurse( + ix + self.weights[v], + best, + &subgraph_clone, + )); + best = best.max(cx); + } + + cx + } + pub fn len(&self) -> usize { self.matches.len() } @@ -520,7 +555,6 @@ fn recurse_index_search( fn recurse_clique_index_search(mol: &Molecule, fragments: &[BitSet], ix: usize, - largest_remove: usize, mut best: usize, bounds: &[Bound], states_searched: &mut usize, @@ -532,14 +566,14 @@ fn recurse_clique_index_search(mol: &Molecule, return ix; } let mut cx = ix; - + let largest_remove = matches_graph.weights[subgraph.iter().next().unwrap()] + 1; *states_searched += 1; // Branch and Bound for bound_type in bounds { let exceeds = match bound_type { Bound::Log => false, //ix - log_bound(fragments) >= best, - Bound::IntChain => false, //ix - addition_bound(fragments, largest_remove) >= best, + Bound::IntChain => ix - addition_bound(fragments, largest_remove) >= best, Bound::VecChainSimple => ix - vec_bound_simple(fragments, largest_remove, mol) >= best, Bound::VecChainSmallFrags => { ix - vec_bound_small_frags(fragments, largest_remove, mol) >= best @@ -591,8 +625,6 @@ fn recurse_clique_index_search(mol: &Molecule, let f1 = fragments.iter().enumerate().find(|(_, c)| h1.is_subset(c)); let f2 = fragments.iter().enumerate().find(|(_, c)| h2.is_subset(c)); - let largest_remove = h1.len(); - let (Some((i1, f1)), Some((i2, f2))) = (f1, f2) else { continue; }; @@ -653,7 +685,6 @@ fn recurse_clique_index_search(mol: &Molecule, mol, &fractures, ix - matches_graph.weights[v], - largest_remove, best, bounds, states_searched, @@ -667,42 +698,6 @@ fn recurse_clique_index_search(mol: &Molecule, cx } -fn savings_ground_truth (ix: usize, - mut best: usize, - subgraph: &BitSet, - matches_graph: &CGraph, -) -> usize { - if subgraph.len() == 0 { - return ix; - } - let mut cx = ix; - - /*if ix >= best + subgraph.iter().count() && ix >= best + matches_graph.remaining_weight_bound(&subgraph) { - return ix; - }*/ - /*if ix >= best + matches_graph.color_bound(&subgraph) { - return ix; - }*/ - if ix + matches_graph.cover_bound(&subgraph) <= best{ - return ix; - } - - // Search for duplicatable fragment - for v in subgraph.iter() { - let subgraph_clone = matches_graph.forward_neighbors(v, &subgraph); - - cx = cx.max(savings_ground_truth( - matches_graph.weights[v], - best, - &subgraph_clone, - matches_graph, - )); - best = best.max(cx); - } - - cx -} - pub fn clique_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize) { // Graph Initialization let matches: Vec<(BitSet, BitSet)> = mol.matches().collect(); @@ -736,7 +731,6 @@ pub fn clique_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize mol, &[init], edge_count - 1, - edge_count, edge_count - 1, bounds, &mut total_search, @@ -777,7 +771,6 @@ pub fn clique_index_search_bench(mol: &Molecule, matches: Vec<(BitSet, BitSet)>) mol, &[init], edge_count - 1, - edge_count, edge_count - 1, &bounds, &mut total_search, From 4585b11384eac267402fffb951c2d154365d7fe8 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Wed, 25 Jun 2025 15:27:20 -0700 Subject: [PATCH 25/32] Add cover bound with sorting --- src/assembly.rs | 74 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 70 insertions(+), 4 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index a38bf0f8..87cfe96b 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -96,8 +96,6 @@ impl CGraph { } } - // Sort by degree - Self { graph: init_graph, weights: init_weights, @@ -312,6 +310,70 @@ impl CGraph { col_weights.iter().sum() } + pub fn cover_bound_sort(&self, subgraph: &BitSet) -> usize { + // Sort vertices + let mut vertices: Vec<(usize, usize)> = Vec::with_capacity(subgraph.len());; + for v in subgraph { + vertices.push((v, self.degree(v, subgraph))); + } + + vertices.sort_by(|a, b| b.1.cmp(&a.1)); + + // Greedy coloring + let mut colors: Vec>> = vec![None; self.len()]; + let mut col_weights = vec![]; + let mut num_col = 0; + + for (v, _) in vertices { + let mut v_col = Vec::new(); + let mut used = vec![0; num_col]; + + // Find colors used in neighborhood of v + for u in subgraph.intersection(&self.graph[v]) { + let Some(u_col) = &colors[u] else { + continue; + }; + + for c in u_col { + used[*c] = 1; + } + } + + let mut total_weight = 0; + let v_val = self.weights[v]; + // Find colors to give to v + for c in 0..num_col { + if used[c] == 1 { + continue; + } + + v_col.push(c); + total_weight += col_weights[c]; + + if total_weight >= v_val { + break; + } + } + + if total_weight == 0 { + v_col.push(num_col); + col_weights.push(v_val); + num_col += 1 + } + else if total_weight < v_val { + let mut k = num_col - 1; + while used[k] == 1 { + k -= 1 + } + col_weights[k] += v_val - total_weight + } + + colors[v] = Some(v_col); + }; + + col_weights.iter().sum() + } + pub fn frag_bound(&self, subgraph: &BitSet, fragments: &[BitSet]) -> usize { let total_bonds = fragments.iter().map(|x| x.len()).sum::(); let mut bound = 0; @@ -595,16 +657,20 @@ fn recurse_clique_index_search(mol: &Molecule, if ix >= best + matches_graph.cover_bound(&subgraph) { return ix; } + if ix >= best + matches_graph.cover_bound_sort(&subgraph) { + return ix; + } /*println!("Neccesary: {}", {if ix >= best {ix - best} else { 0 }}); - println!("Ground Truth: {}", savings_ground_truth(0, 0, &subgraph, matches_graph)); + //println!("Ground Truth: {}", matches_graph.savings_ground_truth(&subgraph)); println!("Weight Sum: {}", matches_graph.remaining_weight_bound(&subgraph)); println!("Add: {}", addition_bound(fragments, largest_remove)); println!("Frag: {}", matches_graph.frag_bound(&subgraph, fragments)); println!("Vec: {}", vec_bound_simple(fragments, largest_remove, mol)); println!("Small Vec: {}", vec_bound_small_frags(fragments, largest_remove, mol)); println!("Color: {}", matches_graph.color_bound(&subgraph)); - println!("Cover: {}\n", matches_graph.cover_bound(&subgraph));*/ + println!("Cover: {}", matches_graph.cover_bound(&subgraph)); + println!("Cover Sort: {}\n", matches_graph.cover_bound_sort(&subgraph));*/ /*if addition_bound(fragments, largest_remove) < matches_graph.frag_bound(&subgraph, fragments) && ix > best { println!("Largest Remove: {}", largest_remove); From 389aafddf8394901746d5691746e3a05cc4e4d5e Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Wed, 25 Jun 2025 15:49:09 -0700 Subject: [PATCH 26/32] Combine sorted and unsorted cover bound --- src/assembly.rs | 89 ++++++++++--------------------------------------- 1 file changed, 18 insertions(+), 71 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 87cfe96b..366f7ea5 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -119,7 +119,7 @@ impl CGraph { /*if ix + self.color_bound(&subgraph) <= best { return ix; }*/ - if ix + self.cover_bound(&subgraph) <= best{ + if ix + self.cover_bound(&subgraph, true) <= best{ return ix; } @@ -250,81 +250,28 @@ impl CGraph { largest.iter().sum::() } - pub fn cover_bound(&self, subgraph: &BitSet) -> usize { - // Greedy coloring - let mut colors: Vec>> = vec![None; self.len()]; - let mut col_weights = vec![]; - let mut num_col = 0; - - for v in (0..self.matches.len()).rev() { - if !subgraph.contains(v) { - continue; - } - - let mut v_col = Vec::new(); - let mut used = vec![0; num_col]; - - // Find colors used in neighborhood of v - for u in subgraph.intersection(&self.graph[v]) { - let Some(u_col) = &colors[u] else { - continue; - }; - - for c in u_col { - used[*c] = 1; - } - } - - let mut total_weight = 0; - let v_val = self.weights[v]; - // Find colors to give to v - for c in 0..num_col { - if used[c] == 1 { - continue; - } - - v_col.push(c); - total_weight += col_weights[c]; - - if total_weight >= v_val { - break; - } - } - - if total_weight == 0 { - v_col.push(num_col); - col_weights.push(v_val); - num_col += 1 - } - else if total_weight < v_val { - let mut k = num_col - 1; - while used[k] == 1 { - k -= 1 - } - col_weights[k] += v_val - total_weight - } - - colors[v] = Some(v_col); - }; - - col_weights.iter().sum() - } - - pub fn cover_bound_sort(&self, subgraph: &BitSet) -> usize { + pub fn cover_bound(&self, subgraph: &BitSet, sort: bool) -> usize { // Sort vertices - let mut vertices: Vec<(usize, usize)> = Vec::with_capacity(subgraph.len());; - for v in subgraph { - vertices.push((v, self.degree(v, subgraph))); + if sort { + let mut vertices: Vec<(usize, usize)> = Vec::with_capacity(subgraph.len());; + for v in subgraph { + vertices.push((v, self.degree(v, subgraph))); + } + vertices.sort_by(|a, b| b.1.cmp(&a.1)); + self.cover_bound_helper(subgraph, vertices.iter().map(|(v, _)| *v)) } + else { + let vertices = (0..self.matches.len()).rev().filter(|v| subgraph.contains(*v)); + self.cover_bound_helper(subgraph, vertices) + } + } - vertices.sort_by(|a, b| b.1.cmp(&a.1)); - - // Greedy coloring + fn cover_bound_helper(&self, subgraph: &BitSet, iter: impl Iterator) -> usize { let mut colors: Vec>> = vec![None; self.len()]; let mut col_weights = vec![]; let mut num_col = 0; - for (v, _) in vertices { + for v in iter { let mut v_col = Vec::new(); let mut used = vec![0; num_col]; @@ -654,10 +601,10 @@ fn recurse_clique_index_search(mol: &Molecule, if ix >= best + matches_graph.frag_bound(&subgraph, fragments) { return ix; } - if ix >= best + matches_graph.cover_bound(&subgraph) { + if ix >= best + matches_graph.cover_bound(&subgraph, false) { return ix; } - if ix >= best + matches_graph.cover_bound_sort(&subgraph) { + if ix >= best + matches_graph.cover_bound(&subgraph, true) { return ix; } From cb550c934c890979267a01684ede324b120a3f50 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Fri, 27 Jun 2025 12:53:15 -0700 Subject: [PATCH 27/32] Fix and improve fragment bound --- src/assembly.rs | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 366f7ea5..b8d622fc 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -324,7 +324,18 @@ impl CGraph { pub fn frag_bound(&self, subgraph: &BitSet, fragments: &[BitSet]) -> usize { let total_bonds = fragments.iter().map(|x| x.len()).sum::(); let mut bound = 0; - let m = self.weights[subgraph.iter().next().unwrap()] + 1; + let sizes = { + let mut vec = vec![]; + let mut prev = 0; + for s in subgraph.iter().map(|v| self.weights[v] + 1) { + if s != prev { + vec.push(s); + prev = s; + } + } + + vec + }; /*let mut removes: Vec> = vec![Vec::new(); fragments.len()]; for v in subgraph { @@ -341,11 +352,11 @@ impl CGraph { println!("{:?}", num_bonds); println!("{:?}", removes);*/ - for i in 2..m + 1 { + for i in sizes { let mut bound_temp = 0; - let mut largest = 0; let mut has_bonds = fragments.len(); let mut num_bonds: Vec = fragments.iter().map(|x| x.len()).collect(); + for v in subgraph.iter() { if has_bonds == 0 { break; @@ -354,6 +365,7 @@ impl CGraph { continue; } + let dup = &self.matches[v]; let bond = dup.1.iter().next().unwrap(); let mut j = 0; @@ -363,7 +375,6 @@ impl CGraph { if num_bonds[j] > 0 { let remove = std::cmp::min(dup.0.len(), num_bonds[j]); - largest = std::cmp::max(largest, remove); bound_temp += 1; num_bonds[j] -= remove; @@ -373,8 +384,21 @@ impl CGraph { } } - let log = (largest as f32).log2().ceil() as usize; - let leftover = num_bonds.iter().map(|x| (x / largest) + (x % largest != 0) as usize).sum::(); + /*let (leftover, x) = { + let mut tot = 0; + let mut largest = 0; + for s in num_bonds.iter() { + tot += s; + largest = std::cmp::max(largest, s); + } + (tot, largest) + };*/ + let leftover = num_bonds.iter().sum::(); + let x = match num_bonds.iter().max().unwrap() { + 0 => 1, + y => *y, + }; + let log = ((i as f32).log2() - (x as f32).log2()).ceil() as usize; bound = std::cmp::max(bound, total_bonds - bound_temp - log - leftover); } @@ -616,8 +640,8 @@ fn recurse_clique_index_search(mol: &Molecule, println!("Vec: {}", vec_bound_simple(fragments, largest_remove, mol)); println!("Small Vec: {}", vec_bound_small_frags(fragments, largest_remove, mol)); println!("Color: {}", matches_graph.color_bound(&subgraph)); - println!("Cover: {}", matches_graph.cover_bound(&subgraph)); - println!("Cover Sort: {}\n", matches_graph.cover_bound_sort(&subgraph));*/ + println!("Cover: {}", matches_graph.cover_bound(&subgraph, false)); + println!("Cover Sort: {}\n", matches_graph.cover_bound(&subgraph, true));*/ /*if addition_bound(fragments, largest_remove) < matches_graph.frag_bound(&subgraph, fragments) && ix > best { println!("Largest Remove: {}", largest_remove); From a0bb2db9be312eb018a0620995e660655ebd49d3 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Fri, 27 Jun 2025 13:16:01 -0700 Subject: [PATCH 28/32] Use addition chain instead of log in add bound --- src/assembly.rs | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index b8d622fc..d1ad32f8 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -38,6 +38,7 @@ struct EdgeType { } static PARALLEL_MATCH_SIZE_THRESHOLD: usize = 100; +static ADD_CHAIN: &[usize] = &[0, 1, 2, 2, 3, 3, 4, 3, 4, 4]; /// Enum to represent the different bounds available during the computation of molecular assembly /// indices. @@ -384,15 +385,6 @@ impl CGraph { } } - /*let (leftover, x) = { - let mut tot = 0; - let mut largest = 0; - for s in num_bonds.iter() { - tot += s; - largest = std::cmp::max(largest, s); - } - (tot, largest) - };*/ let leftover = num_bonds.iter().sum::(); let x = match num_bonds.iter().max().unwrap() { 0 => 1, @@ -1140,14 +1132,21 @@ fn addition_bound(fragments: &[BitSet], m: usize) -> usize { // Test for all sizes m of largest removed duplicate for max in 2..m + 1 { - let log = (max as f32).log2().ceil(); + let log = { + if max <= 10 { + ADD_CHAIN[max - 1] + } + else { + (max as f32).log2().ceil() as usize + } + }; let mut aux_sum: usize = 0; for len in &frag_sizes { aux_sum += (len / max) + (len % max != 0) as usize } - max_s = max_s.max(size_sum - log as usize - aux_sum); + max_s = max_s.max(size_sum - log - aux_sum); } max_s From 02d8fc02e800e1f26e5fe926b867e9a57ea04115 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Mon, 30 Jun 2025 11:16:02 -0700 Subject: [PATCH 29/32] Fix fragment bound --- src/assembly.rs | 41 ++++++++++++++++------------------------- 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index d1ad32f8..9ff0f1ba 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -357,6 +357,7 @@ impl CGraph { let mut bound_temp = 0; let mut has_bonds = fragments.len(); let mut num_bonds: Vec = fragments.iter().map(|x| x.len()).collect(); + let mut smallest_remove = i; for v in subgraph.iter() { if has_bonds == 0 { @@ -378,6 +379,7 @@ impl CGraph { let remove = std::cmp::min(dup.0.len(), num_bonds[j]); bound_temp += 1; num_bonds[j] -= remove; + smallest_remove = std::cmp::min(smallest_remove, remove); if num_bonds[j] == 0 { has_bonds -= 1; @@ -386,12 +388,11 @@ impl CGraph { } let leftover = num_bonds.iter().sum::(); - let x = match num_bonds.iter().max().unwrap() { - 0 => 1, - y => *y, - }; - let log = ((i as f32).log2() - (x as f32).log2()).ceil() as usize; - bound = std::cmp::max(bound, total_bonds - bound_temp - log - leftover); + if leftover > 0 { + smallest_remove = 1; + } + let log = (smallest_remove as f32).log2().ceil() as usize; + bound = std::cmp::max(bound, total_bonds - bound_temp - leftover - log); } bound @@ -635,17 +636,6 @@ fn recurse_clique_index_search(mol: &Molecule, println!("Cover: {}", matches_graph.cover_bound(&subgraph, false)); println!("Cover Sort: {}\n", matches_graph.cover_bound(&subgraph, true));*/ - /*if addition_bound(fragments, largest_remove) < matches_graph.frag_bound(&subgraph, fragments) && ix > best { - println!("Largest Remove: {}", largest_remove); - println!("Add: {}", addition_bound(fragments, largest_remove)); - println!("Frag: {}", matches_graph.frag_bound(&subgraph, fragments)); - }*/ - - /*if savings_ground_truth(0, 0, &subgraph, matches_graph) > matches_graph.frag_bound(&subgraph, fragments) { - println!("Ground Truth: {}", savings_ground_truth(0, 0, &subgraph, matches_graph)); - println!("Frag: {}", matches_graph.frag_bound(&subgraph, fragments)); - }*/ - // Search for duplicatable fragment for v in subgraph.iter() { let (h1, h2) = matches_graph.get_match(v); @@ -695,7 +685,7 @@ fn recurse_clique_index_search(mol: &Molecule, println!("Kernel Time: {:?}", dur); }*/ //println!("{}, {}", subgraph_clone.len(), matches_graph.density(&subgraph_clone)); - if subgraph_clone.len() >= 10 && matches_graph.density(&subgraph_clone) >= 0.6 { + /*if subgraph_clone.len() >= 10 && matches_graph.density(&subgraph_clone) >= 0.6 { println!("{}", subgraph_clone.len()); println!("{:?}", subgraph_clone); println!("{:?}", subgraph_clone.iter().map(|v| matches_graph.weights[v]).collect::>()); @@ -707,7 +697,8 @@ fn recurse_clique_index_search(mol: &Molecule, } std::process::exit(1); - } + }*/ + subgraph_clone = kernelize_fast(matches_graph, subgraph_clone); }*/ cx = cx.min(recurse_clique_index_search( @@ -734,7 +725,7 @@ pub fn clique_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize let start = Instant::now(); let matches_graph = CGraph::new(matches); - let dur = start.elapsed(); + //let dur = start.elapsed(); //println!("Graph Time: {:?}", dur); let mut subgraph = BitSet::with_capacity(num_matches); @@ -743,9 +734,9 @@ pub fn clique_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize } // Kernelization - let start = Instant::now(); + //let start = Instant::now(); subgraph = kernelize_fast(&matches_graph, subgraph); - let dur = start.elapsed(); + //let dur = start.elapsed(); //println!("Kernel Time: {:?}", dur); // Search @@ -754,7 +745,7 @@ pub fn clique_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize init.extend(mol.graph().edge_indices().map(|ix| ix.index())); let edge_count = mol.graph().edge_count(); - let start = Instant::now(); + //let start = Instant::now(); let index = recurse_clique_index_search( mol, @@ -768,7 +759,7 @@ pub fn clique_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize 1); let dur = start.elapsed(); - println!("Search Time: {:?}", dur); + //println!("Search Time: {:?}", dur); (index as u32, num_matches as u32, total_search) } @@ -1106,7 +1097,7 @@ pub fn serial_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize ); let dur = start.elapsed(); - println!("Search Time: {:?}", dur); + //println!("Search Time: {:?}", dur); (index as u32, matches.len() as u32, total_search) } From e43586cfdb39b358e686bbe03a3dc8480b9d9de0 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Mon, 30 Jun 2025 14:31:14 -0700 Subject: [PATCH 30/32] Add inclusion kernel and split bound --- src/assembly.rs | 164 ++++++++++++++++++++++++++++-------------------- 1 file changed, 95 insertions(+), 69 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 9ff0f1ba..0688bc20 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -120,9 +120,9 @@ impl CGraph { /*if ix + self.color_bound(&subgraph) <= best { return ix; }*/ - if ix + self.cover_bound(&subgraph, true) <= best{ + /*if ix + self.cover_bound(&subgraph, true) <= best{ return ix; - } + }*/ // Search for duplicatable fragment for v in subgraph.iter() { @@ -388,15 +388,43 @@ impl CGraph { } let leftover = num_bonds.iter().sum::(); - if leftover > 0 { - smallest_remove = 1; - } - let log = (smallest_remove as f32).log2().ceil() as usize; + let log = { + if leftover > 0 { + 0 + } + else { + (smallest_remove as f32).log2().ceil() as usize + } + }; bound = std::cmp::max(bound, total_bonds - bound_temp - leftover - log); } bound } + + pub fn split_bound(&self, subgraph: &BitSet, fragments: &[BitSet]) -> usize { + let mut subs = vec![BitSet::with_capacity(self.len()); fragments.len()]; + for v in subgraph.iter() { + let dup = &self.matches[v]; + let bond = dup.1.iter().next().unwrap(); + let mut j = 0; + while !fragments[j].contains(bond) { + j += 1; + } + + subs[j].insert(v); + } + + subs.iter().map(|g| + { + if g.len() <= 10 { + self.savings_ground_truth(&g) + } + else { + self.cover_bound(&g, true) + } + }).sum() + } } pub fn naive_assembly_depth(mol: &Molecule) -> u32 { @@ -587,6 +615,7 @@ fn recurse_clique_index_search(mol: &Molecule, subgraph: BitSet, matches_graph: &CGraph, depth: usize, + must_include: &Vec, ) -> usize { if subgraph.len() == 0 { return ix; @@ -596,6 +625,9 @@ fn recurse_clique_index_search(mol: &Molecule, *states_searched += 1; // Branch and Bound + if ix >= best + matches_graph.frag_bound(&subgraph, fragments) { + return ix; + } for bound_type in bounds { let exceeds = match bound_type { Bound::Log => false, //ix - log_bound(fragments) >= best, @@ -615,9 +647,6 @@ fn recurse_clique_index_search(mol: &Molecule, /*if ix >= best + matches_graph.color_bound(&subgraph) { return ix; }*/ - if ix >= best + matches_graph.frag_bound(&subgraph, fragments) { - return ix; - } if ix >= best + matches_graph.cover_bound(&subgraph, false) { return ix; } @@ -634,7 +663,8 @@ fn recurse_clique_index_search(mol: &Molecule, println!("Small Vec: {}", vec_bound_small_frags(fragments, largest_remove, mol)); println!("Color: {}", matches_graph.color_bound(&subgraph)); println!("Cover: {}", matches_graph.cover_bound(&subgraph, false)); - println!("Cover Sort: {}\n", matches_graph.cover_bound(&subgraph, true));*/ + println!("Cover Sort: {}", matches_graph.cover_bound(&subgraph, true)); + println!("Split: {}\n", matches_graph.split_bound(&subgraph, fragments));*/ // Search for duplicatable fragment for v in subgraph.iter() { @@ -677,29 +707,11 @@ fn recurse_clique_index_search(mol: &Molecule, fractures.push(h1.clone()); let mut subgraph_clone = matches_graph.forward_neighbors(v, &subgraph); - /*if depth == 1 { - /*if subgraph_clone.len() >= 260 { - let start = Instant::now(); - subgraph_clone = kernelize(matches_graph, subgraph_clone); - let dur = start.elapsed(); - println!("Kernel Time: {:?}", dur); - }*/ - //println!("{}, {}", subgraph_clone.len(), matches_graph.density(&subgraph_clone)); - /*if subgraph_clone.len() >= 10 && matches_graph.density(&subgraph_clone) >= 0.6 { - println!("{}", subgraph_clone.len()); - println!("{:?}", subgraph_clone); - println!("{:?}", subgraph_clone.iter().map(|v| matches_graph.weights[v]).collect::>()); - for (edges, v) in subgraph_clone.iter().map(|v| &matches_graph.graph[v]).zip(subgraph_clone.iter()) { - let mut edges_clone = edges.clone(); - edges_clone.intersect_with(&subgraph_clone); - edges_clone.insert(v); - println!("{:?}", edges_clone.symmetric_difference(&subgraph_clone).collect::>()); - } - - std::process::exit(1); - }*/ - subgraph_clone = kernelize_fast(matches_graph, subgraph_clone); - }*/ + let mut must_include_clone = must_include.clone(); + if depth == 1 { + subgraph_clone = deletion_kernel(matches_graph, subgraph_clone); + must_include_clone.append(&mut inclusion_kernel(matches_graph, &subgraph_clone)); + } cx = cx.min(recurse_clique_index_search( mol, @@ -711,8 +723,13 @@ fn recurse_clique_index_search(mol: &Molecule, subgraph_clone, matches_graph, depth + 1, + &must_include_clone, )); best = best.min(cx); + + if must_include.contains(&v) { + return cx; + } } cx @@ -735,10 +752,15 @@ pub fn clique_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize // Kernelization //let start = Instant::now(); - subgraph = kernelize_fast(&matches_graph, subgraph); + subgraph = deletion_kernel(&matches_graph, subgraph); //let dur = start.elapsed(); //println!("Kernel Time: {:?}", dur); + /*if test2(&matches_graph, &subgraph) != -1 { + println!("!!!"); + } + std::process::exit(1);*/ + // Search let mut total_search = 0; let mut init = BitSet::new(); @@ -756,10 +778,11 @@ pub fn clique_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize &mut total_search, subgraph.clone(), &matches_graph, - 1); + 1, + &vec![]); let dur = start.elapsed(); - //println!("Search Time: {:?}", dur); + println!("Search Time: {:?}", dur); (index as u32, num_matches as u32, total_search) } @@ -774,7 +797,7 @@ pub fn clique_index_search_bench(mol: &Molecule, matches: Vec<(BitSet, BitSet)>) for i in 0..num_matches { subgraph.insert(i); } - subgraph = kernelize_fast(&matches_graph, subgraph); + subgraph = deletion_kernel(&matches_graph, subgraph); // Search let mut total_search = 0; @@ -796,12 +819,13 @@ pub fn clique_index_search_bench(mol: &Molecule, matches: Vec<(BitSet, BitSet)>) &mut total_search, subgraph.clone(), &matches_graph, - 1); + 1, + &vec![]); (index as u32, num_matches as u32, total_search) } -fn kernelize(g: &CGraph, mut subgraph: BitSet) -> BitSet { +fn deletion_kernel(g: &CGraph, mut subgraph: BitSet) -> BitSet { let mut count = 0; let subgraph_copy = subgraph.clone(); @@ -809,11 +833,16 @@ fn kernelize(g: &CGraph, mut subgraph: BitSet) -> BitSet { let v_val = g.weights[v]; let neighbors_v = g.neighbors(v, &subgraph); - let Some(w) = neighbors_v.iter().next() else { + let Some(w1) = neighbors_v.iter().next() else { + continue; + }; + let Some(w2) = neighbors_v.iter().last() else { continue; }; - for u in g.graph[w].intersection(&subgraph) { + let mut s = subgraph.clone(); + s.intersect_with(&g.graph[w1]); + for u in s.intersection(&&g.graph[w2]) { if g.are_adjacent(v, u) || v == u { continue; } @@ -837,47 +866,44 @@ fn kernelize(g: &CGraph, mut subgraph: BitSet) -> BitSet { subgraph } -fn kernelize_fast(g: &CGraph, mut subgraph: BitSet) -> BitSet { - let mut count = 0; - let subgraph_copy = subgraph.clone(); - - for v in subgraph_copy.iter() { - let v_val = g.weights[v]; - let neighbors_v = g.neighbors(v, &subgraph); +fn inclusion_kernel(g: &CGraph, subgraph: &BitSet) -> Vec { + let mut kernel = Vec::new(); + let tot = subgraph.iter().map(|v| g.weights[v]).sum::(); - let Some(w1) = neighbors_v.iter().next() else { + 'outer: for v in subgraph { + let vw = g.weights[v]; + let nw = g.graph[v].iter().map(|u| g.weights[u]).sum::(); + if vw >= tot - nw - vw { + kernel.push(v); continue; - }; - let Some(w2) = neighbors_v.iter().last() else { - continue; - }; + } - let mut s = subgraph.clone(); - s.intersect_with(&g.graph[w1]); - for u in s.intersection(&&g.graph[w2]) { - if g.are_adjacent(v, u) || v == u { - continue; - } + let mut neighbors: Vec = vec![]; - let u_val = g.weights[u]; - if v_val > u_val { + for u in subgraph.difference(&g.graph[v]) { + if u == v { continue; } + if g.weights[u] > vw { + continue 'outer; + } - let neighbors_u = g.neighbors(u, &subgraph); - - if neighbors_v.is_subset(&neighbors_u) { - count += 1; - subgraph.remove(v); - break; + for w in neighbors.iter() { + if g.are_adjacent(u, *w) { + continue 'outer; + } } + + neighbors.push(u); } + + kernel.push(v); } - //println!("Reduce count: {}", count); - subgraph + kernel } + #[allow(clippy::too_many_arguments)] fn parallel_recurse_index_search( mol: &Molecule, From 83940fe8e22b804695d592337fb6f4ff06399d62 Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Wed, 9 Jul 2025 14:23:00 -0700 Subject: [PATCH 31/32] Add command line args --- benches/search_benchmark.rs | 4 +- src/assembly.rs | 190 +++++++++++------------------------- src/main.rs | 46 ++++++++- 3 files changed, 101 insertions(+), 139 deletions(-) diff --git a/benches/search_benchmark.rs b/benches/search_benchmark.rs index d534104b..cce2d4df 100644 --- a/benches/search_benchmark.rs +++ b/benches/search_benchmark.rs @@ -44,7 +44,7 @@ pub fn reference_datasets(c: &mut Criterion) { for mol in mol_list.iter() { let matches: Vec<(BitSet, BitSet)> = mol.matches().collect(); let start = Instant::now(); - clique_index_search_bench(mol, matches); + clique_index_search_bench(mol, matches, assembly_theory::assembly::Kernel::Once); total += start.elapsed() } } @@ -60,7 +60,7 @@ pub fn reference_datasets(c: &mut Criterion) { criterion_group! { name = benchmark; - config = Criterion::default().sample_size(20); + config = Criterion::default().sample_size(10); targets = reference_datasets } criterion_main!(benchmark); diff --git a/src/assembly.rs b/src/assembly.rs index 0688bc20..27a8e99b 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -22,24 +22,22 @@ use std::{ } }; -use std::time::Instant; - use bit_set::BitSet; use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator}; use crate::{ - molecule::Bond, molecule::Element, molecule::Molecule, utils::connected_components_under_edges, + molecule::Bond, molecule::Element, molecule::Molecule, utils::connected_components_under_edges }; +static PARALLEL_MATCH_SIZE_THRESHOLD: usize = 100; +static ADD_CHAIN: &[usize] = &[0, 1, 2, 2, 3, 3, 4, 3, 4, 4]; + #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] struct EdgeType { bond: Bond, ends: (Element, Element), } -static PARALLEL_MATCH_SIZE_THRESHOLD: usize = 100; -static ADD_CHAIN: &[usize] = &[0, 1, 2, 2, 3, 3, 4, 3, 4, 4]; - /// Enum to represent the different bounds available during the computation of molecular assembly /// indices. /// Bounds are used by `index_search()` to speed up assembly index computations. @@ -56,16 +54,29 @@ pub enum Bound { /// 'VecChainSmallFrags' bounds using information on the number of fragments of size 2 in the /// molecule VecChainSmallFrags, + Weight, + Color, + CoverNoSort, + CoverSort, + Fragment, +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] +pub enum Kernel { + Never, + Once, + Depth1, + All } #[derive(Debug)] -struct CGraph { +struct CompatGraph { graph: Vec, weights: Vec, matches: Vec<(BitSet, BitSet)>, } -impl CGraph { +impl CompatGraph { pub fn new(mut init_matches: Vec<(BitSet, BitSet)>) -> Self{ let size = init_matches.len(); init_matches.sort_by(|e1, e2| e2.0.len().cmp(&e1.0.len())); @@ -147,18 +158,6 @@ impl CGraph { self.graph[v].intersection(subgraph).count() } - pub fn degree_dist(&self) -> Vec { - (0..self.len()).map(|v| self.graph[v].iter().count()).collect() - } - - pub fn density(&self, subgraph: &BitSet) -> f32 { - let n = subgraph.len() as f32; - let deg_sum = subgraph.iter().map(|v| self.graph[v].intersection(subgraph).count()).sum::() as f32; - - deg_sum / (n * (n + 1_f32)) - } - - // Returns a safely mutable set of neighbors of v in subgraph. pub fn neighbors(&self, v: usize, subgraph: &BitSet) -> BitSet { let mut neighbors = self.graph[v].clone(); neighbors.intersect_with(subgraph); @@ -244,9 +243,6 @@ impl CGraph { colors[v] = max_idx as i32; } - //println!("{} ", largest.iter().sum::()); - //println!("{:?}", self.graph.iter().map(|x| x.len()).collect::>()); - //println!("{:?}", colors); largest.iter().sum::() } @@ -254,7 +250,7 @@ impl CGraph { pub fn cover_bound(&self, subgraph: &BitSet, sort: bool) -> usize { // Sort vertices if sort { - let mut vertices: Vec<(usize, usize)> = Vec::with_capacity(subgraph.len());; + let mut vertices: Vec<(usize, usize)> = Vec::with_capacity(subgraph.len()); for v in subgraph { vertices.push((v, self.degree(v, subgraph))); } @@ -337,21 +333,6 @@ impl CGraph { vec }; - - /*let mut removes: Vec> = vec![Vec::new(); fragments.len()]; - for v in subgraph { - let m = &self.matches[v]; - let b = m.1.iter().next().unwrap(); - let mut j = 0; - while !fragments[j].contains(b) { - j += 1; - } - - removes[j].push(self.weights[v] + 1); - } - let num_bonds: Vec = fragments.iter().map(|x| x.len()).collect(); - println!("{:?}", num_bonds); - println!("{:?}", removes);*/ for i in sizes { let mut bound_temp = 0; @@ -401,30 +382,6 @@ impl CGraph { bound } - - pub fn split_bound(&self, subgraph: &BitSet, fragments: &[BitSet]) -> usize { - let mut subs = vec![BitSet::with_capacity(self.len()); fragments.len()]; - for v in subgraph.iter() { - let dup = &self.matches[v]; - let bond = dup.1.iter().next().unwrap(); - let mut j = 0; - while !fragments[j].contains(bond) { - j += 1; - } - - subs[j].insert(v); - } - - subs.iter().map(|g| - { - if g.len() <= 10 { - self.savings_ground_truth(&g) - } - else { - self.cover_bound(&g, true) - } - }).sum() - } } pub fn naive_assembly_depth(mol: &Molecule) -> u32 { @@ -540,6 +497,7 @@ fn recurse_index_search( Bound::VecChainSmallFrags => { ix - vec_bound_small_frags(fragments, largest_remove, mol) >= best } + _ => false }; if exceeds { return ix; @@ -613,9 +571,10 @@ fn recurse_clique_index_search(mol: &Molecule, bounds: &[Bound], states_searched: &mut usize, subgraph: BitSet, - matches_graph: &CGraph, + matches_graph: &CompatGraph, depth: usize, must_include: &Vec, + kernel_method: &Kernel, ) -> usize { if subgraph.len() == 0 { return ix; @@ -624,47 +583,28 @@ fn recurse_clique_index_search(mol: &Molecule, let largest_remove = matches_graph.weights[subgraph.iter().next().unwrap()] + 1; *states_searched += 1; - // Branch and Bound - if ix >= best + matches_graph.frag_bound(&subgraph, fragments) { - return ix; - } + // Bounds for bound_type in bounds { let exceeds = match bound_type { - Bound::Log => false, //ix - log_bound(fragments) >= best, + Bound::Log => ix - log_bound(fragments) >= best, Bound::IntChain => ix - addition_bound(fragments, largest_remove) >= best, Bound::VecChainSimple => ix - vec_bound_simple(fragments, largest_remove, mol) >= best, Bound::VecChainSmallFrags => { ix - vec_bound_small_frags(fragments, largest_remove, mol) >= best - } + }, + Bound::Weight => { + ix >= best + subgraph.iter().count() && + ix >= best + matches_graph.remaining_weight_bound(&subgraph) + }, + Bound::Color => ix >= best + matches_graph.color_bound(&subgraph), + Bound::CoverNoSort => ix >= best + matches_graph.cover_bound(&subgraph, false), + Bound::CoverSort => ix >= best + matches_graph.cover_bound(&subgraph, true), + Bound::Fragment => ix >= best + matches_graph.frag_bound(&subgraph, fragments), }; if exceeds { return ix; } } - /*if ix >= best + subgraph.iter().count() && ix >= best + matches_graph.remaining_weight_bound(&subgraph) { - return ix; - }*/ - /*if ix >= best + matches_graph.color_bound(&subgraph) { - return ix; - }*/ - if ix >= best + matches_graph.cover_bound(&subgraph, false) { - return ix; - } - if ix >= best + matches_graph.cover_bound(&subgraph, true) { - return ix; - } - - /*println!("Neccesary: {}", {if ix >= best {ix - best} else { 0 }}); - //println!("Ground Truth: {}", matches_graph.savings_ground_truth(&subgraph)); - println!("Weight Sum: {}", matches_graph.remaining_weight_bound(&subgraph)); - println!("Add: {}", addition_bound(fragments, largest_remove)); - println!("Frag: {}", matches_graph.frag_bound(&subgraph, fragments)); - println!("Vec: {}", vec_bound_simple(fragments, largest_remove, mol)); - println!("Small Vec: {}", vec_bound_small_frags(fragments, largest_remove, mol)); - println!("Color: {}", matches_graph.color_bound(&subgraph)); - println!("Cover: {}", matches_graph.cover_bound(&subgraph, false)); - println!("Cover Sort: {}", matches_graph.cover_bound(&subgraph, true)); - println!("Split: {}\n", matches_graph.split_bound(&subgraph, fragments));*/ // Search for duplicatable fragment for v in subgraph.iter() { @@ -708,7 +648,9 @@ fn recurse_clique_index_search(mol: &Molecule, let mut subgraph_clone = matches_graph.forward_neighbors(v, &subgraph); let mut must_include_clone = must_include.clone(); - if depth == 1 { + + // Kernelize + if *kernel_method == Kernel::All || (*kernel_method == Kernel:: Depth1 && depth == 1) { subgraph_clone = deletion_kernel(matches_graph, subgraph_clone); must_include_clone.append(&mut inclusion_kernel(matches_graph, &subgraph_clone)); } @@ -724,6 +666,7 @@ fn recurse_clique_index_search(mol: &Molecule, matches_graph, depth + 1, &must_include_clone, + kernel_method, )); best = best.min(cx); @@ -735,15 +678,11 @@ fn recurse_clique_index_search(mol: &Molecule, cx } -pub fn clique_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize) { +pub fn clique_index_search(mol: &Molecule, bounds: &[Bound], kernel_method: Kernel) -> (u32, u32, usize) { // Graph Initialization let matches: Vec<(BitSet, BitSet)> = mol.matches().collect(); let num_matches = matches.len(); - - let start = Instant::now(); - let matches_graph = CGraph::new(matches); - //let dur = start.elapsed(); - //println!("Graph Time: {:?}", dur); + let matches_graph = CompatGraph::new(matches); let mut subgraph = BitSet::with_capacity(num_matches); for i in 0..num_matches { @@ -751,15 +690,9 @@ pub fn clique_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize } // Kernelization - //let start = Instant::now(); - subgraph = deletion_kernel(&matches_graph, subgraph); - //let dur = start.elapsed(); - //println!("Kernel Time: {:?}", dur); - - /*if test2(&matches_graph, &subgraph) != -1 { - println!("!!!"); + if kernel_method != Kernel::Never { + subgraph = deletion_kernel(&matches_graph, subgraph); } - std::process::exit(1);*/ // Search let mut total_search = 0; @@ -767,8 +700,6 @@ pub fn clique_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize init.extend(mol.graph().edge_indices().map(|ix| ix.index())); let edge_count = mol.graph().edge_count(); - //let start = Instant::now(); - let index = recurse_clique_index_search( mol, &[init], @@ -779,25 +710,26 @@ pub fn clique_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize subgraph.clone(), &matches_graph, 1, - &vec![]); - - let dur = start.elapsed(); - println!("Search Time: {:?}", dur); + &vec![], + &kernel_method,); (index as u32, num_matches as u32, total_search) } -pub fn clique_index_search_bench(mol: &Molecule, matches: Vec<(BitSet, BitSet)>) -> (u32, u32, usize) { +pub fn clique_index_search_bench(mol: &Molecule, matches: Vec<(BitSet, BitSet)>, kernel_method: Kernel) -> (u32, u32, usize) { // Graph Initialization let num_matches = matches.len(); - let matches_graph = CGraph::new(matches); + let matches_graph = CompatGraph::new(matches); - // Kernelization let mut subgraph = BitSet::with_capacity(num_matches); for i in 0..num_matches { subgraph.insert(i); } - subgraph = deletion_kernel(&matches_graph, subgraph); + + // Kernelization + if kernel_method != Kernel::Never { + subgraph = deletion_kernel(&matches_graph, subgraph); + } // Search let mut total_search = 0; @@ -820,13 +752,13 @@ pub fn clique_index_search_bench(mol: &Molecule, matches: Vec<(BitSet, BitSet)>) subgraph.clone(), &matches_graph, 1, - &vec![]); + &vec![], + &kernel_method,); (index as u32, num_matches as u32, total_search) } -fn deletion_kernel(g: &CGraph, mut subgraph: BitSet) -> BitSet { - let mut count = 0; +fn deletion_kernel(g: &CompatGraph, mut subgraph: BitSet) -> BitSet { let subgraph_copy = subgraph.clone(); for v in subgraph_copy.iter() { @@ -855,24 +787,22 @@ fn deletion_kernel(g: &CGraph, mut subgraph: BitSet) -> BitSet { let neighbors_u = g.neighbors(u, &subgraph); if neighbors_v.is_subset(&neighbors_u) { - count += 1; subgraph.remove(v); break; } } } - //println!("Reduce count: {}", count); subgraph } -fn inclusion_kernel(g: &CGraph, subgraph: &BitSet) -> Vec { +fn inclusion_kernel(g: &CompatGraph, subgraph: &BitSet) -> Vec { let mut kernel = Vec::new(); let tot = subgraph.iter().map(|v| g.weights[v]).sum::(); 'outer: for v in subgraph { let vw = g.weights[v]; - let nw = g.graph[v].iter().map(|u| g.weights[u]).sum::(); + let nw = g.neighbors(v, subgraph).iter().map(|u| g.weights[u]).sum::(); if vw >= tot - nw - vw { kernel.push(v); continue; @@ -928,7 +858,8 @@ fn parallel_recurse_index_search( Bound::VecChainSimple => ix - vec_bound_simple(fragments, largest_remove, mol) >= best, Bound::VecChainSmallFrags => { ix - vec_bound_small_frags(fragments, largest_remove, mol) >= best - } + }, + _ => false }; if exceeds { return ix; @@ -1108,8 +1039,6 @@ pub fn serial_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize let edge_count = mol.graph().edge_count(); let mut total_search = 0; - let start = Instant::now(); - let index = recurse_index_search( mol, &matches, @@ -1122,9 +1051,6 @@ pub fn serial_index_search(mol: &Molecule, bounds: &[Bound]) -> (u32, u32, usize -1 ); - let dur = start.elapsed(); - //println!("Search Time: {:?}", dur); - (index as u32, matches.len() as u32, total_search) } diff --git a/src/main.rs b/src/main.rs index 08daca99..7bba7049 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,7 +2,7 @@ use std::fs; use std::path::PathBuf; use anyhow::{bail, Context, Result}; -use assembly_theory::assembly::{index_search, serial_index_search, Bound}; +use assembly_theory::assembly::{clique_index_search, index_search, serial_index_search, Bound, Kernel}; use assembly_theory::{loader, molecule::Molecule}; use clap::{Args, Parser, ValueEnum}; @@ -11,6 +11,18 @@ enum Bounds { Log, IntChain, VecChain, + Weight, + Color, + Cover, + Fragment, +} + +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, ValueEnum, Ord, Debug)] +pub enum KernelOption { + Never, + Once, + Depth1, + All, } #[derive(Parser, Debug)] @@ -32,6 +44,12 @@ struct Cli { #[arg(long)] /// Disable all parallelism serial: bool, + + #[arg(long)] + kernel_method: Option, + + #[arg(long)] + no_clique: bool, } #[derive(Args, Debug)] @@ -53,17 +71,23 @@ fn make_boundlist(u: &[Bounds]) -> Vec { Bounds::Log => vec![Bound::Log], Bounds::IntChain => vec![Bound::IntChain], Bounds::VecChain => vec![Bound::VecChainSimple, Bound::VecChainSmallFrags], + Bounds::Weight => vec![Bound::Weight], + Bounds::Color => vec![Bound::Color], + Bounds::Cover => vec![Bound::CoverNoSort, Bound::CoverSort], + Bounds::Fragment => vec![Bound::Fragment], }) .collect::>(); boundlist.dedup(); boundlist } -fn index_message(mol: &Molecule, bounds: &[Bound], verbose: bool, serial: bool) -> String { +fn index_message(mol: &Molecule, bounds: &[Bound], verbose: bool, serial: bool, no_clique: bool, kernel: Kernel) -> String { let (index, duplicates, space) = if serial { serial_index_search(mol, bounds) - } else { + } else if no_clique { index_search(mol, bounds) + } else { + clique_index_search(mol, bounds, kernel) }; if verbose { let mut message = String::new(); @@ -89,6 +113,16 @@ fn main() -> Result<()> { return Ok(()); } + let kernel = match cli.kernel_method { + Some(x) => match x { + KernelOption::Never => Kernel::Never, + KernelOption::Once => Kernel::Once, + KernelOption::Depth1 => Kernel::Depth1, + KernelOption::All => Kernel::All, + } + None => Kernel::Once, + }; + let output = match cli.boundgroup { None => index_message( &molecule, @@ -99,14 +133,16 @@ fn main() -> Result<()> { ], cli.verbose, cli.serial, + cli.no_clique, + kernel ), Some(BoundGroup { no_bounds: true, .. - }) => index_message(&molecule, &[], cli.verbose, cli.serial), + }) => index_message(&molecule, &[], cli.verbose, cli.serial, cli.no_clique, kernel), Some(BoundGroup { no_bounds: false, bounds, - }) => index_message(&molecule, &make_boundlist(&bounds), cli.verbose, cli.serial), + }) => index_message(&molecule, &make_boundlist(&bounds), cli.verbose, cli.serial, cli.no_clique, kernel), }; println!("{output}"); From 0df4f3c6621cf14f63761d7cbcaad50683fc51dd Mon Sep 17 00:00:00 2001 From: Garrett-Pz Date: Wed, 9 Jul 2025 15:04:57 -0700 Subject: [PATCH 32/32] Reorganizing --- benches/search_benchmark.rs | 8 ++++++-- src/assembly.rs | 13 ++++--------- src/main.rs | 13 +++++++------ 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/benches/search_benchmark.rs b/benches/search_benchmark.rs index cce2d4df..8b4534e2 100644 --- a/benches/search_benchmark.rs +++ b/benches/search_benchmark.rs @@ -6,7 +6,7 @@ use std::fs; use std::path::Path; use assembly_theory::{ - assembly::{clique_index_search_bench}, + assembly::{clique_index_search_bench, Kernel, Bound}, loader, molecule::Molecule, }; @@ -17,6 +17,10 @@ pub fn reference_datasets(c: &mut Criterion) { // Define datasets, bounds, and labels. let datasets = ["gdb13_1201", "gdb17_200", "coconut_55"]; + let bounds = [ + Bound::IntChain, + ]; + let kernel = Kernel::Never; // Loop over all datasets of interest. for dataset in datasets.iter() { @@ -44,7 +48,7 @@ pub fn reference_datasets(c: &mut Criterion) { for mol in mol_list.iter() { let matches: Vec<(BitSet, BitSet)> = mol.matches().collect(); let start = Instant::now(); - clique_index_search_bench(mol, matches, assembly_theory::assembly::Kernel::Once); + clique_index_search_bench(mol, matches, &bounds, kernel); total += start.elapsed() } } diff --git a/src/assembly.rs b/src/assembly.rs index 27a8e99b..50d19ecb 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -115,7 +115,7 @@ impl CompatGraph { } } - pub fn savings_ground_truth(&self, subgraph: &BitSet) -> usize { + /*pub fn savings_ground_truth(&self, subgraph: &BitSet) -> usize { self.savings_ground_truth_recurse(0, 0, subgraph) } @@ -148,7 +148,7 @@ impl CompatGraph { } cx - } + }*/ pub fn len(&self) -> usize { self.matches.len() @@ -716,7 +716,7 @@ pub fn clique_index_search(mol: &Molecule, bounds: &[Bound], kernel_method: Kern (index as u32, num_matches as u32, total_search) } -pub fn clique_index_search_bench(mol: &Molecule, matches: Vec<(BitSet, BitSet)>, kernel_method: Kernel) -> (u32, u32, usize) { +pub fn clique_index_search_bench(mol: &Molecule, matches: Vec<(BitSet, BitSet)>, bounds: &[Bound], kernel_method: Kernel) -> (u32, u32, usize) { // Graph Initialization let num_matches = matches.len(); let matches_graph = CompatGraph::new(matches); @@ -736,11 +736,6 @@ pub fn clique_index_search_bench(mol: &Molecule, matches: Vec<(BitSet, BitSet)>, let mut init = BitSet::new(); init.extend(mol.graph().edge_indices().map(|ix| ix.index())); let edge_count = mol.graph().edge_count(); - let bounds = vec![ - Bound::IntChain, - Bound::VecChainSimple, - Bound::VecChainSmallFrags, - ]; let index = recurse_clique_index_search( mol, @@ -1076,7 +1071,7 @@ fn addition_bound(fragments: &[BitSet], m: usize) -> usize { // Test for all sizes m of largest removed duplicate for max in 2..m + 1 { let log = { - if max <= 10 { + if max <= ADD_CHAIN.len() { ADD_CHAIN[max - 1] } else { diff --git a/src/main.rs b/src/main.rs index 7bba7049..11f05986 100644 --- a/src/main.rs +++ b/src/main.rs @@ -114,12 +114,10 @@ fn main() -> Result<()> { } let kernel = match cli.kernel_method { - Some(x) => match x { - KernelOption::Never => Kernel::Never, - KernelOption::Once => Kernel::Once, - KernelOption::Depth1 => Kernel::Depth1, - KernelOption::All => Kernel::All, - } + Some(KernelOption::Never) => Kernel::Never, + Some(KernelOption::Once) => Kernel::Once, + Some(KernelOption::Depth1) => Kernel::Depth1, + Some(KernelOption::All) => Kernel::All, None => Kernel::Once, }; @@ -127,9 +125,12 @@ fn main() -> Result<()> { None => index_message( &molecule, &[ + Bound::Fragment, Bound::IntChain, Bound::VecChainSimple, Bound::VecChainSmallFrags, + Bound::CoverNoSort, + Bound::CoverSort, ], cli.verbose, cli.serial,