diff --git a/Cargo.lock b/Cargo.lock index 5e55b0c231..24703ff166 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -768,6 +768,26 @@ dependencies = [ "serde", ] +[[package]] +name = "bincode" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36eaf5d7b090263e8150820482d5d93cd964a81e4019913c972f4edcc6edb740" +dependencies = [ + "bincode_derive", + "serde", + "unty", +] + +[[package]] +name = "bincode_derive" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf95709a440f45e986983918d0e8a1f30a9b1df04918fc828670606804ac3c09" +dependencies = [ + "virtue", +] + [[package]] name = "bit-set" version = "0.8.0" @@ -2114,7 +2134,7 @@ dependencies = [ "arrow-csv", "arrow-schema", "bigdecimal", - "bincode", + "bincode 2.0.1", "bitvec", "boxcar", "bytemuck", @@ -2137,6 +2157,7 @@ dependencies = [ "sysinfo", "tempfile", "thiserror 2.0.17", + "tinyvec", ] [[package]] @@ -2884,7 +2905,7 @@ version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c255bdf46e07fb840d120a36dcc81f385140d7191c76a7391672675c01a55d" dependencies = [ - "bincode", + "bincode 1.3.3", "byteorder", "heed-traits", "serde", @@ -5066,7 +5087,7 @@ dependencies = [ "async-openai", "async-trait", "bigdecimal", - "bincode", + "bincode 2.0.1", "bytemuck", "bzip2 0.4.4", "chrono", @@ -5128,6 +5149,7 @@ dependencies = [ "tantivy", "tempfile", "thiserror 2.0.17", + "tikv-jemallocator", "tokio", "tracing", "uuid", @@ -6499,6 +6521,26 @@ dependencies = [ "ordered-float 2.10.1", ] +[[package]] +name = "tikv-jemalloc-sys" +version = "0.6.1+5.3.0-1-ge13ca993e8ccb9ba9847cc330696e02839f328f7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd8aa5b2ab86a2cefa406d889139c162cbb230092f7d1d7cbc1716405d852a3b" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "tikv-jemallocator" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0359b4327f954e0567e69fb191cf1436617748813819c94b8cd4a431422d053a" +dependencies = [ + "libc", + "tikv-jemalloc-sys", +] + [[package]] name = "time" version = "0.3.44" @@ -6565,6 +6607,7 @@ version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa" dependencies = [ + "serde", "tinyvec_macros", ] @@ -6992,6 +7035,12 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" +[[package]] +name = "unty" +version = "0.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d49784317cd0d1ee7ec5c716dd598ec5b4483ea832a2dced265471cc0f690ae" + [[package]] name = "url" version = "2.5.7" @@ -7052,6 +7101,12 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "virtue" +version = "0.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "051eb1abcf10076295e815102942cc58f9d5e3b4560e46e53c21e8ff6f3af7b1" + [[package]] name = "wait-timeout" version = "0.2.1" diff --git a/Cargo.toml b/Cargo.toml index 14cd14d775..2c4826268e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -57,7 +57,7 @@ raphtory-core = { version = "0.16.2", path = "raphtory-core", default-features = raphtory-graphql = { version = "0.16.2", path = "raphtory-graphql", default-features = false } raphtory-storage = { version = "0.16.2", path = "raphtory-storage", default-features = false } async-graphql = { version = "7.0.16", features = ["dynamic-schema"] } -bincode = "1.3.3" +bincode = {version = "2", features = ["serde"]} async-graphql-poem = "7.0.16" dynamic-graphql = "0.10.1" derive_more = "2.0.1" @@ -100,6 +100,7 @@ num-integer = "0.1" rand_distr = "0.5.1" rustc-hash = "2.0.0" twox-hash = "2.1.0" +tinyvec = { version = "1.10", features = ["serde", "alloc"] } lock_api = { version = "0.4.11", features = ["arc_lock", "serde"] } dashmap = { version = "6.0.1", features = ["serde", "rayon"] } glam = "0.29.0" diff --git a/db4-graph/src/lib.rs b/db4-graph/src/lib.rs index 7b7e8b7fa2..1a9f06289b 100644 --- a/db4-graph/src/lib.rs +++ b/db4-graph/src/lib.rs @@ -21,7 +21,7 @@ use storage::{ layer_counter::GraphStats, locked::{ edges::WriteLockedEdgePages, graph_props::WriteLockedGraphPropPages, - nodes::WriteLockedNodePages, + nodes::WriteLockedNodeSegments, }, }, persist::strategy::{Config, PersistentStrategy}, @@ -35,7 +35,7 @@ use tempfile::TempDir; pub struct TemporalGraph { // mapping between logical and physical ids pub logical_to_physical: Arc, - pub node_count: AtomicUsize, + pub event_counter: AtomicUsize, storage: Arc>, graph_dir: Option, pub transaction_manager: Arc, @@ -152,14 +152,13 @@ impl, ES = ES, GS = GS>> Temporal let gid_resolver_dir = path.join("gid_resolver"); let resolver = GIDResolver::new_with_path(&gid_resolver_dir)?; - let node_count = AtomicUsize::new(storage.nodes().num_nodes()); let wal_dir = path.join("wal"); let wal = Arc::new(WalImpl::new(Some(wal_dir))?); Ok(Self { graph_dir: Some(path.into()), + event_counter: AtomicUsize::new(resolver.len()), logical_to_physical: resolver.into(), - node_count, storage: Arc::new(storage), transaction_manager: Arc::new(TransactionManager::new(wal.clone())), wal, @@ -205,9 +204,9 @@ impl, ES = ES, GS = GS>> Temporal Ok(Self { graph_dir, logical_to_physical, - node_count: AtomicUsize::new(0), storage: Arc::new(storage), transaction_manager: Arc::new(TransactionManager::new(wal.clone())), + event_counter: AtomicUsize::new(0), wal, }) } @@ -375,7 +374,7 @@ pub struct WriteLockedGraph<'a, EXT> where EXT: PersistentStrategy, ES = ES, GS = GS>, { - pub nodes: WriteLockedNodePages<'a, storage::NS>, + pub nodes: WriteLockedNodeSegments<'a, storage::NS>, pub edges: WriteLockedEdgePages<'a, storage::ES>, pub graph_props: WriteLockedGraphPropPages<'a, storage::GS>, pub graph: &'a TemporalGraph, @@ -397,21 +396,17 @@ impl<'a, EXT: PersistentStrategy, ES = ES, GS = GS>> self.graph } - pub fn resize_chunks_to_num_nodes(&mut self, num_nodes: usize) { - if num_nodes == 0 { - return; + pub fn resize_chunks_to_num_nodes(&mut self, max_vid: Option) { + if let Some(max_vid) = max_vid { + let (chunks_needed, _) = self.graph.storage.nodes().resolve_pos(max_vid); + self.graph.storage().nodes().grow(chunks_needed + 1); + std::mem::take(&mut self.nodes); + self.nodes = self.graph.storage.nodes().write_locked(); } - let (chunks_needed, _) = self.graph.storage.nodes().resolve_pos(VID(num_nodes - 1)); - self.graph.storage().nodes().grow(chunks_needed + 1); - std::mem::take(&mut self.nodes); - self.nodes = self.graph.storage.nodes().write_locked(); } - pub fn resize_chunks_to_num_edges(&mut self, num_edges: usize) { - if num_edges == 0 { - return; - } - let (chunks_needed, _) = self.graph.storage.edges().resolve_pos(EID(num_edges - 1)); + pub fn resize_chunks_to_num_edges(&mut self, max_eid: EID) { + let (chunks_needed, _) = self.graph.storage.edges().resolve_pos(max_eid); self.graph.storage().edges().grow(chunks_needed + 1); std::mem::take(&mut self.edges); self.edges = self.graph.storage.edges().write_locked(); diff --git a/db4-storage/Cargo.toml b/db4-storage/Cargo.toml index c3db7ff3c3..8649d81c15 100644 --- a/db4-storage/Cargo.toml +++ b/db4-storage/Cargo.toml @@ -36,6 +36,7 @@ itertools.workspace = true thiserror.workspace = true roaring.workspace = true sysinfo.workspace = true +tinyvec.workspace = true proptest = { workspace = true, optional = true } tempfile = { workspace = true, optional = true } iter-enum = { workspace = true, features = ["rayon"] } diff --git a/db4-storage/src/api/edges.rs b/db4-storage/src/api/edges.rs index 61136444cd..96a810db34 100644 --- a/db4-storage/src/api/edges.rs +++ b/db4-storage/src/api/edges.rs @@ -27,6 +27,7 @@ pub trait EdgeSegmentOps: Send + Sync + std::fmt::Debug + 'static { fn t_len(&self) -> usize; fn num_layers(&self) -> usize; + // Persistent layer count, not used for up to date counts fn layer_count(&self, layer_id: usize) -> u32; fn load( @@ -67,7 +68,10 @@ pub trait EdgeSegmentOps: Send + Sync + std::fmt::Debug + 'static { head_lock: impl DerefMut, ) -> Result<(), StorageError>; - fn increment_num_edges(&self) -> u32; + fn increment_num_edges(&self) -> u32 { + self.edges_counter() + .fetch_add(1, std::sync::atomic::Ordering::Relaxed) + } fn contains_edge( &self, diff --git a/db4-storage/src/api/nodes.rs b/db4-storage/src/api/nodes.rs index ebea776c8a..b813cbbe9b 100644 --- a/db4-storage/src/api/nodes.rs +++ b/db4-storage/src/api/nodes.rs @@ -21,13 +21,16 @@ use std::{ borrow::Cow, ops::{Deref, DerefMut, Range}, path::{Path, PathBuf}, - sync::Arc, + sync::{Arc, atomic::AtomicU32}, }; +use rayon::prelude::*; + use crate::{ LocalPOS, error::StorageError, gen_ts::LayerIter, + pages::node_store::increment_and_clamp, segments::node::segment::MemNodeSegment, utils::{Iter2, Iter3, Iter4}, }; @@ -47,12 +50,6 @@ pub trait NodeSegmentOps: Send + Sync + std::fmt::Debug + 'static { fn t_len(&self) -> usize; - fn event_id(&self) -> i64; - - fn increment_event_id(&self, i: i64); - - fn decrement_event_id(&self) -> i64; - fn load( page_id: usize, node_meta: Arc, @@ -81,14 +78,6 @@ pub trait NodeSegmentOps: Send + Sync + std::fmt::Debug + 'static { fn try_head_mut(&self) -> Option>; - fn num_nodes(&self) -> u32 { - self.layer_count(0) - } - - fn num_layers(&self) -> usize; - - fn layer_count(&self, layer_id: usize) -> u32; - fn notify_write( &self, head_lock: impl DerefMut, @@ -128,6 +117,21 @@ pub trait NodeSegmentOps: Send + Sync + std::fmt::Debug + 'static { &self, locked_head: impl DerefMut, ) -> Result<(), StorageError>; + + fn nodes_counter(&self) -> &AtomicU32; + + fn increment_num_nodes(&self, max_page_len: u32) { + increment_and_clamp(self.nodes_counter(), max_page_len); + } + + fn num_nodes(&self) -> u32 { + self.nodes_counter() + .load(std::sync::atomic::Ordering::Relaxed) + } + + fn num_layers(&self) -> usize; + + fn layer_count(&self, layer_id: usize) -> u32; } pub trait LockedNSSegment: std::fmt::Debug + Send + Sync { @@ -135,7 +139,23 @@ pub trait LockedNSSegment: std::fmt::Debug + Send + Sync { where Self: 'a; + fn num_nodes(&self) -> u32; + fn entry_ref<'a>(&'a self, pos: impl Into) -> Self::EntryRef<'a>; + + fn iter_entries<'a>(&'a self) -> impl Iterator> + Send + Sync + 'a { + let num_nodes = self.num_nodes(); + (0..num_nodes).map(move |vid| self.entry_ref(LocalPOS(vid))) + } + + fn par_iter_entries<'a>( + &'a self, + ) -> impl ParallelIterator> + Send + Sync + 'a { + let num_nodes = self.num_nodes(); + (0..num_nodes) + .into_par_iter() + .map(move |vid| self.entry_ref(LocalPOS(vid))) + } } pub trait NodeEntryOps<'a>: Send + Sync + 'a { diff --git a/db4-storage/src/gen_ts.rs b/db4-storage/src/gen_ts.rs index be9fdcfe60..70c20ce95f 100644 --- a/db4-storage/src/gen_ts.rs +++ b/db4-storage/src/gen_ts.rs @@ -41,7 +41,7 @@ impl<'a> From<&'a LayerIds> for LayerIter<'a> { pub struct GenericTimeOps<'a, Ref> { range: Option<(TimeIndexEntry, TimeIndexEntry)>, layer_id: LayerIter<'a>, - node: Ref, + item_ref: Ref, } impl<'a, Ref> GenericTimeOps<'a, Ref> { @@ -49,7 +49,7 @@ impl<'a, Ref> GenericTimeOps<'a, Ref> { Self { range: None, layer_id: layer_id.into(), - node, + item_ref: node, } } @@ -57,7 +57,7 @@ impl<'a, Ref> GenericTimeOps<'a, Ref> { Self { range: None, layer_id: layer_id.into(), - node, + item_ref: node, } } } @@ -296,9 +296,9 @@ where { pub fn edge_events(self) -> impl Iterator + Send + Sync + 'a { self.layer_id - .into_iter(self.node.num_layers()) + .into_iter(self.item_ref.num_layers()) .flat_map(move |layer_id| { - self.node + self.item_ref .additions_tc(layer_id, self.range) .map(|t_cell| t_cell.edge_events()) }) @@ -309,9 +309,9 @@ where self, ) -> impl Iterator + Send + Sync + 'a { self.layer_id - .into_iter(self.node.num_layers()) + .into_iter(self.item_ref.num_layers()) .flat_map(|layer_id| { - self.node + self.item_ref .additions_tc(layer_id, self.range) .map(|t_cell| t_cell.edge_events_rev()) }) @@ -323,12 +323,12 @@ impl<'a, Ref: WithTimeCells<'a> + 'a> GenericTimeOps<'a, Ref> { pub fn time_cells(self) -> impl Iterator + Send + Sync + 'a { let range = self.range; self.layer_id - .into_iter(self.node.num_layers()) + .into_iter(self.item_ref.num_layers()) .flat_map(move |layer_id| { - self.node.t_props_tc(layer_id, range).chain( - self.node + self.item_ref.t_props_tc(layer_id, range).chain( + self.item_ref .additions_tc(layer_id, range) - .chain(self.node.deletions_tc(layer_id, range)), + .chain(self.item_ref.deletions_tc(layer_id, range)), ) }) } @@ -356,7 +356,7 @@ impl<'a, Ref: WithTimeCells<'a> + 'a> TimeIndexOps<'a> for GenericTimeOps<'a, Re fn range(&self, w: Range) -> Self::RangeType { GenericTimeOps { range: Some((w.start, w.end)), - node: self.node, + item_ref: self.item_ref, layer_id: self.layer_id, } } diff --git a/db4-storage/src/lib.rs b/db4-storage/src/lib.rs index ebe5bf708b..0b1291e02a 100644 --- a/db4-storage/src/lib.rs +++ b/db4-storage/src/lib.rs @@ -43,6 +43,7 @@ pub mod persist; pub mod properties; pub mod resolver; pub mod segments; +pub mod state; pub mod utils; pub mod wal; diff --git a/db4-storage/src/pages/edge_page/writer.rs b/db4-storage/src/pages/edge_page/writer.rs index cde4d7fdb3..4fd665444c 100644 --- a/db4-storage/src/pages/edge_page/writer.rs +++ b/db4-storage/src/pages/edge_page/writer.rs @@ -2,12 +2,8 @@ use crate::{ LocalPOS, api::edges::EdgeSegmentOps, error::StorageError, pages::layer_counter::GraphStats, segments::edge::segment::MemEdgeSegment, }; -use arrow_array::{ArrayRef, BooleanArray}; use raphtory_api::core::entities::{VID, properties::prop::Prop}; -use raphtory_core::{ - entities::EID, - storage::timeindex::{AsTime, TimeIndexEntry}, -}; +use raphtory_core::storage::timeindex::{AsTime, TimeIndexEntry}; use std::ops::DerefMut; pub struct EdgeWriter< @@ -59,31 +55,6 @@ impl<'a, MP: DerefMut + std::fmt::Debug, ES: EdgeSegmen edge_pos } - // pub fn bulk_add_edges( - // &mut self, - // mask: &BooleanArray, - // time: &[i64], - // start_idx: usize, - // eids: &[EID], - // srcs: &[VID], - // dsts: &[VID], - // layer_id: usize, - // cols: &[ArrayRef], - // cols_prop_ids: &[usize], - // ) { - // self.writer.bulk_insert_edges_internal( - // mask, - // time, - // start_idx, - // eids, - // srcs, - // dsts, - // layer_id, - // cols, - // cols_prop_ids, - // ); - // } - pub fn delete_edge( &mut self, t: T, @@ -140,6 +111,7 @@ impl<'a, MP: DerefMut + std::fmt::Debug, ES: EdgeSegmen if !exists { self.increment_layer_num_edges(0); self.increment_layer_num_edges(layer_id); + self.page.increment_num_edges(); } self.writer diff --git a/db4-storage/src/pages/edge_store.rs b/db4-storage/src/pages/edge_store.rs index 71ed0d1be3..434f7da705 100644 --- a/db4-storage/src/pages/edge_store.rs +++ b/db4-storage/src/pages/edge_store.rs @@ -10,8 +10,10 @@ use crate::{ api::edges::{EdgeRefOps, EdgeSegmentOps, LockedESegment}, error::StorageError, pages::{ + SegmentCounts, layer_counter::GraphStats, locked::edges::{LockedEdgePage, WriteLockedEdgePages}, + row_group_par_iter, }, persist::strategy::Config, segments::edge::segment::MemEdgeSegment, @@ -484,6 +486,11 @@ impl, EXT: Config> EdgeStorageInner } } + pub fn reserve_new_eid(&self, row: usize) -> EID { + let (segment_id, local_pos) = self.reserve_free_pos(row); + local_pos.as_eid(segment_id, self.max_page_len()) + } + pub fn reserve_free_pos(&self, row: usize) -> (usize, LocalPOS) { let slot_idx = row % N; let maybe_free_page = { @@ -573,4 +580,31 @@ impl, EXT: Config> EdgeStorageInner }) }) } + + pub fn row_groups_par_iter( + &self, + ) -> impl IndexedParallelIterator + '_)> { + row_group_par_iter( + self.max_page_len() as usize, + self.segments.count(), + self.max_page_len(), + ) + .map(|(s_id, iter)| (s_id, iter.filter(|eid| self.has_eid(*eid)))) + } + + fn has_eid(&self, eid: EID) -> bool { + let (segment_id, pos) = self.resolve_pos(eid); + segment_id < self.segments.count() + && self + .segments + .get(segment_id) + .is_some_and(|s| pos.0 < s.num_edges()) + } + + pub(crate) fn segment_counts(&self) -> SegmentCounts { + SegmentCounts::new( + self.max_page_len(), + self.pages().iter().map(|(_, seg)| seg.num_edges()), + ) + } } diff --git a/db4-storage/src/pages/layer_counter.rs b/db4-storage/src/pages/layer_counter.rs index b3865ba0b1..5574c11f64 100644 --- a/db4-storage/src/pages/layer_counter.rs +++ b/db4-storage/src/pages/layer_counter.rs @@ -79,6 +79,10 @@ impl GraphStats { counter.load(std::sync::atomic::Ordering::Acquire) } + pub fn get_counter(&self, layer_id: usize) -> &AtomicUsize { + self.get_or_create_layer(layer_id) + } + fn get_or_create_layer(&self, layer_id: usize) -> &AtomicUsize { if let Some(counter) = self.layers.get(layer_id) { return counter; diff --git a/db4-storage/src/pages/locked/edges.rs b/db4-storage/src/pages/locked/edges.rs index a07f03147b..1bfe0005d3 100644 --- a/db4-storage/src/pages/locked/edges.rs +++ b/db4-storage/src/pages/locked/edges.rs @@ -119,4 +119,12 @@ impl<'a, ES: EdgeSegmentOps> WriteLockedEdgePages<'a, ES> { } Ok(()) } + + pub fn len(&self) -> usize { + self.writers.len() + } + + pub fn is_empty(&self) -> bool { + self.writers.is_empty() + } } diff --git a/db4-storage/src/pages/locked/nodes.rs b/db4-storage/src/pages/locked/nodes.rs index 48b4fd7f10..faaa782108 100644 --- a/db4-storage/src/pages/locked/nodes.rs +++ b/db4-storage/src/pages/locked/nodes.rs @@ -10,6 +10,7 @@ use raphtory_core::entities::VID; use rayon::prelude::*; use std::ops::DerefMut; +#[derive(Debug)] pub struct LockedNodePage<'a, NS> { page_id: usize, max_page_len: u32, @@ -69,11 +70,11 @@ impl<'a, NS: NodeSegmentOps> LockedNodePage<'a, NS> { } } -pub struct WriteLockedNodePages<'a, NS> { +pub struct WriteLockedNodeSegments<'a, NS> { writers: Vec>, } -impl Default for WriteLockedNodePages<'_, NS> { +impl Default for WriteLockedNodeSegments<'_, NS> { fn default() -> Self { Self { writers: Vec::new(), @@ -81,7 +82,7 @@ impl Default for WriteLockedNodePages<'_, NS> { } } -impl<'a, NS: NodeSegmentOps> WriteLockedNodePages<'a, NS> { +impl<'a, EXT, NS: NodeSegmentOps> WriteLockedNodeSegments<'a, NS> { pub fn new(writers: Vec>) -> Self { Self { writers } } diff --git a/db4-storage/src/pages/mod.rs b/db4-storage/src/pages/mod.rs index 6be52d59e5..ee8195d0d0 100644 --- a/db4-storage/src/pages/mod.rs +++ b/db4-storage/src/pages/mod.rs @@ -17,6 +17,7 @@ use raphtory_api::core::{ entities::properties::{meta::Meta, prop::Prop}, storage::dict_mapper::MaybeNew, }; +use rayon::prelude::*; use raphtory_core::{ entities::{EID, ELID, VID}, @@ -31,6 +32,7 @@ use std::{ atomic::{self, AtomicUsize}, }, }; +use tinyvec::TinyVec; pub mod edge_page; pub mod edge_store; @@ -126,6 +128,14 @@ impl< self.nodes.stats().latest().max(self.edges.stats().latest()) } + pub fn node_segment_counts(&self) -> SegmentCounts { + self.nodes.segment_counts() + } + + pub fn edge_segment_counts(&self) -> SegmentCounts { + self.edges.segment_counts() + } + pub fn load(graph_dir: impl AsRef) -> Result { let nodes_path = graph_dir.as_ref().join("nodes"); let edges_path = graph_dir.as_ref().join("edges"); @@ -433,14 +443,61 @@ impl< } } +#[derive(Debug)] +pub struct SegmentCounts { + max_seg_len: u32, + counts: TinyVec<[u32; node_store::N]>, // this might come to be a problem + _marker: std::marker::PhantomData, +} + +impl> SegmentCounts { + pub fn new(max_seg_len: u32, counts: impl IntoIterator) -> Self { + let counts: TinyVec<[u32; node_store::N]> = counts.into_iter().collect(); + + Self { + max_seg_len, + counts, + _marker: std::marker::PhantomData, + } + } + + pub fn into_iter(self) -> impl Iterator { + let max_seg_len = self.max_seg_len as usize; + self.counts.into_iter().enumerate().flat_map(move |(i, c)| { + let g_pos = i * max_seg_len as usize; + (0..c).map(move |offset| I::from(g_pos + offset as usize)) + }) + } + + pub(crate) fn counts(&self) -> &[u32] { + &self.counts + } + + pub(crate) fn max_seg_len(&self) -> u32 { + self.max_seg_len + } +} +impl + Send> SegmentCounts { + pub fn into_par_iter(self) -> impl ParallelIterator { + let max_seg_len = self.max_seg_len as usize; + (0..self.counts.len()).into_par_iter().flat_map(move |i| { + let c = self.counts[i]; + let g_pos = i * max_seg_len; + (0..c) + .into_par_iter() + .map(move |offset| I::from(g_pos + offset as usize)) + }) + } +} + impl Drop for GraphStore { fn drop(&mut self) { let node_types = self.nodes.prop_meta().get_all_node_types(); self._ext.set_node_types(node_types); - if let Some(graph_dir) = self.graph_dir.as_ref() { - if write_graph_config(graph_dir, &self._ext).is_err() { - eprintln!("Unrecoverable! Failed to write graph meta"); - } + if let Some(graph_dir) = self.graph_dir.as_ref() + && write_graph_config(graph_dir, &self._ext).is_err() + { + eprintln!("Unrecoverable! Failed to write graph meta"); } } } @@ -467,9 +524,29 @@ fn read_graph_config( #[inline(always)] pub fn resolve_pos>(i: I, max_page_len: u32) -> (usize, LocalPOS) { let i = i.into(); - let chunk = i / max_page_len as usize; + let seg = i / max_page_len as usize; let pos = i % max_page_len as usize; - (chunk, LocalPOS(pos as u32)) + (seg, LocalPOS(pos as u32)) +} + +pub fn row_group_par_iter>( + chunk_size: usize, + num_segments: usize, + max_seg_len: u32, +) -> impl IndexedParallelIterator)> { + let chunk_size = (chunk_size / num_segments).max(1); + let num_chunks = (max_seg_len as usize + chunk_size - 1) / chunk_size; + + (0..num_chunks).into_par_iter().map(move |chunk_id| { + let start = chunk_id * chunk_size; + let end = ((chunk_id + 1) * chunk_size).min(max_seg_len as usize); + + let iter = (start..end).flat_map(move |x| { + (0..num_segments).map(move |seg| I::from(seg * max_seg_len as usize + x)) + }); + + (chunk_id, iter) + }) } #[cfg(test)] @@ -488,6 +565,27 @@ mod test { use proptest::prelude::*; use raphtory_api::core::entities::properties::prop::Prop; use raphtory_core::{entities::VID, storage::timeindex::TimeIndexOps}; + use rayon::iter::ParallelIterator; + + #[test] + fn test_iterleave() { + let chunk_size = 3; + let num_segments = 3; + let max_seg_len = 4; + + let actual = super::row_group_par_iter(chunk_size, num_segments, max_seg_len) + .map(|(c, items)| (c, items.collect::>())) + .collect::>(); + + let expected = vec![ + (0, vec![0, 4, 8]), + (1, vec![1, 5, 9]), + (2, vec![2, 6, 10]), + (3, vec![3, 7, 11]), + ]; + + assert_eq!(actual, expected); + } fn check_edges(edges: Vec<(impl Into, impl Into)>, chunk_size: u32, par_load: bool) { // Set optional layer_id to None diff --git a/db4-storage/src/pages/node_page/writer.rs b/db4-storage/src/pages/node_page/writer.rs index 882c97d05e..c6312090f6 100644 --- a/db4-storage/src/pages/node_page/writer.rs +++ b/db4-storage/src/pages/node_page/writer.rs @@ -178,6 +178,7 @@ impl<'a, MP: DerefMut + 'a, NS: NodeSegmentOps> NodeWri self.page.increment_est_size(add); } + #[inline(always)] pub fn get_out_edge(&self, pos: LocalPOS, dst: VID, layer_id: usize) -> Option { self.page .get_out_edge(pos, dst, layer_id, self.mut_segment.deref()) @@ -200,13 +201,18 @@ impl<'a, MP: DerefMut + 'a, NS: NodeSegmentOps> NodeWri self.update_c_props(pos, layer_id, node_info_as_props(Some(gid), node_type), lsn); } - pub fn store_node_id(&mut self, pos: LocalPOS, layer_id: usize, gid: GidRef<'_>, lsn: u64) { - self.update_c_props(pos, layer_id, node_info_as_props(Some(gid), None), lsn); + pub fn store_node_id(&mut self, pos: LocalPOS, layer_id: usize, gid: Prop, lsn: u64) { + self.update_c_props(pos, layer_id, [(NODE_ID_IDX, gid)], lsn); } pub fn update_deletion_time(&mut self, t: T, node: LocalPOS, e_id: ELID, lsn: u64) { self.update_timestamp(t, node, e_id, lsn); } + + pub fn increment_seg_num_nodes(&mut self) { + self.page + .increment_num_nodes(self.mut_segment.max_page_len()); + } } pub fn node_info_as_props( @@ -224,7 +230,6 @@ impl<'a, MP: DerefMut + 'a, NS: NodeSegmentOps> Drop for NodeWriter<'a, MP, NS> { fn drop(&mut self) { - self.page.increment_event_id(1); self.page .notify_write(self.mut_segment.deref_mut()) .expect("Failed to persist node page"); diff --git a/db4-storage/src/pages/node_store.rs b/db4-storage/src/pages/node_store.rs index 113112a77a..ad1f7e9f2a 100644 --- a/db4-storage/src/pages/node_store.rs +++ b/db4-storage/src/pages/node_store.rs @@ -4,13 +4,15 @@ use crate::{ api::nodes::{LockedNSSegment, NodeSegmentOps}, error::StorageError, pages::{ + SegmentCounts, layer_counter::GraphStats, - locked::nodes::{LockedNodePage, WriteLockedNodePages}, + locked::nodes::{LockedNodePage, WriteLockedNodeSegments}, + row_group_par_iter, }, persist::strategy::Config, segments::node::segment::MemNodeSegment, }; -use parking_lot::RwLockWriteGuard; +use parking_lot::{RwLock, RwLockWriteGuard}; use raphtory_api::core::entities::properties::meta::Meta; use raphtory_core::{ entities::{EID, VID}, @@ -20,15 +22,17 @@ use rayon::prelude::*; use std::{ collections::HashMap, path::{Path, PathBuf}, - sync::Arc, + sync::{Arc, atomic::AtomicU32}, }; // graph // (nodes|edges) // graph segments // layers // chunks +pub const N: usize = 32; #[derive(Debug)] pub struct NodeStorageInner { - pages: boxcar::Vec>, + segments: boxcar::Vec>, stats: Arc, + free_segments: Box<[RwLock; N]>, nodes_path: Option, node_meta: Arc, edge_meta: Arc, @@ -46,18 +50,18 @@ impl, EXT: Config> ReadLockedNodeStorage, ) -> <::ArcLockedSegment as LockedNSSegment>::EntryRef<'_> { - let (page_id, pos) = self.storage.resolve_pos(node); - let locked_page = &self.locked_segments[page_id]; - locked_page.entry_ref(pos) + let (segment_id, pos) = self.storage.resolve_pos(node); + let locked_segment = &self.locked_segments[segment_id]; + locked_segment.entry_ref(pos) } pub fn try_node_ref( &self, node: VID, ) -> Option<<::ArcLockedSegment as LockedNSSegment>::EntryRef<'_>> { - let (page_id, pos) = self.storage.resolve_pos(node); - let locked_page = &self.locked_segments.get(page_id)?; - Some(locked_page.entry_ref(pos)) + let (segment_id, pos) = self.storage.resolve_pos(node); + let locked_segment = &self.locked_segments.get(segment_id)?; + Some(locked_segment.entry_ref(pos)) } pub fn len(&self) -> usize { @@ -73,10 +77,9 @@ impl, EXT: Config> ReadLockedNodeStorage impl Iterator< Item = <::ArcLockedSegment as LockedNSSegment>::EntryRef<'_>, > + '_ { - (0..self.len()).map(move |i| { - let vid = VID(i); - self.node_ref(vid) - }) + self.locked_segments + .iter() + .flat_map(move |segment| segment.iter_entries()) } pub fn par_iter( @@ -84,10 +87,26 @@ impl, EXT: Config> ReadLockedNodeStorage impl rayon::iter::ParallelIterator< Item = <::ArcLockedSegment as LockedNSSegment>::EntryRef<'_>, > + '_ { - (0..self.len()).into_par_iter().map(move |i| { - let vid = VID(i); - self.node_ref(vid) - }) + self.locked_segments + .par_iter() + .flat_map(move |segment| segment.par_iter_entries()) + } + + pub fn row_groups_par_iter( + &self, + ) -> impl IndexedParallelIterator + '_)> { + row_group_par_iter( + self.storage.max_segment_len() as usize, + self.locked_segments.len(), + self.storage.max_segment_len(), + ) + .map(|(s_id, iter)| (s_id, iter.filter(|vid| self.has_vid(*vid)))) + } + + fn has_vid(&self, vid: VID) -> bool { + let (segment_id, pos) = self.storage.resolve_pos(vid); + segment_id < self.locked_segments.len() + && pos.0 < self.locked_segments[segment_id].num_nodes() } } @@ -104,6 +123,7 @@ impl NodeStorageInner { self.stats.get(0) } + // FIXME: this should be called by the high level APIs on layer filter pub fn layer_num_nodes(&self, layer_id: usize) -> usize { self.stats.get(layer_id) } @@ -113,7 +133,7 @@ impl NodeStorageInner { } pub fn segments(&self) -> &boxcar::Vec> { - &self.pages + &self.segments } pub fn nodes_path(&self) -> Option<&Path> { @@ -122,10 +142,10 @@ impl NodeStorageInner { /// Return the position of the chunk and the position within the chunk pub fn resolve_pos(&self, i: impl Into) -> (usize, LocalPOS) { - resolve_pos(i.into(), self.max_page_len()) + resolve_pos(i.into(), self.max_segment_len()) } - pub fn max_page_len(&self) -> u32 { + pub fn max_segment_len(&self) -> u32 { self.ext.max_node_page_len() } } @@ -137,9 +157,11 @@ impl, EXT: Config> NodeStorageInner edge_meta: Arc, ext: EXT, ) -> Self { + let free_segments = (0..N).map(RwLock::new).collect::>(); let empty = Self { - pages: boxcar::Vec::new(), + segments: boxcar::Vec::new(), stats: GraphStats::new().into(), + free_segments: free_segments.try_into().unwrap(), nodes_path, node_meta, edge_meta, @@ -163,9 +185,10 @@ impl, EXT: Config> NodeStorageInner } empty } + pub fn locked(self: &Arc) -> ReadLockedNodeStorage { let locked_segments = self - .pages + .segments .iter() .map(|(_, segment)| segment.locked()) .collect::>(); @@ -175,15 +198,15 @@ impl, EXT: Config> NodeStorageInner } } - pub fn write_locked<'a>(&'a self) -> WriteLockedNodePages<'a, NS> { - WriteLockedNodePages::new( - self.pages + pub fn write_locked<'a>(&'a self) -> WriteLockedNodeSegments<'a, NS> { + WriteLockedNodeSegments::new( + self.segments .iter() .map(|(page_id, page)| { LockedNodePage::new( page_id, &self.stats, - self.max_page_len(), + self.max_segment_len(), page.as_ref(), page.head_mut(), ) @@ -192,10 +215,67 @@ impl, EXT: Config> NodeStorageInner ) } + pub fn reserve_vid(&self, row: usize) -> VID { + let (seg, pos) = self.reserve_free_pos(row); + pos.as_vid(seg, self.max_segment_len()) + } + + pub fn reserve_free_pos(&self, row: usize) -> (usize, LocalPOS) { + let slot_idx = row % N; + let maybe_free_page = { + let lock_slot = self.free_segments[slot_idx].read_recursive(); + let page_id = *lock_slot; + let page = self.segments.get(page_id); + page.and_then(|page| { + self.reserve_segment_row(page) + .map(|pos| (page.segment_id(), LocalPOS(pos))) + }) + }; + + if let Some(reserved_pos) = maybe_free_page { + reserved_pos + } else { + // not lucky, go wait on your slot + let mut slot = self.free_segments[slot_idx].write(); + loop { + if let Some(page) = self.segments.get(*slot) + && let Some(pos) = self.reserve_segment_row(page) + { + return (page.segment_id(), LocalPOS(pos)); + } + *slot = self.push_new_segment(); + } + } + } + + fn reserve_segment_row(&self, segment: &Arc) -> Option { + // TODO: if this becomes a hotspot, we can switch to a fetch_add followed by a fetch_min + // this means when we read the counter we need to clamp it to max_page_len so the iterators don't break + increment_and_clamp(segment.nodes_counter(), self.max_segment_len()) + } + + fn push_new_segment(&self) -> usize { + let segment_id = self.segments.push_with(|segment_id| { + Arc::new(NS::new( + segment_id, + self.node_meta.clone(), + self.edge_meta.clone(), + self.nodes_path.clone(), + self.ext.clone(), + )) + }); + + while self.segments.get(segment_id).is_none() { + std::thread::yield_now(); + } + + segment_id + } + pub fn node<'a>(&'a self, node: impl Into) -> NS::Entry<'a> { let (page_id, pos) = self.resolve_pos(node); let node_page = self - .pages + .segments .get(page_id) .expect("Internal error: page not found"); node_page.entry(pos) @@ -203,7 +283,7 @@ impl, EXT: Config> NodeStorageInner pub fn try_node(&self, node: VID) -> Option> { let (page_id, pos) = self.resolve_pos(node); - let node_page = self.pages.get(page_id)?; + let node_page = self.segments.get(page_id)?; Some(node_page.entry(pos)) } @@ -221,7 +301,7 @@ impl, EXT: Config> NodeStorageInner &'a self, segment_id: usize, ) -> Option, NS>> { - let segment = &self.pages[segment_id]; + let segment = &self.segments[segment_id]; let head = segment.try_head_mut()?; Some(NodeWriter::new(segment, &self.stats, head)) } @@ -232,6 +312,7 @@ impl, EXT: Config> NodeStorageInner ext: EXT, ) -> Result { let nodes_path = nodes_path.as_ref(); + let max_page_len = ext.max_node_page_len(); let node_meta = Arc::new(Meta::new_for_nodes()); if !nodes_path.exists() { @@ -325,10 +406,35 @@ impl, EXT: Config> NodeStorageInner .max() .unwrap_or(i64::MIN); + let mut free_pages = pages + .iter() + .filter_map(|(_, page)| { + let len = page.num_nodes(); + if len < max_page_len { + Some(RwLock::new(page.segment_id())) + } else { + None + } + }) + .collect::>(); + + let mut next_free_page = free_pages + .last() + .map(|page| *(page.read())) + .map(|last| last + 1) + .unwrap_or_else(|| pages.count()); + + free_pages.resize_with(N, || { + let lock = RwLock::new(next_free_page); + next_free_page += 1; + lock + }); + let stats = GraphStats::load(layer_counts, earliest, latest); Ok(Self { - pages, + segments: pages, + free_segments: free_pages.try_into().unwrap(), nodes_path: Some(nodes_path.to_path_buf()), stats: stats.into(), node_meta, @@ -339,10 +445,10 @@ impl, EXT: Config> NodeStorageInner pub fn get_edge(&self, src: VID, dst: VID, layer_id: usize) -> Option { let (src_chunk, src_pos) = self.resolve_pos(src); - if src_chunk >= self.pages.count() { + if src_chunk >= self.segments.count() { return None; } - let src_page = &self.pages[src_chunk]; + let src_page = &self.segments[src_chunk]; src_page.get_out_edge(src_pos, dst, layer_id, src_page.head()) } @@ -351,14 +457,14 @@ impl, EXT: Config> NodeStorageInner } pub fn get_or_create_segment(&self, segment_id: usize) -> &Arc { - if let Some(segment) = self.pages.get(segment_id) { + if let Some(segment) = self.segments.get(segment_id) { return segment; } - let count = self.pages.count(); + let count = self.segments.count(); if count > segment_id { // something has allocated the segment, wait for it to be added loop { - if let Some(segment) = self.pages.get(segment_id) { + if let Some(segment) = self.segments.get(segment_id) { return segment; } else { // wait for the segment to be created @@ -367,10 +473,10 @@ impl, EXT: Config> NodeStorageInner } } else { // we need to create the segment - self.pages.reserve(segment_id + 1 - count); + self.segments.reserve(segment_id + 1 - count); loop { - let new_segment_id = self.pages.push_with(|segment_id| { + let new_segment_id = self.segments.push_with(|segment_id| { Arc::new(NS::new( segment_id, self.node_meta.clone(), @@ -382,7 +488,7 @@ impl, EXT: Config> NodeStorageInner if new_segment_id >= segment_id { loop { - if let Some(segment) = self.pages.get(segment_id) { + if let Some(segment) = self.segments.get(segment_id) { return segment; } else { // wait for the segment to be created @@ -393,4 +499,27 @@ impl, EXT: Config> NodeStorageInner } } } + + pub(crate) fn segment_counts(&self) -> SegmentCounts { + SegmentCounts::new( + self.max_segment_len(), + self.segments().iter().map(|(_, seg)| seg.num_nodes()), + ) + } +} + +pub fn increment_and_clamp(counter: &AtomicU32, max_segment_len: u32) -> Option { + counter + .fetch_update( + std::sync::atomic::Ordering::Relaxed, + std::sync::atomic::Ordering::Relaxed, + |current| { + if current < max_segment_len { + Some(current + 1) + } else { + None + } + }, + ) + .ok() } diff --git a/db4-storage/src/properties/mod.rs b/db4-storage/src/properties/mod.rs index 14ae383845..6d1a9356fa 100644 --- a/db4-storage/src/properties/mod.rs +++ b/db4-storage/src/properties/mod.rs @@ -23,7 +23,7 @@ use std::sync::Arc; pub mod props_meta_writer; -#[derive(Debug, Default, serde::Serialize)] +#[derive(Debug, Default)] pub struct Properties { c_properties: Vec, diff --git a/db4-storage/src/segments/edge/segment.rs b/db4-storage/src/segments/edge/segment.rs index 66c072ef30..312e48caca 100644 --- a/db4-storage/src/segments/edge/segment.rs +++ b/db4-storage/src/segments/edge/segment.rs @@ -144,61 +144,6 @@ impl MemEdgeSegment { .map(|entry| (entry.src, entry.dst)) } - pub fn bulk_insert_edges_internal( - &mut self, - mask: &BooleanArray, - time: &[i64], - time_sec_index: usize, - eids: &[EID], - srcs: &[VID], - dsts: &[VID], - layer_id: usize, - cols: &[ArrayRef], - col_mapping: &[usize], // mapping from cols to the property id - ) { - self.ensure_layer(layer_id); - let est_size = self.layers[layer_id].est_size(); - let t_col_offset = self.layers[layer_id].properties().t_len(); - - let max_page_len = self.layers.get(layer_id).unwrap().max_page_len; - eids.iter() - .zip(srcs.iter().zip(dsts.iter())) - .zip(time) - .enumerate() - .fold( - (t_col_offset, time_sec_index), - |(t_col_offset, time_sec_index), (i, ((eid, (src, dst)), time))| { - if mask.value(i) { - let (_, local_pos) = resolve_pos(*eid, max_page_len); - let row = self.reserve_local_row(local_pos, *src, *dst, layer_id); - let mut prop = self.layers[layer_id].properties_mut().get_mut_entry(row); - prop.ensure_times_from_props(); - prop.set_time(TimeIndexEntry(*time, time_sec_index), t_col_offset); - (t_col_offset + 1, time_sec_index + 1) - } else { - (t_col_offset, time_sec_index) - } - }, - ); - - let props = self.layers[layer_id].properties_mut(); - - // ensure the columns are present - for prop_id in col_mapping { - props.t_properties_mut().ensure_column(*prop_id); - } - - for (prop_id, col) in col_mapping.iter().zip(cols) { - let column = props.t_column_mut(*prop_id).unwrap(); - column.append(col, mask); - } - - props.reset_t_len(); - - let layer_est_size = self.layers[layer_id].est_size(); - self.est_size += layer_est_size.saturating_sub(est_size); - } - pub fn insert_edge_internal( &mut self, t: T, @@ -429,6 +374,7 @@ impl LockedESegment for ArcLockedSegmentView { &'a self, layer_ids: &'b LayerIds, ) -> impl ParallelIterator> + 'a { + dbg!(layer_ids); match layer_ids { LayerIds::None => Iter4::I(rayon::iter::empty()), LayerIds::All => Iter4::J(self.edge_par_iter_layer(0)), @@ -644,322 +590,6 @@ mod test { assert_eq!(segment.t_len(), 3); } - #[test] - fn test_bulk_insert_edges_internal_basic() { - let mut segment = create_test_segment(); - - // Prepare bulk insert data - let mask = BooleanArray::from(vec![true, true, true]); - let times = vec![1i64, 2i64, 3i64]; - let eids = vec![EID(0), EID(1), EID(2)]; - let srcs = vec![VID(1), VID(3), VID(5)]; - let dsts = vec![VID(2), VID(4), VID(6)]; - let cols: Vec> = - vec![Arc::new(StringArray::from(vec!["test1", "test2", "test3"]))]; - let col_mapping = vec![0]; // property id 0 - - // Bulk insert edges - segment.bulk_insert_edges_internal( - &mask, - ×, - 0, // time_sec_index - &eids, - &srcs, - &dsts, - 0, // layer_id - &cols, - &col_mapping, - ); - - // Verify edges exist - assert!(segment.contains_edge(LocalPOS(0), 0)); - assert!(segment.contains_edge(LocalPOS(1), 0)); - assert!(segment.contains_edge(LocalPOS(2), 0)); - - // Verify edge data - assert_eq!(segment.get_edge(LocalPOS(0), 0), Some((VID(1), VID(2)))); - assert_eq!(segment.get_edge(LocalPOS(1), 0), Some((VID(3), VID(4)))); - assert_eq!(segment.get_edge(LocalPOS(2), 0), Some((VID(5), VID(6)))); - - // Verify time length increased - assert_eq!(segment.t_len(), 3); - - for (index, local_pos) in [LocalPOS(0), LocalPOS(1), LocalPOS(2)].iter().enumerate() { - let actual = segment.layers[0] - .t_prop(*local_pos, 0) - .into_iter() - .flat_map(|p| p.iter()) - .collect::>(); - - let i = local_pos.0 as i64; - assert_eq!( - actual, - vec![( - TimeIndexEntry::new(i + 1, index), - Prop::str(format!("test{}", i + 1)) - )] - ); - } - } - - #[test] - fn test_bulk_insert_with_mask() { - let mut segment = create_test_segment(); - - // Prepare bulk insert data with selective mask - let mask = BooleanArray::from(vec![true, false, true, false]); - let times = vec![1i64, 2i64, 3i64, 4i64]; - let eids = vec![EID(0), EID(1), EID(2), EID(3)]; - let srcs = vec![VID(1), VID(3), VID(5), VID(7)]; - let dsts = vec![VID(2), VID(4), VID(6), VID(8)]; - let cols: Vec> = vec![Arc::new(StringArray::from(vec![ - "test1", "test2", "test3", "test4", - ]))]; - let col_mapping = vec![0]; - - // Bulk insert edges - segment.bulk_insert_edges_internal( - &mask, - ×, - 0, - &eids, - &srcs, - &dsts, - 0, - &cols, - &col_mapping, - ); - - // Only edges at positions 0 and 2 should exist (mask was true) - assert!(segment.contains_edge(LocalPOS(0), 0)); - assert!(!segment.contains_edge(LocalPOS(1), 0)); - assert!(segment.contains_edge(LocalPOS(2), 0)); - assert!(!segment.contains_edge(LocalPOS(3), 0)); - - // Verify correct edge data for existing edges - assert_eq!(segment.get_edge(LocalPOS(0), 0), Some((VID(1), VID(2)))); - assert_eq!(segment.get_edge(LocalPOS(2), 0), Some((VID(5), VID(6)))); - - // Only 2 edges should contribute to time length - assert_eq!(segment.t_len(), 2); - } - - #[test] - fn test_bulk_vs_individual_equivalence() { - let mut segment1 = create_test_segment(); - let mut segment2 = create_test_segment(); - - // Individual insertions - segment1.insert_edge_internal( - TimeIndexEntry::new(1, 0), - LocalPOS(0), - VID(1), - VID(2), - 0, - vec![(0, Prop::from("test1"))], - 1, - ); - segment1.insert_edge_internal( - TimeIndexEntry::new(2, 1), - LocalPOS(1), - VID(3), - VID(4), - 0, - vec![(0, Prop::from("test2"))], - 1, - ); - segment1.insert_edge_internal( - TimeIndexEntry::new(3, 2), - LocalPOS(2), - VID(5), - VID(6), - 0, - vec![(0, Prop::from("test3"))], - 1, - ); - - // Equivalent bulk insertion - let mask = BooleanArray::from(vec![true, true, true]); - let times = vec![1i64, 2i64, 3i64]; - let eids = vec![EID(0), EID(1), EID(2)]; - let srcs = vec![VID(1), VID(3), VID(5)]; - let dsts = vec![VID(2), VID(4), VID(6)]; - let cols: Vec> = - vec![Arc::new(StringArray::from(vec!["test1", "test2", "test3"]))]; - let col_mapping = vec![0]; - - segment2.bulk_insert_edges_internal( - &mask, - ×, - 0, - &eids, - &srcs, - &dsts, - 0, - &cols, - &col_mapping, - ); - - // Both segments should have the same edges - for pos in [LocalPOS(0), LocalPOS(1), LocalPOS(2)] { - assert_eq!( - segment1.contains_edge(pos, 0), - segment2.contains_edge(pos, 0) - ); - assert_eq!(segment1.get_edge(pos, 0), segment2.get_edge(pos, 0)); - } - - // Both should have same time length - assert_eq!(segment1.t_len(), segment2.t_len()); - } - - #[test] - fn test_interleaved_operations() { - let mut segment = create_test_segment(); - - // Start with individual insertion - segment.insert_edge_internal( - TimeIndexEntry::new(1, 0), - LocalPOS(0), - VID(1), - VID(2), - 0, - vec![(0, Prop::from("individual1"))], - 1, - ); - - // Bulk insert some edges - let mask = BooleanArray::from(vec![true, true]); - let times = vec![2i64, 3i64]; - let eids = vec![EID(1), EID(2)]; - let srcs = vec![VID(3), VID(5)]; - let dsts = vec![VID(4), VID(6)]; - let cols: Vec> = vec![Arc::new(StringArray::from(vec!["bulk1", "bulk2"]))]; - let col_mapping = vec![0]; - - segment.bulk_insert_edges_internal( - &mask, - ×, - 1, // time_sec_index continues from previous - &eids, - &srcs, - &dsts, - 0, - &cols, - &col_mapping, - ); - - // Insert another individual edge - segment.insert_edge_internal( - TimeIndexEntry::new(4, 3), - LocalPOS(3), - VID(7), - VID(8), - 0, - vec![(0, Prop::from("individual2"))], - 1, - ); - - // Another bulk insert - let mask2 = BooleanArray::from(vec![true, false, true]); - let times2 = vec![5i64, 6i64, 7i64]; - let eids2 = vec![EID(4), EID(5), EID(6)]; - let srcs2 = vec![VID(9), VID(11), VID(13)]; - let dsts2 = vec![VID(10), VID(12), VID(14)]; - let cols2: Vec> = - vec![Arc::new(StringArray::from(vec!["bulk3", "bulk4", "bulk5"]))]; - - segment.bulk_insert_edges_internal( - &mask2, - ×2, - 4, // time_sec_index continues - &eids2, - &srcs2, - &dsts2, - 0, - &cols2, - &col_mapping, - ); - - // Verify all edges exist correctly - assert!(segment.contains_edge(LocalPOS(0), 0)); // individual1 - assert!(segment.contains_edge(LocalPOS(1), 0)); // bulk1 - assert!(segment.contains_edge(LocalPOS(2), 0)); // bulk2 - assert!(segment.contains_edge(LocalPOS(3), 0)); // individual2 - assert!(segment.contains_edge(LocalPOS(4), 0)); // bulk3 - assert!(!segment.contains_edge(LocalPOS(5), 0)); // masked out - assert!(segment.contains_edge(LocalPOS(6), 0)); // bulk5 - - // Verify edge data - assert_eq!(segment.get_edge(LocalPOS(0), 0), Some((VID(1), VID(2)))); - assert_eq!(segment.get_edge(LocalPOS(1), 0), Some((VID(3), VID(4)))); - assert_eq!(segment.get_edge(LocalPOS(2), 0), Some((VID(5), VID(6)))); - assert_eq!(segment.get_edge(LocalPOS(3), 0), Some((VID(7), VID(8)))); - assert_eq!(segment.get_edge(LocalPOS(4), 0), Some((VID(9), VID(10)))); - assert_eq!(segment.get_edge(LocalPOS(6), 0), Some((VID(13), VID(14)))); - - // Total time length should be 6 (4 individual + 2 from first bulk + 2 from second bulk) - assert_eq!(segment.t_len(), 6); - } - - #[test] - fn test_bulk_insert_multiple_layers() { - let mut segment = create_test_segment(); - - // Insert into layer 0 - let mask = BooleanArray::from(vec![true, true]); - let times = vec![1i64, 2i64]; - let eids = vec![EID(0), EID(1)]; - let srcs = vec![VID(1), VID(3)]; - let dsts = vec![VID(2), VID(4)]; - let cols: Vec> = - vec![Arc::new(StringArray::from(vec!["layer0_1", "layer0_2"]))]; - let col_mapping = vec![0]; - - segment.bulk_insert_edges_internal( - &mask, - ×, - 0, - &eids, - &srcs, - &dsts, - 0, // layer 0 - &cols, - &col_mapping, - ); - - // Insert into layer 1 - let mask2 = BooleanArray::from(vec![true]); - let times2 = vec![3i64]; - let eids2 = vec![EID(0)]; // same eid, different layer - let srcs2 = vec![VID(5)]; - let dsts2 = vec![VID(6)]; - let cols2: Vec> = vec![Arc::new(StringArray::from(vec!["layer1_1"]))]; - - segment.bulk_insert_edges_internal( - &mask2, - ×2, - 2, - &eids2, - &srcs2, - &dsts2, - 1, // layer 1 - &cols2, - &col_mapping, - ); - - // Verify edges in both layers - assert!(segment.contains_edge(LocalPOS(0), 0)); - assert!(segment.contains_edge(LocalPOS(1), 0)); - assert!(segment.contains_edge(LocalPOS(0), 1)); - assert!(!segment.contains_edge(LocalPOS(1), 1)); - - // Verify correct layer data - assert_eq!(segment.get_edge(LocalPOS(0), 0), Some((VID(1), VID(2)))); - assert_eq!(segment.get_edge(LocalPOS(1), 0), Some((VID(3), VID(4)))); - assert_eq!(segment.get_edge(LocalPOS(0), 1), Some((VID(5), VID(6)))); - } - #[test] fn est_size_changes() { use super::*; diff --git a/db4-storage/src/segments/node/segment.rs b/db4-storage/src/segments/node/segment.rs index bc7c5bcdd6..bfffbd58f1 100644 --- a/db4-storage/src/segments/node/segment.rs +++ b/db4-storage/src/segments/node/segment.rs @@ -3,6 +3,7 @@ use crate::{ api::nodes::{LockedNSSegment, NodeSegmentOps}, error::StorageError, loop_lock_write, + pages::node_store::increment_and_clamp, persist::strategy::PersistentStrategy, segments::{ HasRow, SegmentContainer, @@ -27,7 +28,7 @@ use std::{ path::PathBuf, sync::{ Arc, - atomic::{AtomicI64, AtomicUsize, Ordering}, + atomic::{AtomicU32, AtomicUsize, Ordering}, }, }; @@ -359,20 +360,34 @@ impl MemNodeSegment { pub fn node_ref(&self, pos: LocalPOS) -> MemNodeRef<'_> { MemNodeRef::new(pos, self) } + + pub fn max_page_len(&self) -> u32 { + self.max_page_len + } } #[derive(Debug)] pub struct NodeSegmentView { inner: Arc>, segment_id: usize, - event_id: AtomicI64, est_size: AtomicUsize, + max_num_node: AtomicU32, _ext: EXT, } #[derive(Debug)] pub struct ArcLockedSegmentView { inner: ArcRwLockReadGuard, + num_nodes: u32, +} + +impl ArcLockedSegmentView { + pub fn new( + inner: ArcRwLockReadGuard, + num_nodes: u32, + ) -> Self { + Self { inner, num_nodes } + } } impl LockedNSSegment for ArcLockedSegmentView { @@ -382,6 +397,10 @@ impl LockedNSSegment for ArcLockedSegmentView { let pos = pos.into(); MemNodeRef::new(pos, &self.inner) } + + fn num_nodes(&self) -> u32 { + self.num_nodes + } } impl>> NodeSegmentOps for NodeSegmentView

{ @@ -403,22 +422,6 @@ impl>> NodeSegmentOps for NodeSegm self.head().t_len() } - fn event_id(&self) -> i64 { - self.event_id.load(Ordering::Relaxed) - } - - fn increment_event_id(&self, i: i64) { - self.event_id.fetch_add(i, Ordering::Relaxed); - } - - fn decrement_event_id(&self) -> i64 { - self.event_id - .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |x| { - if x > 0 { Some(x - 1) } else { None } - }) - .unwrap_or_default() - } - fn load( _page_id: usize, _node_meta: Arc, @@ -447,7 +450,7 @@ impl>> NodeSegmentOps for NodeSegm .into(), segment_id: page_id, _ext: ext, - event_id: Default::default(), + max_num_node: AtomicU32::new(0), est_size: AtomicUsize::new(0), } } @@ -515,9 +518,7 @@ impl>> NodeSegmentOps for NodeSegm } fn locked(self: &Arc) -> Self::ArcLockedSegment { - ArcLockedSegmentView { - inner: self.inner.read_arc(), - } + ArcLockedSegmentView::new(self.inner.read_arc(), self.num_nodes()) } fn num_layers(&self) -> usize { @@ -546,6 +547,14 @@ impl>> NodeSegmentOps for NodeSegm ) -> Result<(), StorageError> { Ok(()) } + + fn nodes_counter(&self) -> &AtomicU32 { + &self.max_num_node + } + + fn increment_num_nodes(&self, max_page_len: u32) { + increment_and_clamp(self.nodes_counter(), max_page_len); + } } #[cfg(test)] diff --git a/db4-storage/src/state.rs b/db4-storage/src/state.rs new file mode 100644 index 0000000000..b86bd23697 --- /dev/null +++ b/db4-storage/src/state.rs @@ -0,0 +1,829 @@ +use rayon::{ + iter::plumbing::{Producer, ProducerCallback, UnindexedConsumer, bridge}, + prelude::*, +}; +use std::{ + ops::{Index, IndexMut}, + sync::Arc, +}; + +use crate::pages::SegmentCounts; + +/// Index resolver for sharded storage with fixed-size chunks +/// +/// Given a sharding scheme where items are distributed across chunks: +/// - chunk_id = index / max_page_len +/// - local_pos = index % max_page_len +/// +/// This struct provides O(1) lookup to map any global index to a flat array position, +/// accounting for partially filled chunks. +/// +/// # Example +/// With max_page_len = 1000: +/// - Chunk 0: 1000 items (offsets[0] = 0, offsets[1] = 1000) +/// - Chunk 1: 500 items (offsets[1] = 1000, offsets[2] = 1500) +/// - Chunk 2: 1000 items (offsets[2] = 1500, offsets[3] = 2500) +/// +/// To resolve index 1200: +/// - chunk = 1200 / 1000 = 1 +/// - local_pos = 1200 % 1000 = 200 +/// - flat_index = offsets[1] + 200 = 1000 + 200 = 1200 +#[derive(Debug, Clone)] +pub struct StateIndex { + /// Cumulative offsets: offsets[chunk_id] = starting position in flat array for that chunk + /// Length is equal to number of chunks + 1 (includes final cumulative value) + offsets: Box<[usize]>, + /// Maximum items per chunk + max_page_len: u32, + /// Phantom data for index type + _marker: std::marker::PhantomData, +} + +impl From> for StateIndex +where + I: From + Into, +{ + fn from(counts: SegmentCounts) -> Self { + Self::new( + counts.counts().iter().map(|c| *c as usize), + counts.max_seg_len(), + ) + } +} + +impl + Into> StateIndex { + /// Create a new StateIndex with the given chunk configuration + /// + /// # Arguments + /// * `chunk_sizes` - The actual size of each chunk (can be <= max_page_len) + /// * `max_page_len` - Maximum capacity of each chunk + pub fn new(chunk_sizes: impl IntoIterator, max_page_len: u32) -> Self { + // Build cumulative offsets (includes final cumulative value) + let mut offsets = Vec::new(); + let mut cumulative = 0; + for size in chunk_sizes { + offsets.push(cumulative); + cumulative += size; + } + offsets.push(cumulative); // Add final cumulative value + + Self { + offsets: offsets.into_boxed_slice(), + max_page_len, + _marker: std::marker::PhantomData, + } + } + + /// Resolve a global index to a flat array index + /// + /// # Arguments + /// * `index` - Global index across all chunks + /// + /// # Returns + /// Some(flat_index) if the index is valid, None otherwise + #[inline(always)] + pub fn resolve(&self, index: I) -> Option { + let index: usize = index.into(); + let chunk = index / self.max_page_len as usize; + let local_pos = index % self.max_page_len as usize; + + let offset = *self.offsets.get(chunk)?; + let flat_index = offset + local_pos; + + // Verify the flat_index is within bounds of this chunk + let next_offset = *self.offsets.get(chunk + 1)?; + if flat_index < next_offset { + Some(flat_index) + } else { + None + } + } + + /// Resolve a global index to a flat array index without bounds checking + /// + /// # Arguments + /// * `index` - Global index across all chunks + /// + /// # Returns + /// The flat array index + /// + /// # Safety + /// Panics if the index is out of bounds + #[inline(always)] + pub fn resolve_unchecked(&self, index: I) -> usize { + let index: usize = index.into(); + let chunk = index / self.max_page_len as usize; + let local_pos = index % self.max_page_len as usize; + + let offset = self.offsets[chunk]; + offset + local_pos + } + + /// Get the number of chunks + #[inline] + pub fn num_chunks(&self) -> usize { + self.offsets.len().saturating_sub(1) + } + + /// Get the total number of items across all chunks + #[inline] + pub fn len(&self) -> usize { + self.offsets[self.num_chunks()] + } + + /// Check if there are no items + #[inline] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Get the maximum page length + #[inline] + pub fn max_page_len(&self) -> u32 { + self.max_page_len + } + + /// Create an iterator over all valid global indices + /// + /// This iterates through all chunks and yields the global indices for each item. + /// For example, with chunk_sizes [10, 1, 5] and max_page_len 10: + /// - Chunk 0: yields 0..10 + /// - Chunk 1: yields 10..11 + /// - Chunk 2: yields 20..25 + pub fn iter(&self) -> StateIndexIter<'_, I> { + StateIndexIter { + index: self, + current_chunk: 0, + current_local: 0, + } + } + + /// Create a parallel iterator over all valid global indices with their flat indices + /// + /// This iterates through all chunks in parallel and yields tuples of (flat_index, global_index). + /// The flat_index starts at 0 and increments for each item in iteration order. + /// + /// For example, with chunk_sizes [10, 1, 5] and max_page_len 10: + /// - Chunk 0: yields (0, 0)..(9, 9) + /// - Chunk 1: yields (10, 10) + /// - Chunk 2: yields (11, 20)..(15, 24) + pub fn par_iter(&self) -> impl ParallelIterator + '_ + where + I: Send + Sync, + { + let max_page_len = self.max_page_len as usize; + let num_chunks = self.num_chunks(); + (0..num_chunks).into_par_iter().flat_map(move |chunk_idx| { + let chunk_start = self.offsets[chunk_idx]; + let chunk_end = self.offsets[chunk_idx + 1]; + let chunk_size = chunk_end - chunk_start; + let global_base = chunk_idx * max_page_len; + (0..chunk_size).into_par_iter().map(move |local_offset| { + let flat_idx = chunk_start + local_offset; + let global_idx = I::from(global_base + local_offset); + (flat_idx, global_idx) + }) + }) + } + + pub fn arc_into_iter(self: Arc) -> impl Iterator { + let max_page_len = self.max_page_len as usize; + let num_chunks = self.num_chunks(); + (0..num_chunks).flat_map(move |chunk_idx| { + let chunk_start = self.offsets[chunk_idx]; + let chunk_end = self.offsets[chunk_idx + 1]; + let chunk_size = chunk_end - chunk_start; + let global_base = chunk_idx * max_page_len; + (0..chunk_size).map(move |local_offset| { + let flat_idx = chunk_start + local_offset; + let global_idx = I::from(global_base + local_offset); + (flat_idx, global_idx) + }) + }) + } +} + +impl + Into> StateIndex { + /// Create a parallel iterator over all valid global indices with their flat indices + /// + /// This iterates through all chunks in parallel and yields tuples of (flat_index, global_index). + /// The flat_index starts at 0 and increments for each item in iteration order. + /// + /// For example, with chunk_sizes [10, 1, 5] and max_page_len 10: + /// - Chunk 0: yields (0, 0)..(9, 9) + /// - Chunk 1: yields (10, 10) + /// - Chunk 2: yields (11, 20)..(15, 24) + pub fn into_par_iter(self: Arc) -> impl ParallelIterator + where + I: Send + Sync, + { + let max_page_len = self.max_page_len as usize; + let num_chunks = self.num_chunks(); + (0..num_chunks).into_par_iter().flat_map(move |chunk_idx| { + let chunk_start = self.offsets[chunk_idx]; + let chunk_end = self.offsets[chunk_idx + 1]; + let chunk_size = chunk_end - chunk_start; + let global_base = chunk_idx * max_page_len; + (0..chunk_size).into_par_iter().map(move |local_offset| { + let flat_idx = chunk_start + local_offset; + let global_idx = I::from(global_base + local_offset); + (flat_idx, global_idx) + }) + }) + } +} + +/// Iterator over global indices in a StateIndex +#[derive(Debug)] +pub struct StateIndexIter<'a, I> { + index: &'a StateIndex, + current_chunk: usize, + current_local: usize, +} + +impl<'a, I: From + Into> Iterator for StateIndexIter<'a, I> { + type Item = I; + + fn next(&mut self) -> Option { + loop { + if self.current_chunk >= self.index.num_chunks() { + return None; + } + + let chunk_start = self.index.offsets[self.current_chunk]; + let chunk_end = self.index.offsets[self.current_chunk + 1]; + let chunk_size = chunk_end - chunk_start; + + if self.current_local < chunk_size { + let global_idx = + self.current_chunk * self.index.max_page_len as usize + self.current_local; + self.current_local += 1; + return Some(I::from(global_idx)); + } + + // Move to next chunk + self.current_chunk += 1; + self.current_local = 0; + } + } + + fn size_hint(&self) -> (usize, Option) { + let total = self.index.len(); + let consumed = if self.current_chunk < self.index.num_chunks() { + self.index.offsets[self.current_chunk] + self.current_local + } else { + total + }; + let remaining = total.saturating_sub(consumed); + (remaining, Some(remaining)) + } +} + +impl<'a, I: From + Into> ExactSizeIterator for StateIndexIter<'a, I> { + fn len(&self) -> usize { + let total = self.index.len(); + let consumed = if self.current_chunk < self.index.num_chunks() { + self.index.offsets[self.current_chunk] + self.current_local + } else { + total + }; + total.saturating_sub(consumed) + } +} + +/// Address resolver for sharded storage with fixed-size chunks +/// +/// This struct combines a StateIndex with a flat array to provide O(1) access +/// to elements in a sharded storage scheme with partially filled chunks. +#[derive(Debug)] +pub struct State { + /// Index resolver + index: StateIndex, + /// Flat array of state cells + state: Box<[A]>, +} + +impl + Into> State { + /// Create a new State with the given chunk configuration + /// + /// # Arguments + /// * `chunk_sizes` - The actual size of each chunk (can be <= max_page_len) + /// * `max_page_len` - Maximum capacity of each chunk + /// + /// # Example + /// ``` + /// use db4_storage::state::State; + /// use std::sync::atomic::AtomicUsize; + /// + /// // 3 chunks with sizes 1000, 500, 1000 and max capacity 1000 + /// let state: State = State::new(vec![1000, 500, 1000], 1000); + /// ``` + pub fn new(chunk_sizes: Vec, max_page_len: u32) -> Self { + let index = StateIndex::::new(chunk_sizes, max_page_len); + let total_size = index.len(); + + // Initialize state array with default values + let state: Box<[A]> = (0..total_size) + .map(|_| A::default()) + .collect::>() + .into_boxed_slice(); + + Self { index, state } + } + + /// Get a reference to the StateIndex + #[inline] + pub fn index(&self) -> &StateIndex { + &self.index + } + + /// Get a reference to the cell for the given global index + /// + /// # Arguments + /// * `index` - Global index across all chunks + /// + /// # Returns + /// Some(&A) if the index is valid, None otherwise + #[inline(always)] + pub fn get(&self, index: I) -> Option<&A> { + let flat_index = self.index.resolve(index)?; + self.state.get(flat_index) + } + + /// Get a mutable reference to the cell for the given global index + /// + /// # Arguments + /// * `index` - Global index across all chunks + /// + /// # Returns + /// Some(&mut A) if the index is valid, None otherwise + #[inline(always)] + pub fn get_mut(&mut self, index: I) -> Option<&mut A> { + let flat_index = self.index.resolve(index)?; + self.state.get_mut(flat_index) + } + + /// Get a reference to the cell for the given global index without bounds checking + /// + /// # Arguments + /// * `index` - Global index across all chunks + /// + /// # Returns + /// Reference to the corresponding cell + /// + /// # Safety + /// Panics if the index is out of bounds + #[inline(always)] + pub fn get_unchecked(&self, index: I) -> &A { + let flat_index = self.index.resolve_unchecked(index); + &self.state[flat_index] + } + + /// Get a mutable reference to the cell for the given global index without bounds checking + /// + /// # Arguments + /// * `index` - Global index across all chunks + /// + /// # Returns + /// Mutable reference to the corresponding cell + /// + /// # Safety + /// Panics if the index is out of bounds + #[inline(always)] + pub fn get_mut_unchecked(&mut self, index: I) -> &mut A { + let flat_index = self.index.resolve_unchecked(index); + &mut self.state[flat_index] + } + + /// Get the number of chunks + #[inline] + pub fn num_chunks(&self) -> usize { + self.index.num_chunks() + } + + /// Get the total number of state cells + #[inline] + pub fn len(&self) -> usize { + self.state.len() + } + + /// Check if the state is empty + #[inline] + pub fn is_empty(&self) -> bool { + self.state.is_empty() + } + + /// Get the maximum page length + #[inline] + pub fn max_page_len(&self) -> u32 { + self.index.max_page_len() + } + + /// Create an iterator over all elements in the state + /// + /// Yields references to each element in order of their global indices. + pub fn iter(&self) -> StateIter<'_, A, I> { + StateIter { + state: self, + inner: self.index.iter(), + } + } +} + +/// Iterator over elements in a State +#[derive(Debug)] +pub struct StateIter<'a, A, I> { + state: &'a State, + inner: StateIndexIter<'a, I>, +} + +impl<'a, A: Default, I: From + Into> Iterator for StateIter<'a, A, I> { + type Item = &'a A; + + fn next(&mut self) -> Option { + let global_idx = self.inner.next()?; + Some(self.state.get_unchecked(global_idx)) + } + + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } +} + +impl<'a, A: Default, I: From + Into> ExactSizeIterator for StateIter<'a, A, I> { + fn len(&self) -> usize { + self.inner.len() + } +} + +impl + Into + std::fmt::Debug + Copy> Index for State { + type Output = A; + + #[inline(always)] + fn index(&self, index: I) -> &Self::Output { + self.get(index) + .unwrap_or_else(|| panic!("index out of bounds: {:?}", index)) + } +} + +impl + Into + std::fmt::Debug + Copy> IndexMut + for State +{ + #[inline(always)] + fn index_mut(&mut self, index: I) -> &mut Self::Output { + self.get_mut(index) + .unwrap_or_else(|| panic!("index out of bounds: {:?}", index)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::atomic::{AtomicUsize, Ordering}; + + #[test] + fn test_state_index_resolve() { + let index: StateIndex = StateIndex::new(vec![1000, 500, 1000], 1000); + + assert_eq!(index.num_chunks(), 3); + assert_eq!(index.len(), 2500); + assert_eq!(index.max_page_len(), 1000); + + // Test chunk 0 + assert_eq!(index.resolve(0), Some(0)); + assert_eq!(index.resolve(999), Some(999)); + + // Test chunk 1 + assert_eq!(index.resolve(1000), Some(1000)); + assert_eq!(index.resolve(1499), Some(1499)); + + // Test chunk 2 + assert_eq!(index.resolve(2000), Some(1500)); + assert_eq!(index.resolve(2999), Some(2499)); + + // Test out of bounds + assert_eq!(index.resolve(3000), None); + assert_eq!(index.resolve(1500), None); // In chunk 1 but beyond its actual size + } + + #[test] + fn test_basic_get() { + let state: State = State::new(vec![1000, 500, 1000], 1000); + + // Test chunk 0 + state.get_unchecked(0).store(42, Ordering::Relaxed); + assert_eq!(state.get_unchecked(0).load(Ordering::Relaxed), 42); + + state.get_unchecked(999).store(123, Ordering::Relaxed); + assert_eq!(state.get_unchecked(999).load(Ordering::Relaxed), 123); + + // Test chunk 1 (offset should be 1000) + state.get_unchecked(1000).store(77, Ordering::Relaxed); + assert_eq!(state.get_unchecked(1000).load(Ordering::Relaxed), 77); + + state.get_unchecked(1499).store(88, Ordering::Relaxed); + assert_eq!(state.get_unchecked(1499).load(Ordering::Relaxed), 88); + + // Test chunk 2 (offset should be 1500) + state.get_unchecked(2000).store(99, Ordering::Relaxed); + assert_eq!(state.get_unchecked(2000).load(Ordering::Relaxed), 99); + + state.get_unchecked(2999).store(111, Ordering::Relaxed); + assert_eq!(state.get_unchecked(2999).load(Ordering::Relaxed), 111); + } + + #[test] + fn test_get_option() { + let state: State = State::new(vec![100, 50], 100); + + assert!(state.get(0).is_some()); + assert!(state.get(99).is_some()); + assert!(state.get(100).is_some()); + assert!(state.get(149).is_some()); + + // Out of bounds chunk + assert!(state.get(200).is_none()); + assert!(state.get(1000).is_none()); + + // In bounds chunk but beyond chunk's actual size + assert!(state.get(150).is_none()); + } + + #[test] + #[should_panic] + fn test_out_of_bounds_chunk() { + let state: State = State::new(vec![100], 100); + state.get_unchecked(200); // Should panic + } + + #[test] + fn test_partially_filled_chunks() { + // Simulate real scenario: chunks with varying fill levels + let state: State = State::new(vec![1000, 300, 1000, 50], 1000); + + // First chunk - fully filled + state.get_unchecked(0).store(1, Ordering::Relaxed); + state.get_unchecked(999).store(2, Ordering::Relaxed); + assert_eq!(state.get_unchecked(0).load(Ordering::Relaxed), 1); + assert_eq!(state.get_unchecked(999).load(Ordering::Relaxed), 2); + + // Second chunk - partially filled (300 items) + // Global indices: 1000-1299 + state.get_unchecked(1000).store(3, Ordering::Relaxed); + state.get_unchecked(1299).store(4, Ordering::Relaxed); + assert_eq!(state.get_unchecked(1000).load(Ordering::Relaxed), 3); + assert_eq!(state.get_unchecked(1299).load(Ordering::Relaxed), 4); + + // Third chunk - fully filled + // Global indices: 2000-2999 + state.get_unchecked(2000).store(5, Ordering::Relaxed); + state.get_unchecked(2999).store(6, Ordering::Relaxed); + assert_eq!(state.get_unchecked(2000).load(Ordering::Relaxed), 5); + assert_eq!(state.get_unchecked(2999).load(Ordering::Relaxed), 6); + + // Fourth chunk - minimally filled (50 items) + // Global indices: 3000-3049 + state.get_unchecked(3000).store(7, Ordering::Relaxed); + state.get_unchecked(3049).store(8, Ordering::Relaxed); + assert_eq!(state.get_unchecked(3000).load(Ordering::Relaxed), 7); + assert_eq!(state.get_unchecked(3049).load(Ordering::Relaxed), 8); + + assert_eq!(state.len(), 2350); // 1000 + 300 + 1000 + 50 + assert_eq!(state.num_chunks(), 4); + } + + #[test] + fn test_resolve_pos_consistency() { + // Test that our addressing matches the resolve_pos function + let max_page_len = 1000u32; + let state: State = State::new(vec![1000, 500, 1000], max_page_len); + + // Helper to simulate resolve_pos + let resolve_pos = |i: usize| -> (usize, u32) { + let chunk = i / max_page_len as usize; + let pos = (i % max_page_len as usize) as u32; + (chunk, pos) + }; + + for index in [0, 500, 999, 1000, 1250, 1499, 2000, 2500, 2999] { + let (chunk, local_pos) = resolve_pos(index); + + // Verify our addressing scheme matches + let computed_chunk = index / max_page_len as usize; + let computed_local = index % max_page_len as usize; + + assert_eq!(chunk, computed_chunk); + assert_eq!(local_pos, computed_local as u32); + + // Verify we can access the cell + state.get_unchecked(index).store(index, Ordering::Relaxed); + assert_eq!(state.get_unchecked(index).load(Ordering::Relaxed), index); + } + } + + #[test] + fn test_generic_over_different_types() { + // Test with usize + let state_usize: State = State::new(vec![10, 5], 10); + assert_eq!(*state_usize.get_unchecked(0), 0); + assert_eq!(*state_usize.get_unchecked(10), 0); + + // Test with Option + let state_option: State> = State::new(vec![10, 5], 10); + assert_eq!(*state_option.get_unchecked(0), None); + assert_eq!(*state_option.get_unchecked(10), None); + + // Test with AtomicUsize + let state_atomic: State = State::new(vec![10, 5], 10); + state_atomic.get_unchecked(0).store(42, Ordering::Relaxed); + assert_eq!(state_atomic.get_unchecked(0).load(Ordering::Relaxed), 42); + } + + #[test] + fn test_mutable_access() { + let mut state: State = State::new(vec![100, 50], 100); + + // Test get_mut + *state.get_mut(0).unwrap() = 42; + assert_eq!(*state.get(0).unwrap(), 42); + + *state.get_mut(50).unwrap() = 99; + assert_eq!(*state.get(50).unwrap(), 99); + + // Test get_mut in second chunk + *state.get_mut(100).unwrap() = 123; + assert_eq!(*state.get(100).unwrap(), 123); + + // Test get_mut_unchecked + *state.get_mut_unchecked(10) = 77; + assert_eq!(*state.get_unchecked(10), 77); + + // Test out of bounds returns None + assert!(state.get_mut(200).is_none()); + } + + #[test] + fn test_index_trait() { + let mut state: State = State::new(vec![100, 50], 100); + + // Test Index trait + state[0] = 42; + assert_eq!(state[0], 42); + + state[99] = 100; + assert_eq!(state[99], 100); + + // Test in second chunk + state[100] = 200; + assert_eq!(state[100], 200); + + state[149] = 300; + assert_eq!(state[149], 300); + } + + #[test] + #[should_panic(expected = "index out of bounds")] + fn test_index_out_of_bounds() { + let state: State = State::new(vec![100], 100); + let _ = state[200]; + } + + #[test] + fn test_offsets_include_final_cumulative() { + let state: State = State::new(vec![1000, 500, 1000], 1000); + + // offsets should be [0, 1000, 1500, 2500] + assert_eq!(state.num_chunks(), 3); + assert_eq!(state.len(), 2500); + + // Verify via StateIndex API + assert_eq!(state.index().len(), state.len()); + } + + #[test] + fn test_state_index_can_be_used_independently() { + // StateIndex can be used independently of State + let index: StateIndex = StateIndex::new(vec![1000, 500, 1000], 1000); + + // Create your own array + let mut data = vec![0usize; index.len()]; + + // Use the index to access elements + if let Some(flat_idx) = index.resolve(1200) { + data[flat_idx] = 42; + } + + if let Some(flat_idx) = index.resolve(1200) { + assert_eq!(data[flat_idx], 42); + } + } + + #[test] + fn test_state_index_iter() { + let index: StateIndex = StateIndex::new(vec![10, 1, 5], 10); + + let global_indices: Vec = index.iter().collect(); + + // Chunk 0: global indices 0-9 (10 items) + // Chunk 1: global index 10 (1 item) + // Chunk 2: global indices 20-24 (5 items) + let expected = vec![ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, // Chunk 0 + 10, // Chunk 1 + 20, 21, 22, 23, 24, // Chunk 2 + ]; + + assert_eq!(global_indices, expected); + assert_eq!(index.iter().len(), 16); + } + + #[test] + fn test_state_index_par_iter() { + let index: StateIndex = StateIndex::new(vec![10, 1, 5], 10); + + let mut results: Vec<(usize, usize)> = index.par_iter().collect(); + results.sort_by_key(|(flat_idx, _)| *flat_idx); // Sort by flat index + + // Expected: (flat_idx, global_idx) tuples + // Chunk 0: flat indices 0-9, global indices 0-9 + // Chunk 1: flat index 10, global index 10 + // Chunk 2: flat indices 11-15, global indices 20-24 + let expected = vec![ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), // Chunk 0 + (10, 10), // Chunk 1 + (11, 20), + (12, 21), + (13, 22), + (14, 23), + (15, 24), // Chunk 2 + ]; + + assert_eq!(results, expected); + + // Verify count matches + assert_eq!(index.par_iter().count(), 16); + + // Verify flat indices are sequential + let flat_indices: Vec = results.iter().map(|(flat_idx, _)| *flat_idx).collect(); + assert_eq!(flat_indices, (0..16).collect::>()); + } + + #[test] + fn test_state_iter() { + let mut state: State = State::new(vec![10, 1, 5], 10); + + // Collect global indices first to avoid borrow checker issues + let global_indices: Vec = state.index().iter().collect(); + + // Initialize state with global indices + for global_idx in global_indices { + state[global_idx] = global_idx * 10; + } + + // Collect values via iter + let values: Vec = state.iter().copied().collect(); + + let expected = vec![ + 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, // Chunk 0 + 100, // Chunk 1 + 200, 210, 220, 230, 240, // Chunk 2 + ]; + + assert_eq!(values, expected); + assert_eq!(state.iter().len(), 16); + } + + #[test] + fn test_state_iter_with_atomics() { + let state: State = State::new(vec![10, 5], 10); + + // Collect global indices first to avoid borrow checker issues + let global_indices: Vec = state.index().iter().collect(); + + // Set values via global indices + for global_idx in global_indices { + state + .get_unchecked(global_idx) + .store(global_idx, Ordering::Relaxed); + } + + // Read via iterator + let values: Vec = state.iter().map(|a| a.load(Ordering::Relaxed)).collect(); + + let expected = vec![ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, // Chunk 0 + 10, 11, 12, 13, 14, // Chunk 1 + ]; + + assert_eq!(values, expected); + } +} diff --git a/raphtory-api/src/core/entities/properties/prop/prop_enum.rs b/raphtory-api/src/core/entities/properties/prop/prop_enum.rs index acde96f75d..3e6adea4b4 100644 --- a/raphtory-api/src/core/entities/properties/prop/prop_enum.rs +++ b/raphtory-api/src/core/entities/properties/prop/prop_enum.rs @@ -23,7 +23,7 @@ use std::{ }; use thiserror::Error; -use crate::core::entities::properties::prop::prop_array::*; +use crate::core::entities::{properties::prop::prop_array::*, GID}; use arrow_array::{cast::AsArray, ArrayRef, LargeListArray, StructArray}; use arrow_schema::{DataType, Field, FieldRef}; @@ -57,7 +57,16 @@ impl From> for Prop { fn from(value: GidRef<'_>) -> Self { match value { GidRef::U64(n) => Prop::U64(n), - GidRef::Str(s) => Prop::str(s), + GidRef::Str(s) => Prop::Str(ArcStr(s.into())), + } + } +} + +impl From for Prop { + fn from(value: GID) -> Self { + match value { + GID::U64(n) => Prop::U64(n), + GID::Str(s) => Prop::Str(ArcStr(s.into())), } } } diff --git a/raphtory-core/src/entities/graph/logical_to_physical.rs b/raphtory-core/src/entities/graph/logical_to_physical.rs index ab3c6609f4..444d900270 100644 --- a/raphtory-core/src/entities/graph/logical_to_physical.rs +++ b/raphtory-core/src/entities/graph/logical_to_physical.rs @@ -1,17 +1,10 @@ -use dashmap::{mapref::entry::Entry, RwLockWriteGuard, SharedValue}; -use either::Either; -use hashbrown::raw::RawTable; +use dashmap::mapref::entry::Entry; use once_cell::sync::OnceCell; use raphtory_api::core::{ entities::{GidRef, GidType, VID}, storage::{dict_mapper::MaybeNew, FxDashMap}, }; -use rayon::prelude::*; use serde::{Deserialize, Deserializer, Serialize}; -use std::{ - borrow::Borrow, - hash::{BuildHasher, Hash}, -}; use thiserror::Error; #[derive(Debug, Deserialize, Serialize)] @@ -42,40 +35,6 @@ impl Map { _ => None, } } - - pub fn run_with_locked) -> Result<(), E> + Send + Sync>( - &self, - work_fn: FN, - ) -> Result<(), E> { - match self { - Map::U64(map) => { - let shards = map.shards(); - shards - .par_iter() - .enumerate() - .try_for_each(|(shard_id, shard)| { - work_fn(ResolverShard::U64(ResolverShardT::new( - shard.write(), - map, - shard_id, - ))) - }) - } - Map::Str(map) => { - let shards = map.shards(); - shards - .par_iter() - .enumerate() - .try_for_each(|(shard_id, shard)| { - work_fn(ResolverShard::Str(ResolverShardT::new( - shard.write(), - map, - shard_id, - ))) - }) - } - } - } } impl Default for Map { @@ -89,94 +48,6 @@ pub struct Mapping { map: OnceCell, } -pub enum ResolverShard<'a> { - U64(ResolverShardT<'a, u64>), - Str(ResolverShardT<'a, String>), -} - -impl<'a> ResolverShard<'a> { - pub fn shard_id(&self) -> usize { - match self { - ResolverShard::U64(ResolverShardT { shard_id, .. }) => *shard_id, - ResolverShard::Str(ResolverShardT { shard_id, .. }) => *shard_id, - } - } - - pub fn as_u64(&mut self) -> Option<&mut ResolverShardT<'a, u64>> { - if let ResolverShard::U64(shard) = self { - Some(shard) - } else { - None - } - } - - pub fn as_str(&mut self) -> Option<&mut ResolverShardT<'a, String>> { - if let ResolverShard::Str(shard) = self { - Some(shard) - } else { - None - } - } -} - -pub struct ResolverShardT<'a, T> { - guard: RwLockWriteGuard<'a, RawTable<(T, SharedValue)>>, - map: &'a FxDashMap, - shard_id: usize, -} - -impl<'a, T: Eq + Hash + Clone> ResolverShardT<'a, T> { - pub fn new( - guard: RwLockWriteGuard<'a, RawTable<(T, SharedValue)>>, - map: &'a FxDashMap, - shard_id: usize, - ) -> Self { - Self { - guard, - map, - shard_id, - } - } - pub fn resolve_node( - &mut self, - id: &Q, - next_id: impl FnOnce(&Q) -> Either, - ) -> Option - where - T: Borrow, - Q: Eq + Hash + ToOwned + ?Sized, - { - let shard_ind = self.map.determine_map(id.borrow()); - if shard_ind != self.shard_id { - // This shard does not contain the id, return None - return None; - } - let factory = self.map.hasher().clone(); - let hash = factory.hash_one(id); - - match self.guard.get(hash, |(k, _)| k.borrow() == id) { - Some((_, vid)) => { - // Node already exists, do nothing - Some(*(vid.get())) - } - None => { - // Node does not exist, create it - let vid = next_id(id); - - if let Either::Left(vid) = vid { - self.guard - .insert(hash, (id.borrow().to_owned(), SharedValue::new(vid)), |t| { - factory.hash_one(&t.0) - }); - Some(vid) - } else { - vid.right() - } - } - } - } -} - impl Mapping { pub fn len(&self) -> usize { self.map.get().map_or(0, |map| match map { @@ -185,14 +56,6 @@ impl Mapping { }) } - pub fn run_with_locked) -> Result<(), E> + Send + Sync>( - &self, - work_fn: FN, - ) -> Result<(), E> { - let inner_map = self.map.get().unwrap(); - inner_map.run_with_locked(work_fn) - } - pub fn is_empty(&self) -> bool { self.len() == 0 } diff --git a/raphtory-core/src/entities/properties/tcell.rs b/raphtory-core/src/entities/properties/tcell.rs index c81474e9f3..3ef808b5d7 100644 --- a/raphtory-core/src/entities/properties/tcell.rs +++ b/raphtory-core/src/entities/properties/tcell.rs @@ -23,7 +23,7 @@ enum TCellVariants { TCellN(TCellN), } -const BTREE_CUTOFF: usize = 128; +const BTREE_CUTOFF: usize = 32; impl TCell { pub fn new(t: TimeIndexEntry, value: A) -> Self { diff --git a/raphtory-core/src/lib.rs b/raphtory-core/src/lib.rs index 791b0765ae..c754214f76 100644 --- a/raphtory-core/src/lib.rs +++ b/raphtory-core/src/lib.rs @@ -24,24 +24,8 @@ //! * `macOS` //! -use std::{thread, time::Duration}; - -use parking_lot::RwLock; - pub mod entities; #[cfg(feature = "python")] mod python; pub mod storage; pub mod utils; - -pub(crate) fn loop_lock_write(l: &RwLock) -> parking_lot::RwLockWriteGuard<'_, A> { - const MAX_BACKOFF_US: u64 = 1000; // 1ms max - let mut backoff_us = 1; - loop { - if let Some(guard) = l.try_write_for(Duration::from_micros(50)) { - return guard; - } - thread::park_timeout(Duration::from_micros(backoff_us)); - backoff_us = (backoff_us * 2).min(MAX_BACKOFF_US); - } -} diff --git a/raphtory-core/src/storage/lazy_vec.rs b/raphtory-core/src/storage/lazy_vec.rs index b5f5cfe5ad..0c75965c63 100644 --- a/raphtory-core/src/storage/lazy_vec.rs +++ b/raphtory-core/src/storage/lazy_vec.rs @@ -1,6 +1,6 @@ use arrow_array::BooleanArray; use serde::{Deserialize, Serialize}; -use std::{fmt::Debug, iter}; +use std::fmt::Debug; #[derive(thiserror::Error, Debug, PartialEq)] #[error("Cannot set previous value '{previous_value:?}' to '{new_value:?}' in position '{index}'")] @@ -277,7 +277,7 @@ where #[cfg(test)] fn iter(&self) -> Box + Send + '_> { match self { - LazyVec::Empty => Box::new(iter::empty()), + LazyVec::Empty => Box::new(std::iter::empty()), LazyVec::LazyVec1(default, tuples) => { Box::new(tuples.iter().map(|value| value.unwrap_or(default))) } @@ -290,7 +290,7 @@ where #[cfg(test)] fn iter_opt(&self) -> Box> + Send + '_> { match self { - LazyVec::Empty => Box::new(iter::empty()), + LazyVec::Empty => Box::new(std::iter::empty()), LazyVec::LazyVec1(_, tuples) => Box::new(tuples.iter()), LazyVec::LazyVecN(_, vector) => Box::new(vector.iter()), } diff --git a/raphtory-core/src/storage/mod.rs b/raphtory-core/src/storage/mod.rs index 8807dcf115..d1673769d6 100644 --- a/raphtory-core/src/storage/mod.rs +++ b/raphtory-core/src/storage/mod.rs @@ -3,6 +3,7 @@ use crate::{ storage::lazy_vec::IllegalSet, }; use arrow_array::{ + builder::StringViewBuilder, cast::AsArray, types::{ Float32Type, Float64Type, Int32Type, Int64Type, UInt16Type, UInt32Type, UInt64Type, @@ -29,7 +30,7 @@ pub mod lazy_vec; pub mod locked_view; pub mod timeindex; -#[derive(Debug, Serialize, Deserialize, PartialEq, Default)] +#[derive(Debug, Default)] pub struct TColumns { t_props_log: Vec, num_rows: usize, @@ -128,7 +129,7 @@ impl TColumns { } } -#[derive(Debug, Serialize, Deserialize, PartialEq)] +#[derive(Debug)] pub enum PropColumn { Empty(usize), Bool(LazyVec), @@ -233,63 +234,6 @@ impl PropColumn { } } - pub fn append(&mut self, col: &dyn Array, mask: &BooleanArray) { - self.init_from_prop_type(col.data_type()); - match self { - PropColumn::Bool(v) => v.append(col.as_boolean(), mask), - PropColumn::I64(v) => v.append(col.as_primitive::(), mask), - PropColumn::U32(v) => v.append(col.as_primitive::(), mask), - PropColumn::U64(v) => v.append(col.as_primitive::(), mask), - PropColumn::F32(v) => v.append(col.as_primitive::(), mask), - PropColumn::F64(v) => v.append(col.as_primitive::(), mask), - PropColumn::Str(v) => { - let iter = col - .as_string_opt::() - .map(|iter| Either::Left(iter.into_iter())) - .or_else(|| { - col.as_string_opt::() - .map(|iter| Either::Right(iter.into_iter())) - }) - .expect("Failed to cast to StringArray"); - v.append(iter.map(|opt| opt.map(ArcStr::from)), mask) - } - PropColumn::U8(v) => v.append(col.as_primitive::(), mask), - PropColumn::U16(v) => v.append(col.as_primitive::(), mask), - PropColumn::I32(v) => v.append(col.as_primitive::(), mask), - PropColumn::NDTime(v) => v.append( - col.as_any() - .downcast_ref::() - .expect("Failed to cast to Timestamp") - .iter() - .map(|value| DateTime::from_timestamp_millis(value?).map(|dt| dt.naive_utc())), - mask, - ), - PropColumn::DTime(v) => v.append( - col.as_any() - .downcast_ref::() - .expect("failed to cast to Timestamp") - .iter() - .map(|value| DateTime::from_timestamp_millis(value?)), - mask, - ), - PropColumn::Decimal(v) => v.append( - // this needs a review if it actually works - col.as_any() - .downcast_ref::() - .expect("Failed to cast to Timestamp") - .iter() - .map(|bd| bd.map(BigDecimal::from)), - mask, - ), - // PropColumn::List(v) => v.append(col, mask), - // PropColumn::Map(v) => v.append(col, mask), - // - // PropColumn::Array(v) => v.append(col, mask), - // PropColumn::Empty(_) => {} - _ => { /* ignore unsupported types for now */ } - } - } - pub fn upsert(&mut self, index: usize, prop: Prop) -> Result<(), TPropColumnError> { self.init_empty_col(&prop); match (self, prop) { diff --git a/raphtory-storage/src/graph/edges/edges.rs b/raphtory-storage/src/graph/edges/edges.rs index 2648517b2a..b15f9c35f0 100644 --- a/raphtory-storage/src/graph/edges/edges.rs +++ b/raphtory-storage/src/graph/edges/edges.rs @@ -72,17 +72,8 @@ impl<'a> EdgesStorageRef<'a> { self, ) -> impl ParallelIterator + use<'a>)> + 'a { match self { - EdgesStorageRef::Mem(storage) => Iter2::I1( - storage - .segmented_par_iter() - .map(|(segment, iter)| (segment, Iter2::I1(iter))), - ), - EdgesStorageRef::Unlocked(edges) => Iter2::I2( - edges - .storage() - .segmented_par_iter() - .map(|(segment, iter)| (segment, Iter2::I2(iter))), - ), + EdgesStorageRef::Mem(storage) => Iter2::I1(storage.storage().row_groups_par_iter()), + EdgesStorageRef::Unlocked(edges) => Iter2::I2(edges.storage().row_groups_par_iter()), } } diff --git a/raphtory-storage/src/graph/graph.rs b/raphtory-storage/src/graph/graph.rs index 38176450d2..b5108741d3 100644 --- a/raphtory-storage/src/graph/graph.rs +++ b/raphtory-storage/src/graph/graph.rs @@ -14,7 +14,7 @@ use db4_graph::TemporalGraph; use raphtory_api::core::entities::{properties::meta::Meta, LayerIds, LayerVariants, EID, VID}; use raphtory_core::entities::{nodes::node_ref::NodeRef, properties::graph_meta::GraphMeta}; use std::{fmt::Debug, iter, sync::Arc}; -use storage::{Extension, GraphPropEntry}; +use storage::{pages::SegmentCounts, state::StateIndex, Extension, GraphPropEntry}; use thiserror::Error; #[derive(Clone, Debug)] @@ -29,6 +29,12 @@ pub enum Immutable { ReadLockedImmutable, } +impl From> for GraphStorage { + fn from(value: Arc) -> Self { + Self::Unlocked(value) + } +} + impl From for GraphStorage { fn from(value: TemporalGraph) -> Self { Self::Unlocked(Arc::new(value)) @@ -259,4 +265,22 @@ impl GraphStorage { GraphStorage::Unlocked(storage) => storage.extension(), } } + + pub fn node_segment_counts(&self) -> SegmentCounts { + match self { + GraphStorage::Mem(storage) => storage.graph.storage().node_segment_counts(), + GraphStorage::Unlocked(storage) => storage.storage().node_segment_counts(), + } + } + + pub fn node_state_index(&self) -> StateIndex { + self.node_segment_counts().into() + } + + pub fn edge_segment_counts(&self) -> SegmentCounts { + match self { + GraphStorage::Mem(storage) => storage.graph.storage().edge_segment_counts(), + GraphStorage::Unlocked(storage) => storage.storage().edge_segment_counts(), + } + } } diff --git a/raphtory-storage/src/graph/nodes/nodes_ref.rs b/raphtory-storage/src/graph/nodes/nodes_ref.rs index 1aec0c1d8d..f170f8dafd 100644 --- a/raphtory-storage/src/graph/nodes/nodes_ref.rs +++ b/raphtory-storage/src/graph/nodes/nodes_ref.rs @@ -44,4 +44,12 @@ impl<'a> NodesStorageEntry<'a> { pub fn iter(&self) -> impl Iterator> { for_all_variants!(self, nodes => nodes.iter()) } + + /// Returns a parallel iterator over nodes row groups + /// the (usize) part is the row group not the segment + pub fn row_groups_par_iter( + &self, + ) -> impl ParallelIterator + '_)> { + for_all_variants!(self, nodes => nodes.row_groups_par_iter()) + } } diff --git a/raphtory-storage/src/mutation/addition_ops_ext.rs b/raphtory-storage/src/mutation/addition_ops_ext.rs index 70cba75036..4ca8fd4498 100644 --- a/raphtory-storage/src/mutation/addition_ops_ext.rs +++ b/raphtory-storage/src/mutation/addition_ops_ext.rs @@ -20,7 +20,7 @@ use raphtory_core::{ }; use storage::{ pages::{node_page::writer::node_info_as_props, session::WriteSession}, - persist::strategy::PersistentStrategy, + persist::strategy::{Config, PersistentStrategy}, properties::props_meta_writer::PropsMetaWriter, resolver::GIDResolverOps, Extension, WalImpl, ES, GS, NS, @@ -234,9 +234,11 @@ impl InternalAdditionOps for TemporalGraph { match id { NodeRef::External(id) => { let id = self.logical_to_physical.get_or_init(id, || { - self.node_count - .fetch_add(1, std::sync::atomic::Ordering::Relaxed) - .into() + let (seg, pos) = self.storage().nodes().reserve_free_pos( + self.event_counter + .fetch_add(1, std::sync::atomic::Ordering::Relaxed), + ); + pos.as_vid(seg, self.extension().max_node_page_len()) })?; Ok(id) diff --git a/raphtory/Cargo.toml b/raphtory/Cargo.toml index b710a80068..3a1cdec585 100644 --- a/raphtory/Cargo.toml +++ b/raphtory/Cargo.toml @@ -107,6 +107,7 @@ dotenv = { workspace = true } # for vector testing streaming-stats = { workspace = true } indoc = { workspace = true } raphtory = { workspace = true, features = ["test-utils"] } # enable test-utils for integration tests +tikv-jemallocator = "0.6.1" [build-dependencies] prost-build = { workspace = true, optional = true } diff --git a/raphtory/src/algorithms/centrality/betweenness.rs b/raphtory/src/algorithms/centrality/betweenness.rs index 4e8865fe3a..fbf79dccd9 100644 --- a/raphtory/src/algorithms/centrality/betweenness.rs +++ b/raphtory/src/algorithms/centrality/betweenness.rs @@ -1,6 +1,9 @@ use crate::{ core::entities::VID, - db::{api::state::NodeState, graph::node::NodeView}, + db::{ + api::state::{Index, NodeState}, + graph::node::NodeView, + }, prelude::{GraphViewOps, NodeViewOps}, }; use std::collections::{HashMap, VecDeque}; @@ -21,8 +24,9 @@ pub fn betweenness_centrality<'graph, G: GraphViewOps<'graph>>( k: Option, normalized: bool, ) -> NodeState<'graph, f64, G> { + let index = Index::for_graph(g); // Initialize a hashmap to store betweenness centrality values. - let mut betweenness: Vec = vec![0.0; g.unfiltered_num_nodes()]; + let mut betweenness: Vec = vec![0.0; g.count_nodes()]; // Get the nodes and the total number of nodes in the graph. let nodes = g.nodes(); @@ -31,49 +35,47 @@ pub fn betweenness_centrality<'graph, G: GraphViewOps<'graph>>( // Main loop over each node to compute betweenness centrality. for node in nodes.iter().take(k_sample) { - let mut stack = Vec::new(); - let mut predecessors: HashMap> = HashMap::new(); - let mut sigma: HashMap = HashMap::new(); - let mut dist: HashMap = HashMap::new(); + let mut stack: Vec = Vec::new(); + let mut predecessors: HashMap> = HashMap::new(); + let mut sigma: HashMap = HashMap::new(); + let mut dist: HashMap = HashMap::new(); let mut queue = VecDeque::new(); // Initialize distance and sigma values for each node. for node in nodes.iter() { - dist.insert(node.node.0, -1); - sigma.insert(node.node.0, 0.0); + dist.insert(node.node, -1); + sigma.insert(node.node, 0.0); } - dist.insert(node.node.0, 0); - sigma.insert(node.node.0, 1.0); - queue.push_back(node.node.0); + dist.insert(node.node, 0); + sigma.insert(node.node, 1.0); + queue.push_back(node.node); // BFS loop to find shortest paths. while let Some(current_node_id) = queue.pop_front() { stack.push(current_node_id); - for neighbor in - NodeView::new_internal(g.clone(), VID::from(current_node_id)).out_neighbours() - { + for neighbor in NodeView::new_internal(g.clone(), current_node_id).out_neighbours() { // Path discovery - if dist[&neighbor.node.0] < 0 { - queue.push_back(neighbor.node.0); - dist.insert(neighbor.node.0, dist[¤t_node_id] + 1); + if dist[&neighbor.node] < 0 { + queue.push_back(neighbor.node); + dist.insert(neighbor.node, dist[¤t_node_id] + 1); } // Path counting - if dist[&neighbor.node.0] == dist[¤t_node_id] + 1 { + if dist[&neighbor.node] == dist[¤t_node_id] + 1 { sigma.insert( - neighbor.node.0, - sigma[&neighbor.node.0] + sigma[¤t_node_id], + neighbor.node, + sigma[&neighbor.node] + sigma[¤t_node_id], ); predecessors - .entry(neighbor.node.0) + .entry(neighbor.node) .or_default() .push(current_node_id); } } } - let mut delta: HashMap = HashMap::new(); + let mut delta: HashMap = HashMap::new(); for node in nodes.iter() { - delta.insert(node.node.0, 0.0); + delta.insert(node.node, 0.0); } // Accumulation @@ -83,8 +85,9 @@ pub fn betweenness_centrality<'graph, G: GraphViewOps<'graph>>( let new_delta_v = delta[v] + coeff; delta.insert(*v, new_delta_v); } - if w != node.node.0 { - betweenness[w] += delta[&w]; + if w != node.node { + let pos = index.index(&w).unwrap(); + betweenness[pos] += delta[&w]; } } } @@ -93,7 +96,8 @@ pub fn betweenness_centrality<'graph, G: GraphViewOps<'graph>>( if normalized { let factor = 1.0 / ((n as f64 - 1.0) * (n as f64 - 2.0)); for node in nodes.iter() { - betweenness[node.node.index()] *= factor; + let pos = index.index(&node.node).unwrap(); + betweenness[pos] *= factor; } } diff --git a/raphtory/src/algorithms/centrality/hits.rs b/raphtory/src/algorithms/centrality/hits.rs index f93c2fe0e3..6f4d3e251f 100644 --- a/raphtory/src/algorithms/centrality/hits.rs +++ b/raphtory/src/algorithms/centrality/hits.rs @@ -81,6 +81,12 @@ pub fn hits( let step2 = ATask::new(move |evv: &mut EvalNodeView| { let hub_score = evv.get().hub_score; let auth_score = evv.get().auth_score; + if evv.graph().base_graph.unfiltered_num_nodes() <= 10 { + println!( + "DEBUG step2: node={:?}, state_pos={}, hub_score={}, auth_score={}", + evv.node, evv.state_pos, hub_score, auth_score + ); + } for t in evv.out_neighbours() { t.update(&recv_hub_score, hub_score) } @@ -108,6 +114,16 @@ pub fn hits( evv.get_mut().hub_score = recv_auth_score / evv.read_global_state(&total_auth_score).unwrap(); + if evv.graph().base_graph.unfiltered_num_nodes() <= 10 { + println!( + "DEBUG step4: node={:?}, state_pos={}, new_hub={}, new_auth={}", + evv.node, + evv.state_pos, + evv.get().hub_score, + evv.get().auth_score + ); + } + let prev_hub_score = evv.prev().hub_score; let curr_hub_score = evv.get().hub_score; @@ -141,8 +157,16 @@ pub fn hits( vec![], vec![Job::new(step2), Job::new(step3), Job::new(step4), step5], None, - |_, _, _, local| { - NodeState::new_from_eval_mapped(g.clone(), local, |h| (h.hub_score, h.auth_score)) + |_, _, _, local, index| { + if g.unfiltered_num_nodes() <= 10 { + println!("\nDEBUG Final local state (index -> (hub, auth)):"); + for (i, h) in local.iter().enumerate() { + println!(" local[{}] = ({}, {})", i, h.hub_score, h.auth_score); + } + } + NodeState::new_from_eval_mapped_with_index(g.clone(), local, index, |h| { + (h.hub_score, h.auth_score) + }) }, threads, iter_count, diff --git a/raphtory/src/algorithms/centrality/pagerank.rs b/raphtory/src/algorithms/centrality/pagerank.rs index f9d10842c6..cf762530d5 100644 --- a/raphtory/src/algorithms/centrality/pagerank.rs +++ b/raphtory/src/algorithms/centrality/pagerank.rs @@ -161,7 +161,9 @@ pub fn unweighted_page_rank( vec![Job::new(step1)], vec![Job::new(step2), Job::new(step3), Job::new(step4), step5], Some(vec![PageRankState::new(num_nodes); num_nodes]), - |_, _, _, local| NodeState::new_from_eval_mapped(g.clone(), local, |v| v.score), + |_, _, _, local, index| { + NodeState::new_from_eval_mapped_with_index(g.clone(), local, index, |v| v.score) + }, threads, iter_count, None, diff --git a/raphtory/src/algorithms/components/in_components.rs b/raphtory/src/algorithms/components/in_components.rs index d2e6925575..71b1effe8a 100644 --- a/raphtory/src/algorithms/components/in_components.rs +++ b/raphtory/src/algorithms/components/in_components.rs @@ -37,7 +37,7 @@ struct InState { /// pub fn in_components(g: &G, threads: Option) -> NodeState<'static, Nodes<'static, G>, G> where - G: StaticGraphViewOps, + G: StaticGraphViewOps + std::fmt::Debug, { let ctx: Context = g.into(); let step1 = ATask::new(move |vv: &mut EvalNodeView| { @@ -71,12 +71,12 @@ where vec![Job::new(step1)], vec![], None, - |_, _, _, local: Vec| { - NodeState::new_from_eval_mapped(g.clone(), local, |v| { + |_, _, _, local: Vec, index| { + NodeState::new_from_eval_mapped_with_index(g.clone(), local, index, |v| { Nodes::new_filtered( g.clone(), g.clone(), - Some(Index::from_iter(v.in_components)), + Index::from_iter(v.in_components), None, ) }) @@ -127,6 +127,6 @@ pub fn in_component<'graph, G: GraphViewOps<'graph>, GH: GraphViewOps<'graph>>( node.base_graph.clone(), node.base_graph.clone(), distances.into(), - Some(Index::new(nodes)), + Index::Partial(nodes.into()), ) } diff --git a/raphtory/src/algorithms/components/out_components.rs b/raphtory/src/algorithms/components/out_components.rs index 3709764c7a..cb2edf0d5f 100644 --- a/raphtory/src/algorithms/components/out_components.rs +++ b/raphtory/src/algorithms/components/out_components.rs @@ -37,7 +37,7 @@ struct OutState { /// pub fn out_components(g: &G, threads: Option) -> NodeState<'static, Nodes<'static, G>, G> where - G: StaticGraphViewOps, + G: StaticGraphViewOps + std::fmt::Debug, { let ctx: Context = g.into(); let step1 = ATask::new(move |vv: &mut EvalNodeView| { @@ -71,12 +71,12 @@ where vec![Job::new(step1)], vec![], None, - |_, _, _, local: Vec| { - NodeState::new_from_eval_mapped(g.clone(), local, |v| { + |_, _, _, local: Vec, index| { + NodeState::new_from_eval_mapped_with_index(g.clone(), local, index, |v| { Nodes::new_filtered( g.clone(), g.clone(), - Some(Index::from_iter(v.out_components)), + Index::from_iter(v.out_components), None, ) }) @@ -127,6 +127,6 @@ pub fn out_component<'graph, G: GraphViewOps<'graph>, GH: GraphViewOps<'graph>>( node.base_graph.clone(), node.base_graph.clone(), distances.into(), - Some(Index::new(nodes)), + Index::Partial(nodes.into()), ) } diff --git a/raphtory/src/algorithms/components/scc.rs b/raphtory/src/algorithms/components/scc.rs index 5557d3f73c..49425fec03 100644 --- a/raphtory/src/algorithms/components/scc.rs +++ b/raphtory/src/algorithms/components/scc.rs @@ -1,7 +1,10 @@ use crate::{ core::entities::VID, db::{ - api::{state::NodeState, view::StaticGraphViewOps}, + api::{ + state::{Index, NodeState}, + view::StaticGraphViewOps, + }, graph::node::NodeView, }, prelude::*, @@ -148,12 +151,14 @@ where ); */ let groups = tarjan_scc(graph); + let index = Index::for_graph(graph); - let mut values = vec![usize::MAX; graph.unfiltered_num_nodes()]; + let mut values = vec![usize::MAX; graph.count_nodes()]; for (id, group) in groups.into_iter().enumerate() { - for VID(node) in group { - values[node] = id; + for vid in &group { + let pos = index.index(vid).unwrap(); + values[pos] = id; } } diff --git a/raphtory/src/algorithms/cores/k_core.rs b/raphtory/src/algorithms/cores/k_core.rs index 4640dbc0e1..a2709b6cfa 100644 --- a/raphtory/src/algorithms/cores/k_core.rs +++ b/raphtory/src/algorithms/cores/k_core.rs @@ -1,7 +1,10 @@ use crate::{ core::{entities::VID, state::compute_state::ComputeStateVec}, db::{ - api::view::{NodeViewOps, StaticGraphViewOps}, + api::{ + state::Index, + view::{NodeViewOps, StaticGraphViewOps}, + }, graph::views::node_subgraph::NodeSubgraph, task::{ context::Context, @@ -78,10 +81,10 @@ where vec![Job::new(step1)], vec![Job::read_only(step2)], None, - |_, _, _, local| { + |_, _, _, local, index| { g.nodes() .iter() - .filter(|node| local[node.node.0].alive) + .filter(|node| local[index.index(&node.node).unwrap()].alive) .map(|node| node.node) .collect() }, diff --git a/raphtory/src/algorithms/dynamics/temporal/epidemics.rs b/raphtory/src/algorithms/dynamics/temporal/epidemics.rs index e3d6cd3c50..46e8bb729e 100644 --- a/raphtory/src/algorithms/dynamics/temporal/epidemics.rs +++ b/raphtory/src/algorithms/dynamics/temporal/epidemics.rs @@ -9,6 +9,7 @@ use crate::{ }, prelude::*, }; +use either::Either; use indexmap::IndexSet; use rand::{ distr::{Bernoulli, Distribution}, @@ -252,7 +253,7 @@ where g.clone(), g.clone(), values.into(), - Some(Index::new(index)), + Index::Partial(index.into()), )) } diff --git a/raphtory/src/algorithms/embeddings/fast_rp.rs b/raphtory/src/algorithms/embeddings/fast_rp.rs index 6ff61b94f8..1608157857 100644 --- a/raphtory/src/algorithms/embeddings/fast_rp.rs +++ b/raphtory/src/algorithms/embeddings/fast_rp.rs @@ -97,8 +97,10 @@ where vec![Job::new(step1)], vec![Job::read_only(step2)], None, - |_, _, _, local: Vec| { - NodeState::new_from_eval_mapped(g.clone(), local, |v| v.embedding_state) + |_, _, _, local: Vec, index| { + NodeState::new_from_eval_mapped_with_index(g.clone(), local, index, |v| { + v.embedding_state + }) }, threads, num_iters, diff --git a/raphtory/src/algorithms/metrics/clustering_coefficient/local_clustering_coefficient_batch.rs b/raphtory/src/algorithms/metrics/clustering_coefficient/local_clustering_coefficient_batch.rs index 8c26674d67..fa8f66a7e1 100644 --- a/raphtory/src/algorithms/metrics/clustering_coefficient/local_clustering_coefficient_batch.rs +++ b/raphtory/src/algorithms/metrics/clustering_coefficient/local_clustering_coefficient_batch.rs @@ -5,6 +5,7 @@ use crate::{ view::*, }, }; +use either::Either; use indexmap::IndexSet; use itertools::Itertools; use rayon::prelude::*; @@ -47,6 +48,10 @@ pub fn local_clustering_coefficient_batch( )) }) .unzip(); - let result: Option<_> = Some(Index::new(index)); - NodeState::new(graph.clone(), graph.clone(), values.into(), result) + NodeState::new( + graph.clone(), + graph.clone(), + values.into(), + Index::Partial(index.into()), + ) } diff --git a/raphtory/src/algorithms/motifs/global_temporal_three_node_motifs.rs b/raphtory/src/algorithms/motifs/global_temporal_three_node_motifs.rs index 0be0541854..7e1bb351bf 100644 --- a/raphtory/src/algorithms/motifs/global_temporal_three_node_motifs.rs +++ b/raphtory/src/algorithms/motifs/global_temporal_three_node_motifs.rs @@ -242,7 +242,7 @@ where vec![Job::new(neighbourhood_update_step)], vec![Job::new(intersection_compute_step)], None, - |egs, _, _, _| { + |egs, _, _, _, _| { tri_mc.iter().map(|mc| egs.finalize::<[usize; 8], [usize;8], [usize; 8], ArrConst,8>>(mc)).collect_vec() }, threads, @@ -294,7 +294,7 @@ where vec![], vec![Job::new(star_count_step)], None, - |egs, _ , _ , _ | { + |egs, _ , _ , _ ,_| { triadic_motifs.iter().enumerate().map(|(i,tri)| { let mut tmp = egs.finalize::<[usize; 32], [usize;32], [usize; 32], ArrConst,32>>(&star_clone[i]) .iter().copied() diff --git a/raphtory/src/algorithms/motifs/local_temporal_three_node_motifs.rs b/raphtory/src/algorithms/motifs/local_temporal_three_node_motifs.rs index b91f1d9c75..5cbd025b4a 100644 --- a/raphtory/src/algorithms/motifs/local_temporal_three_node_motifs.rs +++ b/raphtory/src/algorithms/motifs/local_temporal_three_node_motifs.rs @@ -4,7 +4,7 @@ use crate::{ core::state::{accumulator_id::accumulators, compute_state::ComputeStateVec}, db::{ api::{ - state::NodeState, + state::{Index, NodeState}, view::{NodeViewOps, *}, }, graph::views::node_subgraph::NodeSubgraph, @@ -18,6 +18,7 @@ use crate::{ }; use itertools::Itertools; use num_traits::Zero; +use rand::seq::index; use raphtory_api::core::entities::VID; use rayon::prelude::*; use rustc_hash::FxHashSet; @@ -212,20 +213,12 @@ where for v in u.neighbours() { // Find triangles on the UV edge let intersection_nbs = { - match ( - u.entry(&neighbours_set) - .read_ref() - .unwrap_or(&FxHashSet::default()), - v.entry(&neighbours_set) - .read_ref() - .unwrap_or(&FxHashSet::default()), - ) { - (u_set, v_set) => { - let intersection = - u_set.intersection(v_set).cloned().collect::>(); - intersection - } - } + let default = FxHashSet::default(); + let u_entry = u.entry(&neighbours_set); + let u_set = u_entry.read_ref().unwrap_or(&default); + let v_entry = v.entry(&neighbours_set); + let v_set = v_entry.read_ref().unwrap_or(&default); + u_set.intersection(v_set).cloned().collect::>() }; if intersection_nbs.is_empty() { @@ -303,11 +296,11 @@ where vec![Job::new(neighbourhood_update_step)], vec![Job::new(intersection_compute_step)], None, - |_, _, _els, mut local| { + |_, _, _els, mut local, index| { let mut tri_motifs = HashMap::new(); - for node in graph.nodes() { + for node in kcore_subgraph.nodes() { let v_gid = node.name(); - let triangle = mem::take(&mut local[node.node.0].triangle); + let triangle = mem::take(&mut local[index.index(&node.node).unwrap()].triangle); if triangle.is_empty() { tri_motifs.insert(v_gid.clone(), vec![[0; 8]; delta_len]); } else { @@ -365,12 +358,12 @@ where vec![Job::new(star_motif_step)], vec![], None, - |_, _, _, local| { + |_, _, _, local, index| { let values: Vec<_> = g .nodes() .par_iter() .map(|n| { - let mc = &local[n.node.index()]; + let mc = &local[index.index(&n.node).unwrap()]; let v_gid = n.name(); let triangles = triadic_motifs .get(&v_gid) diff --git a/raphtory/src/algorithms/motifs/triangle_count.rs b/raphtory/src/algorithms/motifs/triangle_count.rs index 2e4381bbd6..de8ad3d6cd 100644 --- a/raphtory/src/algorithms/motifs/triangle_count.rs +++ b/raphtory/src/algorithms/motifs/triangle_count.rs @@ -107,7 +107,7 @@ pub fn triangle_count(graph: &G, threads: Option) init_tasks, tasks, None, - |egs, _, _, _| egs.finalize(&count), + |egs, _, _, _, _| egs.finalize(&count), threads, 1, None, diff --git a/raphtory/src/algorithms/motifs/triplet_count.rs b/raphtory/src/algorithms/motifs/triplet_count.rs index c85a2dc462..590f899224 100644 --- a/raphtory/src/algorithms/motifs/triplet_count.rs +++ b/raphtory/src/algorithms/motifs/triplet_count.rs @@ -104,7 +104,7 @@ pub fn triplet_count(g: &G, threads: Option) -> us vec![], vec![Job::new(step1)], None, - |egs, _, _, _| egs.finalize(&count), + |egs, _, _, _, _| egs.finalize(&count), threads, 1, None, diff --git a/raphtory/src/algorithms/pathing/dijkstra.rs b/raphtory/src/algorithms/pathing/dijkstra.rs index 27e93a13c5..92efacbba4 100644 --- a/raphtory/src/algorithms/pathing/dijkstra.rs +++ b/raphtory/src/algorithms/pathing/dijkstra.rs @@ -9,6 +9,7 @@ use crate::{ errors::GraphError, prelude::*, }; +use either::Either; use indexmap::IndexSet; use raphtory_api::core::{ entities::{ @@ -65,6 +66,7 @@ pub fn dijkstra_single_source_shortest_paths, direction: Direction, ) -> Result), G>, GraphError> { + let index = Index::for_graph(g); let source_ref = source.as_node_ref(); let source_node = match g.node(source_ref) { Some(src) => src, @@ -85,10 +87,11 @@ pub fn dijkstra_single_source_shortest_paths, Vec<_>) = paths .into_iter() .map(|(id, (cost, path))| { - let nodes = Nodes::new_filtered(g.clone(), g.clone(), Some(Index::new(path)), None); + let nodes = + Nodes::new_filtered(g.clone(), g.clone(), Index::Partial(path.into()), None); (id, (cost, nodes)) }) .unzip(); @@ -197,6 +202,6 @@ pub fn dijkstra_single_source_shortest_paths, T: AsNodeRef } } NodeState::new_from_map(g.clone(), paths, |v| { - Nodes::new_filtered(g.clone(), g.clone(), Some(Index::from_iter(v)), None) + Nodes::new_filtered(g.clone(), g.clone(), Index::from_iter(v), None) }) } diff --git a/raphtory/src/algorithms/pathing/temporal_reachability.rs b/raphtory/src/algorithms/pathing/temporal_reachability.rs index 7368f171e2..560f3c1f99 100644 --- a/raphtory/src/algorithms/pathing/temporal_reachability.rs +++ b/raphtory/src/algorithms/pathing/temporal_reachability.rs @@ -181,25 +181,25 @@ pub fn temporally_reachable_nodes( })); let mut runner: TaskRunner = TaskRunner::new(ctx); - let result: HashMap> = runner.run( + let (index, values) = runner.run( vec![Job::new(step1)], vec![Job::new(step2), step3], None, - |_, ess, _, _| { - ess.finalize(&taint_history, |taint_history| { + |_, ess, _, _, index| { + let data = ess.finalize_vec(&taint_history, |taint_history| { let mut hist = taint_history .into_iter() .map(|tmsg| (tmsg.event_time, tmsg.src_node)) .collect_vec(); hist.sort(); hist - }) + }); + (index, data) }, threads, max_hops, None, None, ); - let result: FxHashMap<_, _> = result.into_iter().map(|(k, v)| (VID(k), v)).collect(); - NodeState::new_from_map(g.clone(), result, |v| v) + NodeState::new_from_eval_with_index(g.clone(), values, index) } diff --git a/raphtory/src/core/state/compute_state.rs b/raphtory/src/core/state/compute_state.rs index f604dd81d3..b4ad7e94d5 100644 --- a/raphtory/src/core/state/compute_state.rs +++ b/raphtory/src/core/state/compute_state.rs @@ -26,7 +26,11 @@ pub trait ComputeState: std::fmt::Debug + Clone + Send + Sync { i: usize, ) -> Option<&A>; - fn iter(&self, ss: usize, extend_to: usize) -> Box + '_>; + fn iter( + &self, + ss: usize, + extend_to: usize, + ) -> Box + Send + '_>; fn agg>(&mut self, ss: usize, a: IN, ki: usize) where @@ -118,7 +122,11 @@ impl ComputeState for ComputeStateVec { vec.current(ss).get(i) } - fn iter(&self, ss: usize, extend_to: usize) -> Box + '_> { + fn iter( + &self, + ss: usize, + extend_to: usize, + ) -> Box + Send + '_> { let vec = self .current() .as_any() diff --git a/raphtory/src/core/state/mod.rs b/raphtory/src/core/state/mod.rs index 834060f451..89adcc16ba 100644 --- a/raphtory/src/core/state/mod.rs +++ b/raphtory/src/core/state/mod.rs @@ -295,8 +295,9 @@ mod state_test { let mut actual = part1_state .clone() - .finalize(&sum, 0, &g, |c| c) + .finalize_vec(&sum, 0, &g, |c| c) .into_iter() + .enumerate() .collect_vec(); actual.sort(); @@ -305,8 +306,9 @@ mod state_test { let mut actual = part1_state .clone() - .finalize(&min, 0, &g, |c| c) + .finalize_vec(&min, 0, &g, |c| c) .into_iter() + .enumerate() .collect_vec(); actual.sort(); @@ -318,8 +320,9 @@ mod state_test { let mut actual = part2_state .clone() - .finalize(&sum, 0, &g, |c| c) + .finalize_vec(&sum, 0, &g, |c| c) .into_iter() + .enumerate() .collect_vec(); actual.sort(); @@ -328,8 +331,9 @@ mod state_test { let mut actual = part2_state .clone() - .finalize(&min, 0, &g, |c| c) + .finalize_vec(&min, 0, &g, |c| c) .into_iter() + .enumerate() .collect_vec(); actual.sort(); @@ -342,8 +346,9 @@ mod state_test { ShuffleComputeState::merge_mut(&mut part1_state, &part2_state, sum, 0); let mut actual = part1_state .clone() - .finalize(&sum, 0, &g, |c| c) + .finalize_vec(&sum, 0, &g, |c| c) .into_iter() + .enumerate() .collect_vec(); actual.sort(); @@ -360,8 +365,9 @@ mod state_test { ShuffleComputeState::merge_mut(&mut part1_state, &part2_state, min, 0); let mut actual = part1_state .clone() - .finalize(&min, 0, &g, |c| c) + .finalize_vec(&min, 0, &g, |c| c) .into_iter() + .enumerate() .collect_vec(); actual.sort(); diff --git a/raphtory/src/core/state/morcel_state.rs b/raphtory/src/core/state/morcel_state.rs index f3a8c59481..6a54db9eed 100644 --- a/raphtory/src/core/state/morcel_state.rs +++ b/raphtory/src/core/state/morcel_state.rs @@ -138,7 +138,7 @@ impl MorcelComputeState { &self, ss: usize, agg_ref: &AccId, - ) -> Box> + '_> + ) -> Box> + Send + '_> where A: StateType, { diff --git a/raphtory/src/core/state/shuffle_state.rs b/raphtory/src/core/state/shuffle_state.rs index 715001395a..cb4bed57e2 100644 --- a/raphtory/src/core/state/shuffle_state.rs +++ b/raphtory/src/core/state/shuffle_state.rs @@ -1,3 +1,7 @@ +use either::Either; +use raphtory_api::iter::IntoDynBoxed; +use raphtory_core::utils::iter::GenLockedIter; + use super::{ accumulator_id::AccId, compute_state::ComputeState, @@ -128,27 +132,27 @@ impl ShuffleComputeState { pub fn accumulate_into>( &mut self, ss: usize, - p_id: usize, + state_pos: usize, a: IN, agg_ref: &AccId, ) where A: StateType, { - let (morcel_id, offset) = self.resolve_pid(p_id); + let (morcel_id, offset) = self.resolve_pid(state_pos); self.parts[morcel_id].accumulate_into(ss, offset, a, agg_ref) } pub fn read_with_pid>( &self, ss: usize, - p_id: usize, + state_pos: usize, agg_ref: &AccId, ) -> Option where A: StateType, OUT: std::fmt::Debug, { - let (morcel_id, offset) = self.resolve_pid(p_id); + let (morcel_id, offset) = self.resolve_pid(state_pos); self.parts[morcel_id].read::(offset, agg_ref.id(), ss) } @@ -167,27 +171,27 @@ impl ShuffleComputeState { pub fn read>( &self, ss: usize, - p_id: usize, + state_pos: usize, agg_ref: &AccId, ) -> Option where A: StateType, OUT: std::fmt::Debug, { - let (morcel_id, offset) = self.resolve_pid(p_id); + let (morcel_id, offset) = self.resolve_pid(state_pos); self.parts[morcel_id].read::(offset, agg_ref.id(), ss) } pub fn read_ref>( &self, ss: usize, - p_id: usize, + state_pos: usize, agg_ref: &AccId, ) -> Option<&A> where A: StateType, { - let (morcel_id, offset) = self.resolve_pid(p_id); + let (morcel_id, offset) = self.resolve_pid(state_pos); self.parts[morcel_id].read_ref::(offset, agg_ref.id(), ss) } @@ -204,13 +208,22 @@ impl ShuffleComputeState { .read::(GLOBAL_STATE_KEY, agg_ref.id(), ss) } - pub fn finalize, G: StaticGraphViewOps>( + pub fn finalize< + A, + B, + F, + IN, + OUT, + ACC: Accumulator, + G: StaticGraphViewOps, + C: FromIterator<(usize, B)>, + >( &self, agg_def: &AccId, ss: usize, _g: &G, f: F, - ) -> HashMap + ) -> C where OUT: StateType, A: StateType, @@ -225,12 +238,33 @@ impl ShuffleComputeState { }) .collect() } + pub fn finalize_vec, G: StaticGraphViewOps>( + &self, + agg_def: &AccId, + ss: usize, + _g: &G, + f: F, + ) -> Vec + where + OUT: StateType, + A: StateType, + F: Fn(OUT) -> B + Copy, + { + self.iter(ss, *agg_def) + .map(|(_, a)| { + let out = a + .map(|a| ACC::finish(a)) + .unwrap_or_else(|| ACC::finish(&ACC::zero())); + f(out) + }) + .collect() + } pub fn iter<'a, A: StateType, IN: 'a, OUT: 'a, ACC: Accumulator>( &'a self, ss: usize, acc_id: AccId, - ) -> impl Iterator)> + 'a { + ) -> impl Iterator)> + Send + 'a { self.parts .iter() .flat_map(move |part| part.iter(ss, &acc_id)) @@ -312,6 +346,24 @@ impl EvalShardState { } } + pub fn finalize_vec>( + self, + agg_def: &AccId, + f: F, + ) -> Vec + where + OUT: StateType, + A: StateType, + F: Fn(OUT) -> B + Copy, + { + let inner = self.shard_states.consume(); + if let Ok(inner) = inner { + inner.finalize_vec(agg_def, self.ss, &self.g, f) + } else { + vec![] + } + } + pub fn values(&self) -> &Shard { &self.shard_states } diff --git a/raphtory/src/db/api/state/group_by.rs b/raphtory/src/db/api/state/group_by.rs index e2b2361c4c..d4b7cb53d6 100644 --- a/raphtory/src/db/api/state/group_by.rs +++ b/raphtory/src/db/api/state/group_by.rs @@ -6,6 +6,7 @@ use crate::{ prelude::{GraphViewOps, NodeStateOps}, }; use dashmap::DashMap; +use either::Either; use indexmap::IndexSet; use raphtory_api::core::entities::VID; use rayon::prelude::*; @@ -37,12 +38,7 @@ impl<'graph, V: Hash + Eq + Send + Sync + Clone, G: GraphViewOps<'graph>> NodeGr self.groups.iter().map(|(v, nodes)| { ( v, - Nodes::new_filtered( - self.graph.clone(), - self.graph.clone(), - Some(nodes.clone()), - None, - ), + Nodes::new_filtered(self.graph.clone(), self.graph.clone(), nodes.clone(), None), ) }) } @@ -83,12 +79,7 @@ impl<'graph, V: Hash + Eq + Send + Sync + Clone, G: GraphViewOps<'graph>> NodeGr self.groups.get(index).map(|(v, nodes)| { ( v, - Nodes::new_filtered( - self.graph.clone(), - self.graph.clone(), - Some(nodes.clone()), - None, - ), + Nodes::new_filtered(self.graph.clone(), self.graph.clone(), nodes.clone(), None), ) }) } diff --git a/raphtory/src/db/api/state/lazy_node_state.rs b/raphtory/src/db/api/state/lazy_node_state.rs index 9ab1632ec8..bdf061b75a 100644 --- a/raphtory/src/db/api/state/lazy_node_state.rs +++ b/raphtory/src/db/api/state/lazy_node_state.rs @@ -15,12 +15,14 @@ use crate::{ }, prelude::*, }; +use either::Either; use indexmap::IndexSet; use rayon::prelude::*; use std::{ borrow::Borrow, fmt::{Debug, Formatter}, }; +use storage::state::StateIndex; #[derive(Clone)] pub struct LazyNodeState<'graph, Op, G, GH = G> { @@ -162,7 +164,7 @@ impl<'graph, Op: NodeOp + 'graph, G: GraphViewOps<'graph>, GH: GraphViewOps<'gra self.nodes.base_graph.clone(), self.nodes.graph.clone(), values.into(), - Some(Index::new(keys)), + Index::Partial(keys.into()), ) } else { let values = self.collect_vec(); @@ -170,7 +172,7 @@ impl<'graph, Op: NodeOp + 'graph, G: GraphViewOps<'graph>, GH: GraphViewOps<'gra self.nodes.base_graph.clone(), self.nodes.graph.clone(), values.into(), - None, + Index::for_graph(self.nodes.graph.clone()), ) } } @@ -268,34 +270,6 @@ impl<'graph, Op: NodeOp + 'graph, G: GraphViewOps<'graph>, GH: GraphViewOps<'gra .map(move |node| (node, self.op.apply(&storage, node.node))) } - fn get_by_index( - &self, - index: usize, - ) -> Option<( - NodeView<'_, &Self::BaseGraph, &Self::Graph>, - Self::Value<'_>, - )> { - if self.graph().filtered() { - self.iter().nth(index) - } else { - let vid = match self.graph().node_list() { - NodeList::All { len } => { - if index < len { - VID(index) - } else { - return None; - } - } - NodeList::List { elems } => elems.key(index)?, - }; - let cg = self.graph().core_graph(); - Some(( - NodeView::new_one_hop_filtered(self.base_graph(), self.graph(), vid), - self.op.apply(cg, vid), - )) - } - } - fn get_by_node(&self, node: N) -> Option> { let node = (&self.graph()).node(node); node.map(|node| self.op.apply(self.graph().core_graph(), node.node)) diff --git a/raphtory/src/db/api/state/node_state.rs b/raphtory/src/db/api/state/node_state.rs index c399f12913..f1c465a9be 100644 --- a/raphtory/src/db/api/state/node_state.rs +++ b/raphtory/src/db/api/state/node_state.rs @@ -22,96 +22,140 @@ use std::{ marker::PhantomData, sync::Arc, }; +use storage::state::StateIndex; -#[derive(Debug, Default)] -pub struct Index { - index: Arc>, +#[derive(Debug)] +pub enum Index { + Full(Arc>), + Partial(Arc>), +} + +impl From> for Index { + fn from(index: StateIndex) -> Self { + Self::Full(index.into()) + } +} + +impl Default for Index { + fn default() -> Self { + Self::Partial(Arc::new(Default::default())) + } } impl Clone for Index { fn clone(&self) -> Self { - let index = self.index.clone(); - Self { index } + match self { + Index::Full(index) => Index::Full(index.clone()), + Index::Partial(index) => Index::Partial(index.clone()), + } } } impl + From + Send + Sync> FromIterator for Index { fn from_iter>(iter: T) -> Self { - Self { - index: Arc::new(IndexSet::from_iter(iter)), - } + Self::Partial(Arc::new(IndexSet::from_iter(iter))) } } impl Index { - pub fn for_graph<'graph>(graph: impl GraphViewOps<'graph>) -> Option { + pub fn for_graph<'graph>(graph: impl GraphViewOps<'graph>) -> Self { if graph.filtered() { if graph.node_list_trusted() { match graph.node_list() { - NodeList::All { .. } => None, - NodeList::List { elems } => Some(elems), + NodeList::All { .. } => { + Self::Full(graph.core_graph().node_state_index().into()) + } + NodeList::List { elems } => elems, } } else { - Some(Self::from_iter(graph.nodes().iter().map(|node| node.node))) + Self::from_iter(graph.nodes().iter().map(|node| node.node)) } } else { - None + Self::Full(graph.core_graph().node_state_index().into()) } } } impl + From + Send + Sync> Index { pub fn new(keys: impl Into>>) -> Self { - Self { index: keys.into() } + Self::Partial(keys.into()) } #[inline] pub fn iter(&self) -> impl Iterator + '_ { - self.index.iter().copied() + match self { + Index::Full(index) => Either::Left(index.iter()), + Index::Partial(index) => Either::Right(index.iter().copied()), + } } - pub fn into_par_iter(self) -> impl IndexedParallelIterator { - (0..self.len()) - .into_par_iter() - .map(move |i| *self.index.get_index(i).unwrap()) + pub fn into_par_iter(self) -> impl ParallelIterator { + match self { + Index::Full(index) => Either::Left(index.into_par_iter().map(|(_, k)| k)), + Index::Partial(index) => Either::Right( + (0..index.len()) + .into_par_iter() + .map(move |i| *index.get_index(i).unwrap()), + ), + } } pub fn into_iter(self) -> impl Iterator { - (0..self.len()).map(move |i| *self.index.get_index(i).unwrap()) + match self { + Index::Full(index) => Either::Left(index.arc_into_iter().map(|(_, k)| k)), + Index::Partial(index) => { + Either::Right((0..index.len()).map(move |i| *index.get_index(i).unwrap())) + } + } } #[inline] pub fn index(&self, key: &K) -> Option { - self.index.get_index_of(key) - } - - #[inline] - pub fn key(&self, index: usize) -> Option { - self.index.get_index(index).copied() + // self.index.get_index_of(key) + match self { + Index::Full(index) => index.resolve(*key), + Index::Partial(index) => index.get_index_of(key), + } } #[inline] pub fn len(&self) -> usize { - self.index.len() + match self { + Index::Full(index) => index.len(), + Index::Partial(index) => index.len(), + } } pub fn is_empty(&self) -> bool { - self.index.is_empty() + self.len() == 0 } #[inline] pub fn contains(&self, key: &K) -> bool { - self.index.contains(key) + match self { + Index::Full(index) => index.resolve(*key).is_some(), + Index::Partial(index) => index.contains(key), + } } - pub fn par_iter(&self) -> impl IndexedParallelIterator + '_ { - (0..self.len()) - .into_par_iter() - .map(move |i| *self.index.get_index(i).unwrap()) + pub fn par_iter(&self) -> impl ParallelIterator + '_ { + match self { + Index::Full(index) => Either::Left(index.par_iter()), + Index::Partial(index) => Either::Right( + (0..index.len()) + .into_par_iter() + .map(move |i| (i, *index.get_index(i).unwrap())), + ), + } } pub fn intersection(&self, other: &Self) -> Self { - self.index.intersection(&other.index).copied().collect() + match (self, other) { + (Self::Full(_), Self::Partial(a)) => Self::Partial(a.clone()), + (Self::Partial(a), Self::Full(_)) => Self::Partial(a.clone()), + (Self::Partial(a), Self::Partial(b)) => a.intersection(b).copied().collect(), + _ => self.clone(), + } } } @@ -120,7 +164,7 @@ pub struct NodeState<'graph, V, G, GH = G> { base_graph: G, graph: GH, values: Arc<[V]>, - keys: Option>, + keys: Index, _marker: PhantomData<&'graph ()>, } @@ -207,19 +251,27 @@ impl<'graph, V, G: GraphViewOps<'graph>> NodeState<'graph, V, G> { /// /// # Arguments /// - `graph`: the graph view - /// - `values`: the unfiltered values (i.e., `values.len() == graph.unfiltered_num_nodes()`). This method handles the filtering. + /// - `values`: the values indexed by flat position (i.e., `values.len() == index.len()`). pub fn new_from_eval(graph: G, values: Vec) -> Self where V: Clone, { let index = Index::for_graph(graph.clone()); - let values = match &index { - None => values, - Some(index) => index - .iter() - .map(|vid| values[vid.index()].clone()) - .collect(), - }; + // Values are already in flat index order from TaskRunner + Self::new(graph.clone(), graph, values.into(), index) + } + + /// Construct a node state from an eval result + /// + /// # Arguments + /// - `graph`: the graph view + /// - `values`: the values indexed by flat position (i.e., `values.len() == index.len()`). + /// - `index`: the index mapping VID to flat position in values + pub fn new_from_eval_with_index(graph: G, values: Vec, index: Index) -> Self + where + V: Clone, + { + // Values are already in flat index order from TaskRunner Self::new(graph.clone(), graph, values.into(), index) } @@ -227,23 +279,42 @@ impl<'graph, V, G: GraphViewOps<'graph>> NodeState<'graph, V, G> { /// /// # Arguments /// - `graph`: the graph view - /// - `values`: the unfiltered values (i.e., `values.len() == graph.unfiltered_num_nodes()`). This method handles the filtering. + /// - `values`: the values indexed by flat position (i.e., `values.len() == index.len()`). + /// - `map`: Closure mapping input to output values + pub fn new_from_eval_mapped_with_index( + graph: G, + values: Vec, + index: Index, + map: impl Fn(R) -> V, + ) -> Self + where + V: std::fmt::Debug, + { + // Values are already in flat index order from TaskRunner, just map them + let values = values.into_iter().map(map).collect(); + Self::new(graph.clone(), graph, values, index) + } + + /// Construct a node state from an eval result, mapping values + /// + /// # Arguments + /// - `graph`: the graph view + /// - `values`: the values indexed by flat position (i.e., `values.len() == index.len()`). /// - `map`: Closure mapping input to output values - pub fn new_from_eval_mapped(graph: G, values: Vec, map: impl Fn(R) -> V) -> Self { + pub fn new_from_eval_mapped(graph: G, values: Vec, map: impl Fn(R) -> V) -> Self + where + V: std::fmt::Debug, + { let index = Index::for_graph(graph.clone()); - let values = match &index { - None => values.into_iter().map(map).collect(), - Some(index) => index - .iter() - .map(|vid| map(values[vid.index()].clone())) - .collect(), - }; + // Values are already in flat index order from TaskRunner, just map them + let values = values.into_iter().map(map).collect(); Self::new(graph.clone(), graph, values, index) } /// create a new empty NodeState pub fn new_empty(graph: G) -> Self { - Self::new(graph.clone(), graph, [].into(), Some(Index::default())) + let index = Index::for_graph(&graph); + Self::new(graph.clone(), graph, [].into(), index) } /// create a new NodeState from a list of values for the node (takes care of creating an index for @@ -272,13 +343,18 @@ impl<'graph, V, G: GraphViewOps<'graph>> NodeState<'graph, V, G> { .iter() .flat_map(|node| Some((node.node, map(values.remove(&node.node)?)))) .unzip(); - Self::new(graph.clone(), graph, values.into(), Some(Index::new(index))) + Self::new( + graph.clone(), + graph, + values.into(), + Index::Partial(index.into()), + ) } } } impl<'graph, V, G: GraphViewOps<'graph>, GH: GraphViewOps<'graph>> NodeState<'graph, V, G, GH> { - pub fn new(base_graph: G, graph: GH, values: Arc<[V]>, keys: Option>) -> Self { + pub fn new(base_graph: G, graph: GH, values: Arc<[V]>, keys: Index) -> Self { Self { base_graph, graph, @@ -288,10 +364,6 @@ impl<'graph, V, G: GraphViewOps<'graph>, GH: GraphViewOps<'graph>> NodeState<'gr } } - pub fn into_inner(self) -> (Arc<[V]>, Option>) { - (self.values, self.keys) - } - pub fn values(&self) -> &Arc<[V]> { &self.values } @@ -374,29 +446,12 @@ impl< where 'graph: 'a, { - match &self.keys { - Some(index) => index - .iter() - .zip(self.values.iter()) - .map(|(n, v)| { - ( - NodeView::new_one_hop_filtered(&self.base_graph, &self.graph, n), - v, - ) - }) - .into_dyn_boxed(), - None => self - .values - .iter() - .enumerate() - .map(|(i, v)| { - ( - NodeView::new_one_hop_filtered(&self.base_graph, &self.graph, VID(i)), - v, - ) - }) - .into_dyn_boxed(), - } + self.keys.iter().zip(self.values.iter()).map(move |(n, v)| { + ( + NodeView::new_one_hop_filtered(&self.base_graph, &self.graph, n), + v, + ) + }) } fn nodes(&self) -> Nodes<'graph, Self::BaseGraph, Self::Graph> { @@ -423,53 +478,17 @@ impl< where 'graph: 'a, { - match &self.keys { - Some(index) => { - Either::Left(index.par_iter().zip(self.values.par_iter()).map(|(n, v)| { - ( - NodeView::new_one_hop_filtered(&self.base_graph, &self.graph, n), - v, - ) - })) - } - None => Either::Right(self.values.par_iter().enumerate().map(|(i, v)| { - ( - NodeView::new_one_hop_filtered(&self.base_graph, &self.graph, VID(i)), - v, - ) - })), - } - } - - fn get_by_index( - &self, - index: usize, - ) -> Option<( - NodeView<'_, &Self::BaseGraph, &Self::Graph>, - Self::Value<'_>, - )> { - match &self.keys { - Some(node_index) => node_index.key(index).map(|n| { - ( - NodeView::new_one_hop_filtered(&self.base_graph, &self.graph, n), - &self.values[index], - ) - }), - None => self.values.get(index).map(|v| { - ( - NodeView::new_one_hop_filtered(&self.base_graph, &self.graph, VID(index)), - v, - ) - }), - } + self.keys.par_iter().map(move |(val_id, n)| { + ( + NodeView::new_one_hop_filtered(&self.base_graph, &self.graph, n), + &self.values[val_id], + ) + }) } fn get_by_node(&self, node: N) -> Option> { let id = self.graph.internalise_node(node.as_node_ref())?; - match &self.keys { - Some(index) => index.index(&id).map(|i| &self.values[i]), - None => Some(&self.values[id.0]), - } + self.keys.index(&id).map(|i| &self.values[i]) } fn len(&self) -> usize { @@ -479,6 +498,8 @@ impl< #[cfg(test)] mod test { + use raphtory_storage::core_ops::CoreGraphOps; + use crate::{ db::api::state::{node_state::NodeState, AsOrderedNodeStateOps, OrderedNodeStateOps}, prelude::*, @@ -488,21 +509,8 @@ mod test { fn float_state() { let g = Graph::new(); g.add_node(0, 0, NO_PROPS, None).unwrap(); - let float_state = NodeState { - base_graph: g.clone(), - graph: g.clone(), - values: [0.0f64].into(), - keys: None, - _marker: Default::default(), - }; - - let int_state = NodeState { - base_graph: g.clone(), - graph: g.clone(), - values: [1i64].into(), - keys: None, - _marker: Default::default(), - }; + let float_state = NodeState::new_from_values(g.clone(), [0.0f64]); + let int_state = NodeState::new_from_values(g.clone(), [1i64]); let min_float = float_state.min_item().unwrap().1; let min_int = int_state.min_item().unwrap().1; assert_eq!(min_float, &0.0); diff --git a/raphtory/src/db/api/state/node_state_ops.rs b/raphtory/src/db/api/state/node_state_ops.rs index efe2c7a865..0004284937 100644 --- a/raphtory/src/db/api/state/node_state_ops.rs +++ b/raphtory/src/db/api/state/node_state_ops.rs @@ -6,6 +6,7 @@ use crate::{ }, prelude::{GraphViewOps, NodeViewOps}, }; +use either::Either; use indexmap::IndexSet; use num_traits::AsPrimitive; use rayon::prelude::*; @@ -69,14 +70,6 @@ pub trait NodeStateOps<'graph>: where 'graph: 'a; - fn get_by_index( - &self, - index: usize, - ) -> Option<( - NodeView<'_, &Self::BaseGraph, &Self::Graph>, - Self::Value<'_>, - )>; - fn get_by_node(&self, node: N) -> Option>; fn len(&self) -> usize; @@ -111,7 +104,7 @@ pub trait NodeStateOps<'graph>: self.base_graph().clone(), self.graph().clone(), values.into(), - Some(Index::new(keys)), + Index::Partial(keys.into()), ) } @@ -171,7 +164,7 @@ pub trait NodeStateOps<'graph>: self.base_graph().clone(), self.graph().clone(), values.into(), - Some(Index::new(keys)), + Index::Partial(keys.into()), ) } diff --git a/raphtory/src/db/api/view/graph.rs b/raphtory/src/db/api/view/graph.rs index f8fc5d3387..2dc3145c99 100644 --- a/raphtory/src/db/api/view/graph.rs +++ b/raphtory/src/db/api/view/graph.rs @@ -8,6 +8,7 @@ use crate::{ db::{ api::{ properties::{internal::InternalMetadataOps, Metadata, Properties}, + state::Index, view::{internal::*, *}, }, graph::{ @@ -325,25 +326,43 @@ impl<'graph, G: GraphView + 'graph> GraphViewOps<'graph> for G { .storage() .set_event_id(storage.read_event_id()); - let graph_storage = GraphStorage::from(temporal_graph); + let temporal_graph = Arc::new(temporal_graph); + + let graph_storage = GraphStorage::from(temporal_graph.clone()); { // scope for the write lock - let mut new_storage = graph_storage.write_lock()?; - new_storage.resize_chunks_to_num_nodes(self.count_nodes()); - for layer_id in &layer_map { - new_storage.nodes.ensure_layer(*layer_id); - } let mut node_map = vec![VID::default(); storage.unfiltered_num_nodes()]; let node_map_shared = atomic_usize_from_mut_slice(bytemuck::cast_slice_mut(&mut node_map)); + // reverse index pos -> new_vid + let index = Index::for_graph(self); + self.nodes().par_iter().for_each(|node| { + let vid = node.node; + if let Some(pos) = index.index(&vid) { + let new_vid = temporal_graph.storage().nodes().reserve_vid(pos); + node_map_shared[pos].store(new_vid.index(), Ordering::Relaxed); + } + }); + + let get_new_vid = |old_vid: VID, index: &Index, node_map: &[VID]| -> VID { + let pos = index + .index(&old_vid) + .expect("old_vid should exist in index"); + node_map[pos] + }; + let mut new_storage = graph_storage.write_lock()?; + + for layer_id in &layer_map { + new_storage.nodes.ensure_layer(*layer_id); + } + new_storage.nodes.par_iter_mut().try_for_each(|shard| { - for (index, node) in self.nodes().iter().enumerate() { - let new_id = VID(index); + for node in self.nodes().iter() { + let new_id = get_new_vid(node.node, &index, &node_map); let gid = node.id(); - node_map_shared[node.node.index()].store(new_id.index(), Ordering::Relaxed); if let Some(node_pos) = shard.resolve_pos(new_id) { let mut writer = shard.writer(); if let Some(node_type) = node.node_type() { @@ -360,7 +379,7 @@ impl<'graph, G: GraphView + 'graph> GraphViewOps<'graph> for G { 0, ); } else { - writer.store_node_id(node_pos, 0, gid.as_ref(), 0); + writer.store_node_id(node_pos, 0, gid.clone().into(), 0); } graph_storage .write_session()? @@ -382,17 +401,26 @@ impl<'graph, G: GraphView + 'graph> GraphViewOps<'graph> for G { Ok::<(), MutationError>(()) })?; - new_storage.resize_chunks_to_num_edges(self.count_edges()); + let mut new_eids = vec![]; + let mut max_eid = 0usize; + for (row, _) in self.edges().iter().enumerate() { + let new_eid = new_storage.graph().storage().edges().reserve_new_eid(row); + new_eids.push(new_eid); + max_eid = new_eid.0.max(max_eid); + } + new_storage.resize_chunks_to_num_edges(EID(max_eid)); for layer_id in &layer_map { new_storage.edges.ensure_layer(*layer_id); } + let edge_storage = new_storage.graph().storage().edges().clone(); + new_storage.edges.par_iter_mut().try_for_each(|shard| { - for (eid, edge) in self.edges().iter().enumerate() { - let src = node_map[edge.edge.src().index()]; - let dst = node_map[edge.edge.dst().index()]; - let eid = EID(eid); + for (row, edge) in self.edges().iter().enumerate() { + let src = get_new_vid(edge.edge.src(), &index, &node_map); + let dst = get_new_vid(edge.edge.dst(), &index, &node_map); + let eid = edge_storage.reserve_new_eid(row); if let Some(edge_pos) = shard.resolve_pos(eid) { let mut writer = shard.writer(); // make the edge for the first time @@ -453,8 +481,8 @@ impl<'graph, G: GraphView + 'graph> GraphViewOps<'graph> for G { new_storage.nodes.par_iter_mut().try_for_each(|shard| { for (eid, edge) in self.edges().iter().enumerate() { let eid = EID(eid); - let src_id = node_map[edge.edge.src().index()]; - let dst_id = node_map[edge.edge.dst().index()]; + let src_id = get_new_vid(edge.edge.src(), &index, &node_map); + let dst_id = get_new_vid(edge.edge.dst(), &index, &node_map); let maybe_src_pos = shard.resolve_pos(src_id); let maybe_dst_pos = shard.resolve_pos(dst_id); @@ -614,7 +642,7 @@ impl<'graph, G: GraphView + 'graph> GraphViewOps<'graph> for G { self.get_layer_names_from_ids(self.layer_ids()) } - #[inline] + // #[inline] fn earliest_time(&self) -> Option { match self.filter_state() { FilterState::Neither => self.earliest_time_global(), diff --git a/raphtory/src/db/api/view/internal/list_ops.rs b/raphtory/src/db/api/view/internal/list_ops.rs index 41d63aee49..a3da49bbb0 100644 --- a/raphtory/src/db/api/view/internal/list_ops.rs +++ b/raphtory/src/db/api/view/internal/list_ops.rs @@ -2,6 +2,7 @@ use crate::{ core::entities::{EID, VID}, db::api::{state::Index, view::Base}, }; +use raphtory_storage::graph::graph::GraphStorage; use rayon::{iter::Either, prelude::*}; use std::hash::Hash; @@ -62,27 +63,6 @@ impl + From + Send + Sync> List { } } - pub fn par_iter(&self) -> impl IndexedParallelIterator + '_ { - match self { - List::All { len } => Either::Left((0..*len).into_par_iter().map(From::from)), - List::List { elems } => Either::Right(elems.par_iter()), - } - } - - pub fn into_par_iter(self) -> impl IndexedParallelIterator { - match self { - List::All { len } => Either::Left((0..len).into_par_iter().map(From::from)), - List::List { elems } => Either::Right(elems.into_par_iter()), - } - } - - pub fn iter(&self) -> impl Iterator + '_ { - match self { - List::All { len } => Either::Left((0..*len).map(From::from)), - List::List { elems } => Either::Right(elems.iter()), - } - } - pub fn len(&self) -> usize { match self { List::All { len } => *len, @@ -95,16 +75,24 @@ impl + From + Send + Sync> List { } } -impl + From + Send + Sync + 'static> IntoIterator - for List -{ - type Item = I; - type IntoIter = Box + Send + Sync>; +impl List { + pub fn nodes_iter(self, g: &GraphStorage) -> impl Iterator { + match self { + List::All { .. } => { + let sc = g.node_segment_counts(); + Either::Left(sc.into_iter()) + } + List::List { elems } => Either::Right(elems.into_iter()), + } + } - fn into_iter(self) -> Self::IntoIter { + pub fn nodes_par_iter(self, g: &GraphStorage) -> impl ParallelIterator { match self { - List::All { len } => Box::new((0..len).map(From::from)), - List::List { elems } => Box::new(elems.into_iter()), + List::All { .. } => { + let sc = g.node_segment_counts(); + Either::Left(sc.into_par_iter()) + } + List::List { elems } => Either::Right(elems.into_par_iter()), } } } diff --git a/raphtory/src/db/graph/nodes.rs b/raphtory/src/db/graph/nodes.rs index 0682300881..4585821e53 100644 --- a/raphtory/src/db/graph/nodes.rs +++ b/raphtory/src/db/graph/nodes.rs @@ -13,6 +13,7 @@ use crate::{ }, prelude::*, }; +use either::Either; use raphtory_storage::{ core_ops::is_view_compatible, graph::{graph::GraphStorage, nodes::node_storage_ops::NodeStorageOps}, @@ -30,7 +31,7 @@ use std::{ pub struct Nodes<'graph, G, GH = G> { pub(crate) base_graph: G, pub(crate) graph: GH, - pub(crate) nodes: Option>, + pub(crate) nodes: Index, pub(crate) node_types_filter: Option>, _marker: PhantomData<&'graph ()>, } @@ -113,10 +114,11 @@ where { pub fn new(graph: G) -> Self { let base_graph = graph.clone(); + let node_index = base_graph.core_graph().node_state_index(); Self { base_graph, graph, - nodes: None, + nodes: node_index.into(), node_types_filter: None, _marker: PhantomData, } @@ -148,7 +150,7 @@ where pub fn new_filtered( base_graph: G, graph: GH, - nodes: Option>, + nodes: Index, node_types_filter: Option>, ) -> Self { Self { @@ -162,8 +164,8 @@ where pub fn node_list(&self) -> NodeList { match self.nodes.clone() { - None => self.graph.node_list(), - Some(elems) => NodeList::List { elems }, + elems @ Index::Partial(_) => NodeList::List { elems }, + _ => self.graph.node_list(), } } @@ -171,7 +173,7 @@ where let g = self.graph.core_graph().lock(); let view = self.graph.clone(); let node_types_filter = self.node_types_filter.clone(); - self.node_list().into_par_iter().filter(move |&vid| { + self.node_list().nodes_par_iter(&g).filter(move |&vid| { g.try_core_node(vid).is_some_and(|node| { node_types_filter .as_ref() @@ -185,7 +187,7 @@ where Nodes::new_filtered( self.base_graph.clone(), self.graph.clone(), - Some(index), + index, self.node_types_filter.clone(), ) } @@ -199,7 +201,7 @@ where fn iter_vids(&self, g: GraphStorage) -> impl Iterator + Send + Sync + 'graph { let node_types_filter = self.node_types_filter.clone(); let view = self.graph.clone(); - self.node_list().into_iter().filter(move |&vid| { + self.node_list().nodes_iter(&g).filter(move |&vid| { g.try_core_node(vid).is_some_and(|node| { node_types_filter .as_ref() @@ -259,15 +261,15 @@ where /// Returns the number of nodes in the graph. #[inline] pub fn len(&self) -> usize { - match self.nodes.as_ref() { - None => { + match &self.nodes { + Index::Full(_) => { if self.is_list_filtered() { self.par_iter_refs().count() } else { self.graph.node_list().len() } } - Some(nodes) => { + Index::Partial(nodes) => { if self.is_filtered() { self.par_iter_refs().count() } else { @@ -346,11 +348,7 @@ where .as_ref() .map(|filter| filter[node.node_type_id()]) .unwrap_or(true) - && self - .nodes - .as_ref() - .map(|nodes| nodes.contains(&node.node)) - .unwrap_or(true) + && self.nodes.contains(&node.node) }) .is_some() } diff --git a/raphtory/src/db/task/edge/eval_edge.rs b/raphtory/src/db/task/edge/eval_edge.rs index bf2629af81..1e64bfb09d 100644 --- a/raphtory/src/db/task/edge/eval_edge.rs +++ b/raphtory/src/db/task/edge/eval_edge.rs @@ -6,6 +6,7 @@ use crate::{ db::{ api::{ properties::Properties, + state::Index, view::{internal::OneHopFilter, *}, }, graph::edge::EdgeView, @@ -26,6 +27,7 @@ pub struct EvalEdgeView<'graph, 'a, G, GH, CS: Clone, S> { pub(crate) ss: usize, pub(crate) edge: EdgeView<&'graph G, GH>, pub(crate) storage: &'graph GraphStorage, + pub(crate) index: &'graph Index, pub(crate) node_state: Rc>>, pub(crate) local_state_prev: &'graph PrevLocalState<'a, S>, } @@ -43,6 +45,7 @@ impl< ss: usize, edge: EdgeView<&'graph G, GH>, storage: &'graph GraphStorage, + index: &'graph Index, node_state: Rc>>, local_state_prev: &'graph PrevLocalState<'a, S>, ) -> Self { @@ -50,6 +53,7 @@ impl< ss, edge, storage, + index, node_state, local_state_prev, } @@ -117,9 +121,15 @@ impl< storage, local_state_prev, node_state, + index: self.index, }; + let state_pos = self + .index + .index(&node.node) + .unwrap_or_else(|| panic!("Internal Error, node {:?} needs to be in index", node.node)); EvalNodeView { node: node.node, + state_pos, graph: node.base_graph, eval_graph, local_state: None, @@ -138,10 +148,12 @@ impl< let node_state = self.node_state.clone(); let local_state_prev = self.local_state_prev; let storage = self.storage; + let index = self.index; EvalEdges { ss, edges, storage, + index, node_state, local_state_prev, } @@ -162,6 +174,7 @@ impl< ss: self.ss, edge: self.edge.clone(), storage: self.storage, + index: self.index, node_state: self.node_state.clone(), local_state_prev: self.local_state_prev, } @@ -198,6 +211,7 @@ impl< self.ss, edge, self.storage, + self.index, self.node_state.clone(), self.local_state_prev, ) diff --git a/raphtory/src/db/task/edge/eval_edges.rs b/raphtory/src/db/task/edge/eval_edges.rs index 1addb8798a..5f22847b1f 100644 --- a/raphtory/src/db/task/edge/eval_edges.rs +++ b/raphtory/src/db/task/edge/eval_edges.rs @@ -6,6 +6,7 @@ use crate::{ db::{ api::{ properties::{Metadata, Properties}, + state::Index, view::{internal::OneHopFilter, BaseEdgeViewOps, BoxedLIter}, }, graph::edges::Edges, @@ -25,6 +26,7 @@ pub struct EvalEdges<'graph, 'a, G, GH, CS: Clone, S> { pub(crate) ss: usize, pub(crate) edges: Edges<'graph, &'graph G, GH>, pub(crate) storage: &'graph GraphStorage, + pub(crate) index: &'graph Index, pub(crate) node_state: Rc>>, pub(crate) local_state_prev: &'graph PrevLocalState<'a, S>, } @@ -37,6 +39,7 @@ impl<'graph, 'a: 'graph, G: GraphViewOps<'graph>, GH: GraphViewOps<'graph>, CS: ss: self.ss, edges: self.edges.clone(), storage: self.storage, + index: self.index, node_state: self.node_state.clone(), local_state_prev: self.local_state_prev, } @@ -67,10 +70,12 @@ impl<'graph, 'a: 'graph, G: GraphViewOps<'graph>, GH: GraphViewOps<'graph>, CS: let node_state = self.node_state.clone(); let local_state_prev = self.local_state_prev; let storage = self.storage; + let index = self.index; EvalEdges { ss, edges, storage, + index, node_state, local_state_prev, } @@ -91,6 +96,7 @@ impl< let ss = self.ss; let local_state_prev = self.local_state_prev; let storage = self.storage; + let index = self.index; self.edges .clone() .into_iter() @@ -98,6 +104,7 @@ impl< ss, edge, storage, + index, node_state: node_state.clone(), local_state_prev, }) @@ -121,10 +128,12 @@ impl< let ss = self.ss; let local_state_prev = self.local_state_prev; let storage = self.storage; + let index = self.index; Box::new(self.edges.into_iter().map(move |edge| EvalEdgeView { ss, edge, storage, + index, node_state: node_state.clone(), local_state_prev, })) @@ -186,10 +195,12 @@ impl< let path = self.edges.map_nodes(op); let base_graph = self.edges.base_graph; let storage = self.storage; + let index = self.index; let eval_graph = EvalGraph { ss, base_graph, storage, + index, local_state_prev, node_state, }; @@ -212,9 +223,11 @@ impl< let local_state_prev = self.local_state_prev; let edges = self.edges.map_exploded(op); let storage = self.storage; + let index = self.index; Self { ss, storage, + index, node_state, local_state_prev, edges, diff --git a/raphtory/src/db/task/eval_graph.rs b/raphtory/src/db/task/eval_graph.rs index fa4742910d..8b6d0ac071 100644 --- a/raphtory/src/db/task/eval_graph.rs +++ b/raphtory/src/db/task/eval_graph.rs @@ -3,13 +3,17 @@ use crate::{ entities::nodes::node_ref::AsNodeRef, state::compute_state::{ComputeState, ComputeStateVec}, }, - db::task::{ - edge::eval_edge::EvalEdgeView, - node::{eval_node::EvalNodeView, eval_node_state::EVState}, - task_state::PrevLocalState, + db::{ + api::state::Index, + task::{ + edge::eval_edge::EvalEdgeView, + node::{eval_node::EvalNodeView, eval_node_state::EVState}, + task_state::PrevLocalState, + }, }, prelude::GraphViewOps, }; +use raphtory_core::entities::VID; use raphtory_storage::graph::graph::GraphStorage; use std::{cell::RefCell, rc::Rc}; @@ -20,6 +24,7 @@ pub struct EvalGraph<'graph, 'a, G, S, CS: Clone = ComputeStateVec> { pub(crate) storage: &'graph GraphStorage, pub(crate) local_state_prev: &'graph PrevLocalState<'a, S>, pub(crate) node_state: Rc>>, + pub(crate) index: &'graph Index, } impl<'graph, 'a, G, S, CS: Clone> Clone for EvalGraph<'graph, 'a, G, S, CS> { @@ -30,6 +35,7 @@ impl<'graph, 'a, G, S, CS: Clone> Clone for EvalGraph<'graph, 'a, G, S, CS> { storage: self.storage, local_state_prev: self.local_state_prev, node_state: self.node_state.clone(), + index: self.index, } } } @@ -39,7 +45,15 @@ impl<'graph, 'a: 'graph, G: GraphViewOps<'graph>, S: 'static, CS: ComputeState + { pub fn node(&self, n: impl AsNodeRef) -> Option> { let node = (&self.base_graph).node(n)?; - Some(EvalNodeView::new_local(node.node, self.clone(), None)) + let state_pos = self.index.index(&node.node).unwrap_or_else(|| { + panic!("Internal Error, node {:?} needs to be in index", node.node); + }); + Some(EvalNodeView::new_local( + node.node, + state_pos, + self.clone(), + None, + )) } pub fn edge( @@ -52,6 +66,7 @@ impl<'graph, 'a: 'graph, G: GraphViewOps<'graph>, S: 'static, CS: ComputeState + self.ss, edge, self.storage, + self.index, self.node_state.clone(), self.local_state_prev, )) diff --git a/raphtory/src/db/task/mod.rs b/raphtory/src/db/task/mod.rs index 141ef726e9..025a1544ae 100644 --- a/raphtory/src/db/task/mod.rs +++ b/raphtory/src/db/task/mod.rs @@ -89,7 +89,7 @@ mod task_tests { vec![], vec![Job::new(step1)], None, - |egs, _, _, _| egs.finalize(&count), + |egs, _, _, _, _| egs.finalize(&count), Some(2), 1, None, diff --git a/raphtory/src/db/task/node/eval_node.rs b/raphtory/src/db/task/node/eval_node.rs index f5830ab1c2..e516cdc597 100644 --- a/raphtory/src/db/task/node/eval_node.rs +++ b/raphtory/src/db/task/node/eval_node.rs @@ -28,6 +28,7 @@ use std::{ pub struct EvalNodeView<'graph, 'a: 'graph, G, S, GH = &'graph G, CS: Clone = ComputeStateVec> { pub node: VID, + pub(crate) state_pos: usize, pub(crate) eval_graph: EvalGraph<'graph, 'a, G, S, CS>, pub(crate) graph: GH, pub(crate) local_state: Option<&'graph mut S>, @@ -38,12 +39,14 @@ impl<'graph, 'a: 'graph, G: GraphViewOps<'graph>, CS: ComputeState + 'a, S> { pub(crate) fn new_local( node: VID, + state_pos: usize, eval_graph: EvalGraph<'graph, 'a, G, S, CS>, local_state: Option<&'graph mut S>, ) -> Self { let graph = eval_graph.base_graph; Self { node, + state_pos, eval_graph, graph, local_state, @@ -63,6 +66,7 @@ impl< fn clone(&self) -> Self { Self { node: self.node, + state_pos: self.state_pos, eval_graph: self.eval_graph.clone(), graph: self.graph.clone(), local_state: None, @@ -83,8 +87,7 @@ impl< self.eval_graph.clone() } pub fn prev(&self) -> &S { - let VID(i) = self.node; - &self.eval_graph.local_state_prev.state[i] + &self.eval_graph.local_state_prev.state[self.state_pos] } pub fn get_mut(&mut self) -> &mut S { @@ -103,23 +106,20 @@ impl< pub(crate) fn new_filtered( node: VID, + state_pos: usize, eval_graph: EvalGraph<'graph, 'a, G, S, CS>, graph: GH, local_state: Option<&'graph mut S>, ) -> Self { Self { node, + state_pos, eval_graph, graph, local_state, } } - fn pid(&self) -> usize { - let VID(i) = self.node; - i - } - fn node_state(&self) -> Ref<'_, EVState<'a, CS>> { RefCell::borrow(&self.eval_graph.node_state) } @@ -133,9 +133,12 @@ impl< id: &AccId, a: IN, ) { - self.node_state_mut() - .shard_mut() - .accumulate_into(self.eval_graph.ss, self.pid(), a, id); + self.node_state_mut().shard_mut().accumulate_into( + self.eval_graph.ss, + self.state_pos, + a, + id, + ); } pub fn global_update>( @@ -190,7 +193,7 @@ impl< { self.node_state() .shard() - .read_with_pid(self.eval_graph.ss, self.pid(), agg_r) + .read_with_pid(self.eval_graph.ss, self.state_pos, agg_r) .unwrap_or(ACC::finish(&ACC::zero())) } @@ -204,7 +207,12 @@ impl< A: StateType, OUT: std::fmt::Debug, { - Entry::new(self.node_state(), *agg_r, &self.node, self.eval_graph.ss) + Entry::new( + self.node_state(), + *agg_r, + self.state_pos, + self.eval_graph.ss, + ) } /// Read the prev value of the node state using the given accumulator. @@ -219,7 +227,7 @@ impl< { self.node_state() .shard() - .read_with_pid(self.eval_graph.ss + 1, self.pid(), agg_r) + .read_with_pid(self.eval_graph.ss + 1, self.state_pos, agg_r) .unwrap_or(ACC::finish(&ACC::zero())) } @@ -267,8 +275,11 @@ impl< pub fn iter(&self) -> impl Iterator> + 'graph { let base_graph = self.base_graph.clone(); let graph = self.graph.clone(); - self.iter_refs() - .map(move |v| EvalNodeView::new_filtered(v, base_graph.clone(), graph.clone(), None)) + let index = self.base_graph.index; + self.iter_refs().map(move |v| { + let state_pos = index.index(&v).expect("VID not found in index"); + EvalNodeView::new_filtered(v, state_pos, base_graph.clone(), graph.clone(), None) + }) } pub fn type_filter, V: AsRef>(&self, node_types: I) -> Self { @@ -374,6 +385,7 @@ impl< self.graph.clone(), self.op.clone(), ); + let index = self.base_graph.index; let edges = path.map_edges(op); EvalEdges { ss, @@ -381,6 +393,7 @@ impl< node_state, local_state_prev, storage, + index, } } @@ -470,7 +483,7 @@ impl< filtered_graph: GHH, ) -> Self::Filtered { let eval_graph = self.eval_graph.clone(); - EvalNodeView::new_filtered(self.node, eval_graph, filtered_graph, None) + EvalNodeView::new_filtered(self.node, self.state_pos, eval_graph, filtered_graph, None) } } @@ -523,12 +536,14 @@ impl< graph: self.graph.clone(), edges, }; + let index = self.eval_graph.index; EvalEdges { ss, edges, node_state, local_state_prev, storage, + index, } } @@ -560,7 +575,7 @@ impl< pub struct Entry<'a, 'b, A: StateType, IN, OUT, ACC: Accumulator, CS: ComputeState> { state: Ref<'a, EVState<'b, CS>>, acc_id: AccId, - v_ref: &'a VID, + state_pos: usize, ss: usize, } @@ -579,13 +594,13 @@ impl<'a, 'b, A: StateType, IN, OUT, ACC: Accumulator, CS: ComputeSta pub(crate) fn new( state: Ref<'a, EVState<'b, CS>>, acc_id: AccId, - v_ref: &'a VID, + state_pos: usize, ss: usize, ) -> Entry<'a, 'b, A, IN, OUT, ACC, CS> { Entry { state, acc_id, - v_ref, + state_pos, ss, } } @@ -594,6 +609,6 @@ impl<'a, 'b, A: StateType, IN, OUT, ACC: Accumulator, CS: ComputeSta pub fn read_ref(&self) -> Option<&A> { self.state .shard() - .read_ref(self.ss, (*self.v_ref).into(), &self.acc_id) + .read_ref(self.ss, self.state_pos, &self.acc_id) } } diff --git a/raphtory/src/db/task/task_runner.rs b/raphtory/src/db/task/task_runner.rs index 90cec1a44a..e6c905937a 100644 --- a/raphtory/src/db/task/task_runner.rs +++ b/raphtory/src/db/task/task_runner.rs @@ -14,7 +14,7 @@ use crate::{ }, }, db::{ - api::view::StaticGraphViewOps, + api::{state::Index, view::StaticGraphViewOps}, task::{ eval_graph::EvalGraph, node::{eval_node::EvalNodeView, eval_node_state::EVState}, @@ -22,6 +22,7 @@ use crate::{ }, prelude::GraphViewOps, }; +use raphtory_api::atomic_extra::atomic_vid_from_mut_slice; use raphtory_storage::graph::graph::GraphStorage; use rayon::{prelude::*, ThreadPool}; use std::{ @@ -55,7 +56,9 @@ impl TaskRunner { global_state: &Global, morcel: &mut [S], prev_local_state: &Vec, + reverse_vids: &Vec, storage: &GraphStorage, + index: &Index, atomic_done: &AtomicBool, morcel_size: usize, morcel_id: usize, @@ -72,23 +75,25 @@ impl TaskRunner { let mut v_ref = morcel_id * morcel_size; for local_state in morcel { - if g.has_node(VID(v_ref)) { - let eval_graph = EvalGraph { - ss: self.ctx.ss(), - base_graph: &g, - storage, - local_state_prev: &local, - node_state: node_state.clone(), - }; - let mut vv = EvalNodeView::new_local(v_ref.into(), eval_graph, Some(local_state)); + let node = reverse_vids[v_ref]; + // if g.has_node(VID(v_ref)) { + let eval_graph = EvalGraph { + ss: self.ctx.ss(), + base_graph: &g, + storage, + index, + local_state_prev: &local, + node_state: node_state.clone(), + }; + let mut vv = EvalNodeView::new_local(node, v_ref, eval_graph, Some(local_state)); - match task.run(&mut vv) { - Step::Continue => { - done = false; - } - Step::Done => {} + match task.run(&mut vv) { + Step::Continue => { + done = false; } + Step::Done => {} } + // } v_ref += 1; } @@ -128,7 +133,9 @@ impl TaskRunner { global_state: Global, mut local_state: Vec, prev_local_state: &Vec, + reverse_vids: &Vec, storage: &GraphStorage, + index: &Index, ) -> (bool, Shard, Global, Vec) { pool.install(move || { let mut new_shard_state = shard_state; @@ -149,7 +156,9 @@ impl TaskRunner { &new_global_state, morcel, prev_local_state, + reverse_vids, storage, + index, &atomic_done, morcel_size, morcel_id, @@ -167,7 +176,9 @@ impl TaskRunner { &new_global_state, morcel, prev_local_state, + reverse_vids, storage, + index, &atomic_done, morcel_size, morcel_id, @@ -202,16 +213,25 @@ impl TaskRunner { }) } - fn make_cur_and_prev_states(&self, mut init: Vec) -> (Vec, Vec) { - let g = self.ctx.graph(); - init.resize(g.unfiltered_num_nodes(), S::default()); + fn make_cur_and_prev_states( + &self, + mut init: Vec, + num_nodes: usize, + ) -> (Vec, Vec) { + init.resize(num_nodes, S::default()); (init.clone(), init) } pub fn run< B, - F: FnOnce(GlobalState, EvalShardState, EvalLocalState, Vec) -> B, + F: FnOnce( + GlobalState, + EvalShardState, + EvalLocalState, + Vec, + Index, + ) -> B, S: Send + Sync + Clone + 'static + std::fmt::Debug + Default, >( &mut self, @@ -226,8 +246,9 @@ impl TaskRunner { ) -> B { let pool = num_threads.map(custom_pool).unwrap_or_else(|| POOL.clone()); - let num_nodes = self.ctx.graph().unfiltered_num_nodes(); let graph = self.ctx.graph(); + let node_index = Index::for_graph(graph.clone()); + let num_nodes = node_index.len(); let storage = graph.core_graph(); let morcel_size = num_nodes.min(16_000); let num_chunks = if morcel_size == 0 { @@ -236,16 +257,48 @@ impl TaskRunner { (num_nodes + morcel_size - 1) / morcel_size }; + let index = Index::for_graph(graph.clone()); + + println!("DEBUG TaskRunner::run:"); + println!( + " graph.unfiltered_num_nodes() = {}", + graph.unfiltered_num_nodes() + ); + println!(" node_index.len() = {}", num_nodes); + println!(" morcel_size = {}", morcel_size); + println!(" num_chunks = {}", num_chunks); + println!( + " index variant = {:?}", + match &index { + Index::Full(_) => "Full", + Index::Partial(_) => "Partial", + } + ); + let mut shard_state = shard_initial_state.unwrap_or_else(|| Shard::new(num_nodes, num_chunks, morcel_size)); let mut global_state = global_initial_state.unwrap_or_else(|| Global::new()); let (mut cur_local_state, mut prev_local_state) = - self.make_cur_and_prev_states::(init.unwrap_or_default()); + self.make_cur_and_prev_states::(init.unwrap_or_default(), num_nodes); let mut _done = false; + let mut reverse_vids = vec![VID(0); node_index.len()]; + { + let atom_vids = atomic_vid_from_mut_slice(&mut reverse_vids); + + node_index.par_iter().for_each(|(i, vid)| { + atom_vids[i].store(vid.0, Ordering::Relaxed); + }); + } + + println!(" reverse_vids mapping (flat_idx -> VID):"); + for (flat_idx, vid) in reverse_vids.iter().enumerate() { + println!(" {} -> {}", flat_idx, vid.0); + } + (_done, shard_state, global_state, cur_local_state) = self.run_task_list( &init_tasks, &pool, @@ -254,7 +307,9 @@ impl TaskRunner { global_state, cur_local_state, &prev_local_state, + &reverse_vids, storage, + &index, ); // To allow the init step to cache stuff we will copy everything from cur_local_state to prev_local_state @@ -269,7 +324,9 @@ impl TaskRunner { global_state, cur_local_state, &prev_local_state, + &reverse_vids, storage, + &index, ); // copy and reset the state from the step that just ended @@ -295,6 +352,7 @@ impl TaskRunner { EvalShardState::new(ss, self.ctx.graph(), shard_state), EvalLocalState::new(ss, self.ctx.graph(), vec![]), last_local_state, + index, ); self.ctx.reset_ss(); to_return diff --git a/raphtory/src/db/task/task_state.rs b/raphtory/src/db/task/task_state.rs index 8c9d7654ca..c62051d2f8 100644 --- a/raphtory/src/db/task/task_state.rs +++ b/raphtory/src/db/task/task_state.rs @@ -1,4 +1,9 @@ -use crate::core::state::{compute_state::ComputeState, shuffle_state::ShuffleComputeState}; +use raphtory_core::entities::VID; + +use crate::{ + core::state::{compute_state::ComputeState, shuffle_state::ShuffleComputeState}, + db::api::state::Index, +}; use std::{borrow::Cow, sync::Arc}; // this only contains the global state and it is synchronized after each task run diff --git a/raphtory/src/errors.rs b/raphtory/src/errors.rs index f7d790b29b..1b6ef29ab2 100644 --- a/raphtory/src/errors.rs +++ b/raphtory/src/errors.rs @@ -19,9 +19,10 @@ use raphtory_core::{ use raphtory_storage::mutation::MutationError; use std::{ backtrace::Backtrace, + error::Error, fmt::Debug, - io, panic, - panic::Location, + io, + panic::{self, Location}, path::{PathBuf, StripPrefixError}, sync::Arc, time::SystemTimeError, @@ -91,8 +92,6 @@ pub enum LoadError { MissingEdgeError(VID, VID), #[error("Node IDs have the wrong type, expected {existing}, got {new}")] NodeIdTypeError { existing: GidType, new: GidType }, - #[error("Fatal load error, graph may be in a dirty state.")] - FatalError, #[error("Arrow error: {0:?}")] Arrow(#[from] ArrowError), } diff --git a/raphtory/src/io/arrow/dataframe.rs b/raphtory/src/io/arrow/dataframe.rs index 644b6f3d3f..68ee14ede1 100644 --- a/raphtory/src/io/arrow/dataframe.rs +++ b/raphtory/src/io/arrow/dataframe.rs @@ -47,12 +47,14 @@ where } pub(crate) fn get_index(&self, name: &str) -> Result { - self.names - .iter() - .position(|n| n == name) + self.get_index_opt(name) .ok_or_else(|| GraphError::ColumnDoesNotExist(name.to_string())) } + pub(crate) fn get_index_opt(&self, name: &str) -> Option { + self.names.iter().position(|n| n == name) + } + pub fn is_empty(&self) -> bool { self.num_rows == 0 } @@ -160,6 +162,13 @@ impl SecondaryIndexCol { pub fn max(&self) -> usize { self.iter().max().unwrap_or(0) } + + pub fn len(&self) -> usize { + match self { + SecondaryIndexCol::DataFrame(arr) => arr.len(), + SecondaryIndexCol::Range(range) => range.len(), + } + } } #[derive(Clone, Debug)] @@ -176,6 +185,10 @@ impl DFChunk { self.chunk.first().map(|c| c.len()).unwrap_or(0) } + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + pub fn node_col(&self, index: usize) -> Result { lift_node_col(index, self) } diff --git a/raphtory/src/io/arrow/df_loaders.rs b/raphtory/src/io/arrow/df_loaders.rs deleted file mode 100644 index 7ddeb6cae1..0000000000 --- a/raphtory/src/io/arrow/df_loaders.rs +++ /dev/null @@ -1,1035 +0,0 @@ -use crate::{ - core::entities::nodes::node_ref::AsNodeRef, - db::api::view::StaticGraphViewOps, - errors::{into_graph_err, GraphError, LoadError}, - io::arrow::{ - dataframe::{DFChunk, DFView, SecondaryIndexCol}, - layer_col::{lift_layer_col, lift_node_type_col}, - prop_handler::*, - }, - prelude::*, -}; -use arrow::array::BooleanArray; -use bytemuck::checked::cast_slice_mut; -use db4_graph::WriteLockedGraph; -use either::Either; -use itertools::izip; -use kdam::{Bar, BarBuilder, BarExt}; -use raphtory_api::{ - atomic_extra::atomic_usize_from_mut_slice, - core::{ - entities::{ - properties::{meta::STATIC_GRAPH_LAYER_ID, prop::PropType}, - EID, - }, - storage::{dict_mapper::MaybeNew, timeindex::TimeIndexEntry}, - }, -}; -use raphtory_core::{ - entities::{graph::logical_to_physical::ResolverShardT, GidRef, VID}, - storage::timeindex::AsTime, -}; -use raphtory_storage::{ - core_ops::CoreGraphOps, - layer_ops::InternalLayerOps, - mutation::addition_ops::{InternalAdditionOps, SessionAdditionOps}, -}; -use rayon::prelude::*; -use std::{ - borrow::{Borrow, Cow}, - collections::HashMap, - sync::{ - atomic::{AtomicBool, AtomicUsize, Ordering}, - Arc, - }, -}; - -fn build_progress_bar(des: String, num_rows: usize) -> Result { - BarBuilder::default() - .desc(des) - .animation(kdam::Animation::FillUp) - .total(num_rows) - .unit_scale(true) - .build() - .map_err(|_| GraphError::TqdmError) -} - -fn process_shared_properties( - props: Option<&HashMap>, - resolver: impl Fn(&str, PropType) -> Result, GraphError>, -) -> Result, GraphError> { - match props { - None => Ok(vec![]), - Some(props) => props - .iter() - .map(|(key, prop)| Ok((resolver(key, prop.dtype())?.inner(), prop.clone()))) - .collect(), - } -} - -pub fn load_nodes_from_df< - G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps + std::fmt::Debug, ->( - df_view: DFView>>, - time: &str, - secondary_index: Option<&str>, - node_id: &str, - properties: &[&str], - metadata: &[&str], - shared_metadata: Option<&HashMap>, - node_type: Option<&str>, - node_type_col: Option<&str>, - graph: &G, -) -> Result<(), GraphError> { - if df_view.is_empty() { - return Ok(()); - } - let properties_indices = properties - .iter() - .map(|name| df_view.get_index(name)) - .collect::, GraphError>>()?; - let metadata_indices = metadata - .iter() - .map(|name| df_view.get_index(name)) - .collect::, GraphError>>()?; - - let node_type_index = - node_type_col.map(|node_type_col| df_view.get_index(node_type_col.as_ref())); - let node_type_index = node_type_index.transpose()?; - - let node_id_index = df_view.get_index(node_id)?; - let time_index = df_view.get_index(time)?; - let secondary_index_index = secondary_index - .map(|col| df_view.get_index(col)) - .transpose()?; - - let session = graph.write_session().map_err(into_graph_err)?; - let shared_metadata = process_shared_properties(shared_metadata, |key, dtype| { - session - .resolve_node_property(key, dtype, true) - .map_err(into_graph_err) - })?; - - #[cfg(feature = "python")] - let mut pb = build_progress_bar("Loading nodes".to_string(), df_view.num_rows)?; - - let mut node_col_resolved = vec![]; - let mut node_type_col_resolved = vec![]; - - let mut write_locked_graph = graph.write_lock().map_err(into_graph_err)?; - - for chunk in df_view.chunks { - let df = chunk?; - let prop_cols = - combine_properties_arrow(properties, &properties_indices, &df, |key, dtype| { - session - .resolve_node_property(key, dtype, false) - .map_err(into_graph_err) - })?; - let metadata_cols = - combine_properties_arrow(metadata, &metadata_indices, &df, |key, dtype| { - session - .resolve_node_property(key, dtype, true) - .map_err(into_graph_err) - })?; - let node_type_col = lift_node_type_col(node_type, node_type_index, &df)?; - - let time_col = df.time_col(time_index)?; - let node_col = df.node_col(node_id_index)?; - - // Load the secondary index column if it exists, otherwise generate from start_id. - let secondary_index_col = match secondary_index_index { - Some(col_index) => { - // Update the event_id to reflect ingesting new secondary indices. - let col = df.secondary_index_col(col_index)?; - session - .set_max_event_id(col.max()) - .map_err(into_graph_err)?; - col - } - None => { - let start_id = session - .reserve_event_ids(df.len()) - .map_err(into_graph_err)?; - SecondaryIndexCol::new_from_range(start_id, start_id + df.len()) - } - }; - - node_col_resolved.resize_with(df.len(), Default::default); - node_type_col_resolved.resize_with(df.len(), Default::default); - - // TODO: Using parallel iterators results in a 5x speedup, but - // needs to be implemented such that node VID order is preserved. - // See: https://github.com/Pometry/pometry-storage/issues/81 - for (gid, resolved, node_type, node_type_resolved) in izip!( - node_col.iter(), - node_col_resolved.iter_mut(), - node_type_col.iter(), - node_type_col_resolved.iter_mut() - ) { - let (vid, res_node_type) = write_locked_graph - .graph() - .resolve_node_and_type(gid.as_node_ref(), node_type) - .map_err(|_| LoadError::FatalError)?; - - *resolved = vid; - *node_type_resolved = res_node_type; - } - - let node_stats = write_locked_graph.node_stats().clone(); - let update_time = |time: TimeIndexEntry| { - let time = time.t(); - node_stats.update_time(time); - }; - - write_locked_graph - .resize_chunks_to_num_nodes(write_locked_graph.graph().internal_num_nodes()); - - write_locked_graph - .nodes - .par_iter_mut() - .try_for_each(|shard| { - // Zip all columns for iteration. - let zip = izip!( - node_col_resolved.iter(), - time_col.iter(), - secondary_index_col.iter(), - node_type_col_resolved.iter(), - node_col.iter() - ); - - for (row, (vid, time, secondary_index, node_type, gid)) in zip.enumerate() { - if let Some(mut_node) = shard.resolve_pos(*vid) { - let mut writer = shard.writer(); - let t = TimeIndexEntry(time, secondary_index); - let layer_id = STATIC_GRAPH_LAYER_ID; - let lsn = 0; - - update_time(t); - writer - .store_node_id_and_node_type(mut_node, layer_id, gid, *node_type, lsn); - - let t_props = prop_cols.iter_row(row); - let c_props = metadata_cols - .iter_row(row) - .chain(shared_metadata.iter().cloned()); - - writer.add_props(t, mut_node, layer_id, t_props, lsn); - writer.update_c_props(mut_node, layer_id, c_props, lsn); - }; - } - - Ok::<_, GraphError>(()) - })?; - - #[cfg(feature = "python")] - let _ = pb.update(df.len()); - } - - Ok(()) -} - -pub fn load_edges_from_df( - df_view: DFView>>, - time: &str, - secondary_index: Option<&str>, - src: &str, - dst: &str, - properties: &[&str], - metadata: &[&str], - shared_metadata: Option<&HashMap>, - layer: Option<&str>, - layer_col: Option<&str>, - graph: &G, -) -> Result<(), GraphError> { - if df_view.is_empty() { - return Ok(()); - } - - let properties_indices = properties - .iter() - .map(|name| df_view.get_index(name)) - .collect::, GraphError>>()?; - let metadata_indices = metadata - .iter() - .map(|name| df_view.get_index(name)) - .collect::, GraphError>>()?; - - let src_index = df_view.get_index(src)?; - let dst_index = df_view.get_index(dst)?; - let time_index = df_view.get_index(time)?; - let secondary_index_index = secondary_index - .map(|col| df_view.get_index(col)) - .transpose()?; - let layer_index = if let Some(layer_col) = layer_col { - Some(df_view.get_index(layer_col.as_ref())?) - } else { - None - }; - let session = graph.write_session().map_err(into_graph_err)?; - let shared_metadata = process_shared_properties(shared_metadata, |key, dtype| { - session - .resolve_edge_property(key, dtype, true) - .map_err(into_graph_err) - })?; - - #[cfg(feature = "python")] - let mut pb = build_progress_bar("Loading edges".to_string(), df_view.num_rows)?; - - let mut src_col_resolved = vec![]; - let mut dst_col_resolved = vec![]; - let mut eid_col_resolved: Vec = vec![]; - let mut eids_exist: Vec = vec![]; // exists or needs to be created - let mut layer_eids_exist: Vec = vec![]; // exists or needs to be created - - let mut write_locked_graph = graph.write_lock().map_err(into_graph_err)?; - - // set the type of the resolver; - let chunks = df_view.chunks.peekable(); - - let num_nodes = AtomicUsize::new(write_locked_graph.graph().internal_num_nodes()); - - for chunk in chunks { - let df = chunk?; - let prop_cols = - combine_properties_arrow(properties, &properties_indices, &df, |key, dtype| { - session - .resolve_edge_property(key, dtype, false) - .map_err(into_graph_err) - })?; - let metadata_cols = - combine_properties_arrow(metadata, &metadata_indices, &df, |key, dtype| { - session - .resolve_edge_property(key, dtype, true) - .map_err(into_graph_err) - })?; - - src_col_resolved.resize_with(df.len(), Default::default); - dst_col_resolved.resize_with(df.len(), Default::default); - - // let src_col_shared = atomic_usize_from_mut_slice(cast_slice_mut(&mut src_col_resolved)); - // let dst_col_shared = atomic_usize_from_mut_slice(cast_slice_mut(&mut dst_col_resolved)); - - let layer = lift_layer_col(layer, layer_index, &df)?; - let layer_col_resolved = layer.resolve(graph)?; - - let src_col = df.node_col(src_index)?; - src_col.validate(graph, LoadError::MissingSrcError)?; - - let dst_col = df.node_col(dst_index)?; - dst_col.validate(graph, LoadError::MissingDstError)?; - - // It's our graph, no one else can change it - src_col - .par_iter() - .zip(src_col_resolved.par_iter_mut()) - .try_for_each(|(gid, resolved)| { - let gid = gid.ok_or(LoadError::FatalError)?; - let vid = write_locked_graph - .graph() - .resolve_node(gid.as_node_ref()) - .map_err(|_| LoadError::FatalError)?; - - if vid.is_new() { - num_nodes.fetch_add(1, Ordering::Relaxed); - } - - *resolved = vid.inner(); - Ok::<(), LoadError>(()) - })?; - - dst_col - .par_iter() - .zip(dst_col_resolved.par_iter_mut()) - .try_for_each(|(gid, resolved)| { - let gid = gid.ok_or(LoadError::FatalError)?; - let vid = write_locked_graph - .graph() - .resolve_node(gid.as_node_ref()) - .map_err(|_| LoadError::FatalError)?; - - if vid.is_new() { - num_nodes.fetch_add(1, Ordering::Relaxed); - } - - *resolved = vid.inner(); - Ok::<(), LoadError>(()) - })?; - - let time_col = df.time_col(time_index)?; - - // Load the secondary index column if it exists, otherwise generate from start_id. - let secondary_index_col = match secondary_index_index { - Some(col_index) => { - // Update the event_id to reflect ingesting new secondary indices. - let col = df.secondary_index_col(col_index)?; - session - .set_max_event_id(col.max()) - .map_err(into_graph_err)?; - col - } - None => { - let start_id = session - .reserve_event_ids(df.len()) - .map_err(into_graph_err)?; - SecondaryIndexCol::new_from_range(start_id, start_id + df.len()) - } - }; - - write_locked_graph.resize_chunks_to_num_nodes(num_nodes.load(Ordering::Relaxed)); - - eid_col_resolved.resize_with(df.len(), Default::default); - eids_exist.resize_with(df.len(), Default::default); - layer_eids_exist.resize_with(df.len(), Default::default); - let eid_col_shared = atomic_usize_from_mut_slice(cast_slice_mut(&mut eid_col_resolved)); - - let edges = write_locked_graph.graph().storage().edges().clone(); - let next_edge_id = |row: usize| { - let (page, pos) = edges.reserve_free_pos(row); - pos.as_eid(page, edges.max_page_len()) - }; - - let mut per_segment_edge_count = Vec::with_capacity(write_locked_graph.nodes.len()); - per_segment_edge_count.resize_with(write_locked_graph.nodes.len(), || AtomicUsize::new(0)); - - let WriteLockedGraph { - nodes, ref edges, .. - } = &mut write_locked_graph; - - // Generate all edge_ids + add outbound edges - nodes - .iter_mut() // TODO: change to par_iter_mut but preserve edge_id order - .enumerate() - .for_each(|(page_id, locked_page)| { - // Zip all columns for iteration. - let zip = izip!( - src_col_resolved.iter(), - src_col.iter(), - dst_col_resolved.iter(), - time_col.iter(), - secondary_index_col.iter(), - layer_col_resolved.iter() - ); - - for (row, (src, src_gid, dst, time, secondary_index, layer)) in zip.enumerate() { - if let Some(src_pos) = locked_page.resolve_pos(*src) { - let mut writer = locked_page.writer(); - let t = TimeIndexEntry(time, secondary_index); - writer.store_node_id(src_pos, 0, src_gid, 0); - // find the original EID in the static graph if it exists - // otherwise create a new one - - let edge_id = if let Some(edge_id) = writer.get_out_edge(src_pos, *dst, 0) { - eid_col_shared[row].store(edge_id.0, Ordering::Relaxed); - eids_exist[row].store(true, Ordering::Relaxed); - edge_id.with_layer(*layer) - } else { - let edge_id = next_edge_id(row); - - writer.add_static_outbound_edge(src_pos, *dst, edge_id, 0); - eid_col_shared[row].store(edge_id.0, Ordering::Relaxed); - eids_exist[row].store(false, Ordering::Relaxed); - edge_id.with_layer(*layer) - }; - - if edges.exists(edge_id) - || writer.get_out_edge(src_pos, *dst, *layer).is_some() - { - layer_eids_exist[row].store(true, Ordering::Relaxed); - // node additions - writer.update_timestamp(t, src_pos, edge_id, 0); - } else { - layer_eids_exist[row].store(false, Ordering::Relaxed); - // actually adds the edge - writer.add_outbound_edge(Some(t), src_pos, *dst, edge_id, 0); - } - - per_segment_edge_count[page_id].fetch_add(1, Ordering::Relaxed); - } - } - }); - - let aprox_num_edges = write_locked_graph.graph().internal_num_edges() + df.len(); - - write_locked_graph.resize_chunks_to_num_edges(aprox_num_edges); - - rayon::scope(|sc| { - // Add inbound edges - sc.spawn(|_| { - write_locked_graph - .nodes - .par_iter_mut() - .enumerate() - .for_each(|(page_id, shard)| { - let zip = izip!( - src_col_resolved.iter(), - dst_col_resolved.iter(), - dst_col.iter(), - eid_col_resolved.iter(), - time_col.iter(), - secondary_index_col.iter(), - layer_col_resolved.iter(), - layer_eids_exist.iter().map(|a| a.load(Ordering::Relaxed)), - eids_exist.iter().map(|b| b.load(Ordering::Relaxed)) - ); - - for ( - src, - dst, - dst_gid, - eid, - time, - secondary_index, - layer, - edge_exists_in_layer, - edge_exists_in_static_graph, - ) in zip - { - if let Some(dst_pos) = shard.resolve_pos(*dst) { - let t = TimeIndexEntry(time, secondary_index); - let mut writer = shard.writer(); - - writer.store_node_id(dst_pos, 0, dst_gid, 0); - - if !edge_exists_in_static_graph { - writer.add_static_inbound_edge(dst_pos, *src, *eid, 0); - } - - if !edge_exists_in_layer { - writer.add_inbound_edge( - Some(t), - dst_pos, - *src, - eid.with_layer(*layer), - 0, - ); - } else { - writer.update_timestamp(t, dst_pos, eid.with_layer(*layer), 0); - } - - per_segment_edge_count[page_id].fetch_add(1, Ordering::Relaxed); - } - } - }); - }); - - // Add temporal & constant properties to edges - sc.spawn(|_| { - write_locked_graph.edges.par_iter_mut().for_each(|shard| { - let zip = izip!( - src_col_resolved.iter(), - dst_col_resolved.iter(), - time_col.iter(), - secondary_index_col.iter(), - eid_col_resolved.iter(), - layer_col_resolved.iter(), - eids_exist - .iter() - .map(|exists| exists.load(Ordering::Relaxed)) - ); - let mut t_props: Vec<(usize, Prop)> = vec![]; - let mut c_props: Vec<(usize, Prop)> = vec![]; - - for (row, (src, dst, time, secondary_index, eid, layer, exists)) in - zip.enumerate() - { - if let Some(eid_pos) = shard.resolve_pos(*eid) { - let t = TimeIndexEntry(time, secondary_index); - let mut writer = shard.writer(); - - t_props.clear(); - t_props.extend(prop_cols.iter_row(row)); - - c_props.clear(); - c_props.extend(metadata_cols.iter_row(row)); - c_props.extend_from_slice(&shared_metadata); - - writer.bulk_add_edge( - t, - eid_pos, - *src, - *dst, - exists, - *layer, - c_props.drain(..), - t_props.drain(..), - 0, - ); - } - } - }); - }); - }); - - #[cfg(feature = "python")] - let _ = pb.update(df.len()); - } - - Ok(()) -} - -fn load_into_shard( - src_col_shared: &[AtomicUsize], - dst_col_shared: &[AtomicUsize], - src_col: &super::node_col::NodeCol, - dst_col: &super::node_col::NodeCol, - node_count: &AtomicUsize, - shard: &mut ResolverShardT<'_, T>, - mut mapper_fn: impl FnMut(GidRef<'_>) -> Cow<'_, Q>, - mut fallback_fn: impl FnMut(&Q) -> Option, -) -> Result<(), LoadError> -where - T: Clone + Eq + std::hash::Hash + Borrow, - Q: Eq + std::hash::Hash + ToOwned + ?Sized, -{ - let src_iter = src_col.iter().map(&mut mapper_fn).enumerate(); - - for (id, gid) in src_iter { - if let Some(vid) = shard.resolve_node(&gid, |id| { - // fallback_fn(id).map(Either::Right).unwrap_or_else(|| { - // // If the node does not exist, create a new VID - // Either::Left(VID(node_count.fetch_add(1, Ordering::Relaxed))) - // }) - Either::Left(VID(node_count.fetch_add(1, Ordering::Relaxed))) - }) { - src_col_shared[id].store(vid.0, Ordering::Relaxed); - } - } - - let dst_iter = dst_col.iter().map(mapper_fn).enumerate(); - for (id, gid) in dst_iter { - if let Some(vid) = shard.resolve_node(&gid, |id| { - // fallback_fn(id).map(Either::Right).unwrap_or_else(|| { - // // If the node does not exist, create a new VID - // Either::Left(VID(node_count.fetch_add(1, Ordering::Relaxed))) - // }) - Either::Left(VID(node_count.fetch_add(1, Ordering::Relaxed))) - }) { - dst_col_shared[id].store(vid.0, Ordering::Relaxed); - } - } - Ok::<_, LoadError>(()) -} - -pub(crate) fn load_edge_deletions_from_df< - G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps + DeletionOps, ->( - df_view: DFView>>, - time: &str, - secondary_index: Option<&str>, - src: &str, - dst: &str, - layer: Option<&str>, - layer_col: Option<&str>, - graph: &G, -) -> Result<(), GraphError> { - if df_view.is_empty() { - return Ok(()); - } - let src_index = df_view.get_index(src)?; - let dst_index = df_view.get_index(dst)?; - let time_index = df_view.get_index(time)?; - let secondary_index_index = secondary_index - .map(|col| df_view.get_index(col)) - .transpose()?; - let layer_index = layer_col.map(|layer_col| df_view.get_index(layer_col.as_ref())); - let layer_index = layer_index.transpose()?; - #[cfg(feature = "python")] - let mut pb = build_progress_bar("Loading edge deletions".to_string(), df_view.num_rows)?; - let session = graph.write_session().map_err(into_graph_err)?; - - for chunk in df_view.chunks { - let df = chunk?; - let layer = lift_layer_col(layer, layer_index, &df)?; - let src_col = df.node_col(src_index)?; - let dst_col = df.node_col(dst_index)?; - let time_col = df.time_col(time_index)?; - - // Load the secondary index column if it exists, otherwise generate from start_id. - let secondary_index_col = match secondary_index_index { - Some(col_index) => { - // Update the event_id to reflect ingesting new secondary indices. - let col = df.secondary_index_col(col_index)?; - session - .set_max_event_id(col.max()) - .map_err(into_graph_err)?; - col - } - None => { - let start_id = session - .reserve_event_ids(df.len()) - .map_err(into_graph_err)?; - SecondaryIndexCol::new_from_range(start_id, start_id + df.len()) - } - }; - - src_col - .iter() - .zip(dst_col.iter()) - .zip(time_col.iter()) - .zip(secondary_index_col.iter()) - .zip(layer.iter()) - .try_for_each(|((((src, dst), time), secondary_index), layer)| { - // let src = src.ok_or(LoadError::MissingSrcError)?; - // let dst = dst.ok_or(LoadError::MissingDstError)?; - graph.delete_edge((time, secondary_index), src, dst, layer)?; - Ok::<(), GraphError>(()) - })?; - - #[cfg(feature = "python")] - let _ = pb.update(df.len()); - } - - Ok(()) -} - -pub(crate) fn load_node_props_from_df< - 'a, - G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps + std::fmt::Debug, ->( - df_view: DFView>>, - node_id: &str, - node_type: Option<&str>, - node_type_col: Option<&str>, - metadata: &[&str], - shared_metadata: Option<&HashMap>, - graph: &G, -) -> Result<(), GraphError> { - if df_view.is_empty() { - return Ok(()); - } - let metadata_indices = metadata - .iter() - .map(|name| df_view.get_index(name)) - .collect::, GraphError>>()?; - - let node_type_index = - node_type_col.map(|node_type_col| df_view.get_index(node_type_col.as_ref())); - let node_type_index = node_type_index.transpose()?; - - let node_id_index = df_view.get_index(node_id)?; - let session = graph.write_session().map_err(into_graph_err)?; - - let shared_metadata = process_shared_properties(shared_metadata, |key, dtype| { - session - .resolve_node_property(key, dtype, true) - .map_err(into_graph_err) - })?; - - #[cfg(feature = "python")] - let mut pb = build_progress_bar("Loading node properties".to_string(), df_view.num_rows)?; - - let mut node_col_resolved = vec![]; - let mut node_type_col_resolved = vec![]; - - let mut write_locked_graph = graph.write_lock().map_err(into_graph_err)?; - - for chunk in df_view.chunks { - let df = chunk?; - let metadata_cols = - combine_properties_arrow(metadata, &metadata_indices, &df, |key, dtype| { - session - .resolve_node_property(key, dtype, true) - .map_err(into_graph_err) - })?; - let node_type_col = lift_node_type_col(node_type, node_type_index, &df)?; - let node_col = df.node_col(node_id_index)?; - - node_col_resolved.resize_with(df.len(), Default::default); - node_type_col_resolved.resize_with(df.len(), Default::default); - - node_col - .iter() - .zip(node_col_resolved.iter_mut()) - .zip(node_type_col.iter()) - .zip(node_type_col_resolved.iter_mut()) - .try_for_each(|(((gid, resolved), node_type), node_type_resolved)| { - let (vid, res_node_type) = write_locked_graph - .graph() - .resolve_node_and_type(gid.as_node_ref(), node_type) - .map_err(|_| LoadError::FatalError)?; - *resolved = vid; - *node_type_resolved = res_node_type; - Ok::<(), LoadError>(()) - })?; - - write_locked_graph - .resize_chunks_to_num_nodes(write_locked_graph.graph().internal_num_nodes()); - - write_locked_graph.nodes.iter_mut().try_for_each(|shard| { - let mut c_props = vec![]; - - for (idx, ((vid, node_type), gid)) in node_col_resolved - .iter() - .zip(node_type_col_resolved.iter()) - .zip(node_col.iter()) - .enumerate() - { - if let Some(mut_node) = shard.resolve_pos(*vid) { - let mut writer = shard.writer(); - writer.store_node_id_and_node_type(mut_node, 0, gid, *node_type, 0); - - c_props.clear(); - c_props.extend(metadata_cols.iter_row(idx)); - c_props.extend_from_slice(&shared_metadata); - writer.update_c_props(mut_node, 0, c_props.drain(..), 0); - }; - } - - Ok::<_, GraphError>(()) - })?; - - #[cfg(feature = "python")] - let _ = pb.update(df.len()); - } - Ok(()) -} - -pub(crate) fn load_edges_props_from_df< - G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps, ->( - df_view: DFView>>, - src: &str, - dst: &str, - metadata: &[&str], - shared_metadata: Option<&HashMap>, - layer: Option<&str>, - layer_col: Option<&str>, - graph: &G, -) -> Result<(), GraphError> { - if df_view.is_empty() { - return Ok(()); - } - let metadata_indices = metadata - .iter() - .map(|name| df_view.get_index(name)) - .collect::, GraphError>>()?; - - let src_index = df_view.get_index(src)?; - let dst_index = df_view.get_index(dst)?; - let layer_index = if let Some(layer_col) = layer_col { - Some(df_view.get_index(layer_col.as_ref())?) - } else { - None - }; - let session = graph.write_session().map_err(into_graph_err)?; - let shared_metadata = process_shared_properties(shared_metadata, |key, dtype| { - session - .resolve_edge_property(key, dtype, true) - .map_err(into_graph_err) - })?; - - #[cfg(feature = "python")] - let mut pb = build_progress_bar("Loading edge properties".to_string(), df_view.num_rows)?; - #[cfg(feature = "python")] - let _ = pb.update(0); - - let mut src_col_resolved = vec![]; - let mut dst_col_resolved = vec![]; - let mut eid_col_resolved = vec![]; - - let mut write_locked_graph = graph.write_lock().map_err(into_graph_err)?; - - let g = write_locked_graph.graph; - - for chunk in df_view.chunks { - let df = chunk?; - let metadata_cols = - combine_properties_arrow(metadata, &metadata_indices, &df, |key, dtype| { - session - .resolve_edge_property(key, dtype, true) - .map_err(into_graph_err) - })?; - let layer = lift_layer_col(layer, layer_index, &df)?; - let layer_col_resolved = layer.resolve(graph)?; - - let src_col = df.node_col(src_index)?; - src_col.validate(graph, LoadError::MissingSrcError)?; - - let dst_col = df.node_col(dst_index)?; - dst_col.validate(graph, LoadError::MissingDstError)?; - - // It's our graph, no one else can change it - src_col_resolved.resize_with(df.len(), Default::default); - src_col - .par_iter() - .zip(src_col_resolved.par_iter_mut()) - .try_for_each(|(gid, resolved)| { - let gid = gid.ok_or(LoadError::FatalError)?; - let vid = g - .resolve_node_ref(gid.as_node_ref()) - .ok_or(LoadError::MissingNodeError)?; - *resolved = vid; - Ok::<(), LoadError>(()) - })?; - - dst_col_resolved.resize_with(df.len(), Default::default); - dst_col - .par_iter() - .zip(dst_col_resolved.par_iter_mut()) - .try_for_each(|(gid, resolved)| { - let gid = gid.ok_or(LoadError::FatalError)?; - let vid = g - .resolve_node_ref(gid.as_node_ref()) - .ok_or(LoadError::MissingNodeError)?; - *resolved = vid; - Ok::<(), LoadError>(()) - })?; - - write_locked_graph - .resize_chunks_to_num_nodes(write_locked_graph.graph().internal_num_nodes()); - - // resolve all the edges - eid_col_resolved.resize_with(df.len(), Default::default); - let eid_col_shared = atomic_usize_from_mut_slice(cast_slice_mut(&mut eid_col_resolved)); - - write_locked_graph - .nodes - .par_iter_mut() - .try_for_each(|shard| { - for (row, (src, dst)) in src_col_resolved - .iter() - .zip(dst_col_resolved.iter()) - .enumerate() - { - if let Some(src_node) = shard.resolve_pos(*src) { - let writer = shard.writer(); - let EID(eid) = writer - .get_out_edge(src_node, *dst, 0) - .ok_or(LoadError::MissingEdgeError(*src, *dst))?; - eid_col_shared[row].store(eid, Ordering::Relaxed); - } - } - Ok::<_, LoadError>(()) - })?; - - write_locked_graph - .edges - .par_iter_mut() - .try_for_each(|shard| { - let mut c_props = vec![]; - for (idx, (((eid, layer), src), dst)) in eid_col_resolved - .iter() - .zip(layer_col_resolved.iter()) - .zip(&src_col_resolved) - .zip(&dst_col_resolved) - .enumerate() - { - if let Some(eid_pos) = shard.resolve_pos(*eid) { - let mut writer = shard.writer(); - c_props.clear(); - c_props.extend(metadata_cols.iter_row(idx)); - c_props.extend_from_slice(&shared_metadata); - writer.update_c_props(eid_pos, *src, *dst, *layer, c_props.drain(..)); - } - } - Ok::<(), GraphError>(()) - })?; - - #[cfg(feature = "python")] - let _ = pb.update(df.len()); - } - Ok(()) -} - -pub(crate) fn load_graph_props_from_df< - G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps, ->( - df_view: DFView>>, - time: &str, - secondary_index: Option<&str>, - properties: Option<&[&str]>, - metadata: Option<&[&str]>, - graph: &G, -) -> Result<(), GraphError> { - if df_view.is_empty() { - return Ok(()); - } - let properties = properties.unwrap_or(&[]); - let metadata = metadata.unwrap_or(&[]); - - let properties_indices = properties - .iter() - .map(|name| df_view.get_index(name)) - .collect::, GraphError>>()?; - let metadata_indices = metadata - .iter() - .map(|name| df_view.get_index(name)) - .collect::, GraphError>>()?; - - let time_index = df_view.get_index(time)?; - let secondary_index_index = secondary_index - .map(|col| df_view.get_index(col)) - .transpose()?; - - #[cfg(feature = "python")] - let mut pb = build_progress_bar("Loading graph properties".to_string(), df_view.num_rows)?; - let session = graph.write_session().map_err(into_graph_err)?; - - for chunk in df_view.chunks { - let df = chunk?; - let prop_cols = - combine_properties_arrow(properties, &properties_indices, &df, |key, dtype| { - session - .resolve_graph_property(key, dtype, false) - .map_err(into_graph_err) - })?; - let metadata_cols = - combine_properties_arrow(metadata, &metadata_indices, &df, |key, dtype| { - session - .resolve_graph_property(key, dtype, true) - .map_err(into_graph_err) - })?; - let time_col = df.time_col(time_index)?; - - // Load the secondary index column if it exists, otherwise generate from start_id. - let secondary_index_col = match secondary_index_index { - Some(col_index) => { - // Update the event_id to reflect ingesting new secondary indices. - let col = df.secondary_index_col(col_index)?; - session - .set_max_event_id(col.max()) - .map_err(into_graph_err)?; - col - } - None => { - let start_id = session - .reserve_event_ids(df.len()) - .map_err(into_graph_err)?; - let col = SecondaryIndexCol::new_from_range(start_id, start_id + df.len()); - col - } - }; - - time_col - .par_iter() - .zip(secondary_index_col.par_iter()) - .zip(prop_cols.par_rows()) - .zip(metadata_cols.par_rows()) - .try_for_each(|(((time, secondary_index), t_props), c_props)| { - let t = TimeIndexEntry(time, secondary_index); - let t_props: Vec<_> = t_props.collect(); - - if !t_props.is_empty() { - graph - .internal_add_properties(t, &t_props) - .map_err(into_graph_err)?; - } - - let c_props: Vec<_> = c_props.collect(); - - if !c_props.is_empty() { - graph - .internal_add_metadata(&c_props) - .map_err(into_graph_err)?; - } - - Ok::<(), GraphError>(()) - })?; - - #[cfg(feature = "python")] - let _ = pb.update(df.len()); - } - - Ok(()) -} diff --git a/raphtory/src/io/arrow/df_loaders/edge_props.rs b/raphtory/src/io/arrow/df_loaders/edge_props.rs new file mode 100644 index 0000000000..a2de196f94 --- /dev/null +++ b/raphtory/src/io/arrow/df_loaders/edge_props.rs @@ -0,0 +1,225 @@ +use crate::{ + db::api::view::StaticGraphViewOps, + errors::{into_graph_err, GraphError, LoadError}, + io::arrow::{ + dataframe::{DFChunk, DFView}, + df_loaders::{ + build_progress_bar, + edges::{get_or_resolve_node_vids, store_node_ids, ColumnNames}, + extract_secondary_index_col, process_shared_properties, resolve_nodes_with_cache, + GidKey, + }, + layer_col::lift_layer_col, + node_col::NodeCol, + prop_handler::*, + }, + prelude::*, +}; +use arrow::{array::AsArray, datatypes::UInt64Type}; +use bytemuck::checked::cast_slice_mut; +use db4_graph::WriteLockedGraph; +use itertools::izip; +use kdam::BarExt; +use raphtory_api::{ + atomic_extra::{atomic_usize_from_mut_slice, atomic_vid_from_mut_slice}, + core::{ + entities::EID, + storage::{dict_mapper::MaybeNew, timeindex::TimeIndexEntry, FxDashMap}, + }, +}; +use raphtory_core::entities::VID; +use raphtory_storage::mutation::addition_ops::SessionAdditionOps; +use rayon::prelude::*; +use std::{ + collections::HashMap, + sync::{ + atomic::{AtomicBool, AtomicUsize, Ordering}, + mpsc, + }, +}; +use storage::{ + api::{edges::EdgeSegmentOps, nodes::NodeSegmentOps}, + pages::locked::{edges::LockedEdgePage, nodes::LockedNodePage}, + Extension, +}; + +#[allow(clippy::too_many_arguments)] +pub fn load_edges_from_df( + df_view: DFView> + Send>, + column_names: ColumnNames, + resolve_nodes: bool, + metadata: &[&str], + shared_metadata: Option<&HashMap>, + layer: Option<&str>, + graph: &G, +) -> Result<(), GraphError> { + if df_view.is_empty() { + return Ok(()); + } + + let ColumnNames { + src, + dst, + layer_col, + layer_id_col, + .. + } = column_names; + + let metadata_indices = metadata + .iter() + .map(|name| df_view.get_index(name)) + .collect::, GraphError>>()?; + + let src_index = df_view.get_index(src)?; + let dst_index = df_view.get_index(dst)?; + let layer_id_index = layer_id_col.and_then(|name| df_view.get_index_opt(name)); + let layer_index = layer_col.map(|name| df_view.get_index(name)).transpose()?; + + let session = graph.write_session().map_err(into_graph_err)?; + let shared_metadata = process_shared_properties(shared_metadata, |key, dtype| { + session + .resolve_edge_property(key, dtype, true) + .map_err(into_graph_err) + })?; + + // #[cfg(feature = "python")] + let mut pb = build_progress_bar("Loading edges metadata".to_string(), df_view.num_rows)?; + + let mut src_col_resolved: Vec = vec![]; + let mut dst_col_resolved: Vec = vec![]; + let mut eid_col_resolved: Vec = vec![]; + + rayon::scope(|s| { + let (tx, rx) = mpsc::sync_channel(2); + + s.spawn(move |_| { + let sender = tx; + for chunk in df_view.chunks { + sender.send(chunk).unwrap() + } + }); + + for chunk in rx.iter() { + let df = chunk?; + let metadata_cols = + combine_properties_arrow(metadata, &metadata_indices, &df, |key, dtype| { + session + .resolve_edge_property(key, dtype, true) + .map_err(into_graph_err) + })?; + // validate src and dst columns + let src_col = df.node_col(src_index)?; + src_col.validate(graph, LoadError::MissingSrcError)?; + let dst_col = df.node_col(dst_index)?; + dst_col.validate(graph, LoadError::MissingDstError)?; + let layer = lift_layer_col(layer, layer_index, &df)?; + let layer_id_values = layer_id_index + .map(|idx| { + df.chunk[idx] + .as_primitive_opt::() + .ok_or_else(|| { + LoadError::InvalidLayerType(df.chunk[idx].data_type().clone()) + }) + .map(|array| array.values().as_ref()) + }) + .transpose()?; + let layer_col_resolved = layer.resolve_layer(layer_id_values, graph)?; + + let (src_vids, dst_vids, gid_str_cache) = get_or_resolve_node_vids( + graph, + src_index, + dst_index, + &mut src_col_resolved, + &mut dst_col_resolved, + resolve_nodes, + &df, + &src_col, + &dst_col, + )?; + + let mut write_locked_graph = graph.write_lock().map_err(into_graph_err)?; + + eid_col_resolved.resize_with(df.len(), Default::default); + let eid_col_shared = atomic_usize_from_mut_slice(cast_slice_mut(&mut eid_col_resolved)); + + let WriteLockedGraph { nodes, .. } = &mut write_locked_graph; + + // Generate all edge_ids + add outbound edges + nodes.par_iter_mut().try_for_each(|locked_page| { + // Zip all columns for iteration. + let zip = izip!(src_vids.iter(), dst_vids.iter()); + add_and_resolve_outbound_edges(&eid_col_shared, locked_page, zip)?; + // resolve_nodes=false + // assumes we are loading our own graph, via the parquet loaders, + // so previous calls have already stored the node ids and types + if resolve_nodes { + store_node_ids(&gid_str_cache, locked_page); + } + Ok::<_, GraphError>(()) + })?; + + drop(write_locked_graph); + + let mut write_locked_graph = graph.write_lock().map_err(into_graph_err)?; + + write_locked_graph.edges.par_iter_mut().for_each(|shard| { + let zip = izip!( + src_vids.iter(), + dst_vids.iter(), + eid_col_resolved.iter(), + layer_col_resolved.iter(), + ); + update_edge_metadata(&shared_metadata, &metadata_cols, shard, zip); + }); + + // #[cfg(feature = "python")] + let _ = pb.update(df.len()); + } + Ok::<_, GraphError>(()) + })?; + // set the type of the resolver; + + Ok(()) +} + +#[inline(never)] +fn add_and_resolve_outbound_edges<'a, NS: NodeSegmentOps>( + eid_col_shared: &&mut [AtomicUsize], + locked_page: &mut LockedNodePage<'_, NS>, + zip: impl Iterator, +) -> Result<(), LoadError> { + for (row, (src, dst)) in zip.enumerate() { + if let Some(src_pos) = locked_page.resolve_pos(*src) { + let writer = locked_page.writer(); + // find the original EID in the static graph if it exists + // otherwise create a new one + if let Some(edge_id) = writer.get_out_edge(src_pos, *dst, 0) { + eid_col_shared[row].store(edge_id.0, Ordering::Relaxed); + } else { + return Err(LoadError::MissingEdgeError(*src, *dst)); + }; + } + } + Ok(()) +} + +#[inline(never)] +fn update_edge_metadata<'a, ES: EdgeSegmentOps>( + shared_metadata: &[(usize, Prop)], + metadata_cols: &PropCols, + shard: &mut LockedEdgePage<'_, ES>, + zip: impl Iterator, +) { + let mut c_props: Vec<(usize, Prop)> = Vec::new(); + for (row, (src, dst, eid, layer)) in zip.enumerate() { + if let Some(eid_pos) = shard.resolve_pos(*eid) { + let mut writer = shard.writer(); + + c_props.clear(); + c_props.extend(metadata_cols.iter_row(row)); + c_props.extend_from_slice(shared_metadata); + + writer.update_c_props(eid_pos, *src, *dst, *layer, c_props.drain(..)); + } + } +} diff --git a/raphtory/src/io/arrow/df_loaders/edges.rs b/raphtory/src/io/arrow/df_loaders/edges.rs new file mode 100644 index 0000000000..e53308b5e3 --- /dev/null +++ b/raphtory/src/io/arrow/df_loaders/edges.rs @@ -0,0 +1,535 @@ +use crate::{ + db::api::view::StaticGraphViewOps, + errors::{into_graph_err, GraphError, LoadError}, + io::arrow::{ + dataframe::{DFChunk, DFView}, + df_loaders::{ + build_progress_bar, extract_secondary_index_col, process_shared_properties, + resolve_nodes_with_cache, GidKey, + }, + layer_col::lift_layer_col, + node_col::NodeCol, + prop_handler::*, + }, + prelude::*, +}; +use arrow::{array::AsArray, datatypes::UInt64Type}; +use bytemuck::checked::cast_slice_mut; +use db4_graph::WriteLockedGraph; +use itertools::izip; +use kdam::BarExt; +use raphtory_api::{ + atomic_extra::{atomic_usize_from_mut_slice, atomic_vid_from_mut_slice}, + core::{ + entities::EID, + storage::{dict_mapper::MaybeNew, timeindex::TimeIndexEntry, FxDashMap}, + }, +}; +use raphtory_core::entities::VID; +use raphtory_storage::mutation::addition_ops::SessionAdditionOps; +use rayon::prelude::*; +use std::{ + collections::HashMap, + sync::{ + atomic::{AtomicBool, AtomicUsize, Ordering}, + mpsc, + }, +}; +use storage::{ + api::{edges::EdgeSegmentOps, nodes::NodeSegmentOps}, + pages::locked::{ + edges::{LockedEdgePage, WriteLockedEdgePages}, + nodes::LockedNodePage, + }, + Extension, +}; +use zip::unstable::write; + +#[derive(Debug, Copy, Clone)] +pub struct ColumnNames<'a> { + pub time: &'a str, + pub secondary_index: Option<&'a str>, + pub src: &'a str, + pub dst: &'a str, + pub edge_id: Option<&'a str>, + pub layer_col: Option<&'a str>, + pub layer_id_col: Option<&'a str>, +} + +impl<'a> ColumnNames<'a> { + pub fn new( + time: &'a str, + secondary_index: Option<&'a str>, + + src: &'a str, + dst: &'a str, + + layer_col: Option<&'a str>, + ) -> Self { + Self { + time, + secondary_index, + src, + dst, + layer_col, + edge_id: None, + layer_id_col: None, + } + } + + pub fn with_layer_id_col(mut self, layer_id_col: &'a str) -> Self { + self.layer_id_col = Some(layer_id_col); + self + } + + pub fn with_edge_id_col(mut self, edge_id: &'a str) -> Self { + self.edge_id = Some(edge_id); + self + } +} + +#[allow(clippy::too_many_arguments)] +pub fn load_edges_from_df( + df_view: DFView> + Send>, + column_names: ColumnNames, + resolve_nodes: bool, // this is reserved for internal parquet encoders, this cannot be exposed to users + properties: &[&str], + metadata: &[&str], + shared_metadata: Option<&HashMap>, + layer: Option<&str>, + graph: &G, +) -> Result<(), GraphError> { + if df_view.is_empty() { + return Ok(()); + } + + let ColumnNames { + time, + secondary_index, + src, + dst, + edge_id, + layer_col, + layer_id_col, + } = column_names; + + let properties_indices = properties + .iter() + .map(|name| df_view.get_index(name)) + .collect::, GraphError>>()?; + let metadata_indices = metadata + .iter() + .map(|name| df_view.get_index(name)) + .collect::, GraphError>>()?; + + let src_index = df_view.get_index(src)?; + let dst_index = df_view.get_index(dst)?; + let time_index = df_view.get_index(time)?; + let edge_index = edge_id.and_then(|name| df_view.get_index_opt(name)); + let layer_id_index = layer_id_col.and_then(|name| df_view.get_index_opt(name)); + let secondary_index_index = secondary_index + .map(|col| df_view.get_index(col)) + .transpose()?; + let layer_index = layer_col.map(|name| df_view.get_index(name)).transpose()?; + + let session = graph.write_session().map_err(into_graph_err)?; + let shared_metadata = process_shared_properties(shared_metadata, |key, dtype| { + session + .resolve_edge_property(key, dtype, true) + .map_err(into_graph_err) + })?; + + assert!( + (resolve_nodes ^ edge_index.is_some()), + "resolve_nodes must be false when edge_id is provided or true when edge_id is None, {{resolve_nodes:{resolve_nodes:?}, edge_id:{edge_index:?}}}" + ); + + assert!( + (resolve_nodes ^ layer_id_index.is_some()), + "resolve_nodes must be false when layer_id is provided or true when layer_id is None, {{resolve_nodes:{resolve_nodes:?}, layer_id:{layer_id_index:?}}}" + ); + + // #[cfg(feature = "python")] + let mut pb = build_progress_bar("Loading edges".to_string(), df_view.num_rows)?; + + let mut src_col_resolved: Vec = vec![]; + let mut dst_col_resolved: Vec = vec![]; + let mut eid_col_resolved: Vec = vec![]; + let mut eids_exist: Vec = vec![]; // exists or needs to be created + let mut layer_eids_exist: Vec = vec![]; // exists or needs to be created + + rayon::scope(|s| { + let (tx, rx) = mpsc::sync_channel(2); + + s.spawn(move |_| { + let sender = tx; + for chunk in df_view.chunks { + sender.send(chunk).unwrap() + } + }); + + let max_edge_id = AtomicUsize::new(0); + + for chunk in rx.iter() { + let df = chunk?; + let prop_cols = + combine_properties_arrow(properties, &properties_indices, &df, |key, dtype| { + session + .resolve_edge_property(key, dtype, false) + .map_err(into_graph_err) + })?; + let metadata_cols = + combine_properties_arrow(metadata, &metadata_indices, &df, |key, dtype| { + session + .resolve_edge_property(key, dtype, true) + .map_err(into_graph_err) + })?; + // validate src and dst columns + let src_col = df.node_col(src_index)?; + src_col.validate(graph, LoadError::MissingSrcError)?; + let dst_col = df.node_col(dst_index)?; + dst_col.validate(graph, LoadError::MissingDstError)?; + let layer = lift_layer_col(layer, layer_index, &df)?; + let layer_id_values = layer_id_index + .map(|idx| { + df.chunk[idx] + .as_primitive_opt::() + .ok_or_else(|| { + LoadError::InvalidLayerType(df.chunk[idx].data_type().clone()) + }) + .map(|array| array.values().as_ref()) + }) + .transpose()?; + let layer_col_resolved = layer.resolve_layer(layer_id_values, graph)?; + + let (src_vids, dst_vids, gid_str_cache) = get_or_resolve_node_vids( + graph, + src_index, + dst_index, + &mut src_col_resolved, + &mut dst_col_resolved, + resolve_nodes, + &df, + &src_col, + &dst_col, + )?; + + let time_col = df.time_col(time_index)?; + + // Load the secondary index column if it exists, otherwise generate from start_id. + let secondary_index_col = + extract_secondary_index_col::(secondary_index_index, &session, &df)?; + + let mut write_locked_graph = graph.write_lock().map_err(into_graph_err)?; + + eid_col_resolved.resize_with(df.len(), Default::default); + eids_exist.resize_with(df.len(), Default::default); + layer_eids_exist.resize_with(df.len(), Default::default); + let eid_col_shared = atomic_usize_from_mut_slice(cast_slice_mut(&mut eid_col_resolved)); + + let edges = write_locked_graph.graph().storage().edges().clone(); + let next_edge_id = |row: usize| { + let (page, pos) = edges.reserve_free_pos(row); + pos.as_eid(page, edges.max_page_len()) + }; + + let WriteLockedGraph { + nodes, ref edges, .. + } = &mut write_locked_graph; + + let eids = edge_index.and_then(|edge_id_col| { + Some( + df.chunk[edge_id_col] + .as_primitive_opt::()? + .values() + .as_ref(), + ) + }); + + // Generate all edge_ids + add outbound edges + nodes.par_iter_mut().for_each(|locked_page| { + // Zip all columns for iteration. + let zip = izip!( + src_vids.iter(), + dst_vids.iter(), + time_col.iter(), + secondary_index_col.iter(), + layer_col_resolved.iter() + ); + + // resolve_nodes=false + // assumes we are loading our own graph, via the parquet loaders, + // so previous calls have already stored the node ids and types + if resolve_nodes { + store_node_ids(&gid_str_cache, locked_page); + } + + if resolve_nodes { + add_and_resolve_outbound_edges( + &eids_exist, + &layer_eids_exist, + &eid_col_shared, + next_edge_id, + edges, + locked_page, + zip, + ); + } else if let Some(edge_ids) = eids { + add_and_resolve_outbound_edges( + &eids_exist, + &layer_eids_exist, + &eid_col_shared, + |row| { + let eid = EID(edge_ids[row] as usize); + max_edge_id.fetch_max(eid.0, Ordering::Relaxed); + eid + }, + edges, + locked_page, + zip, + ); + } + }); + + write_locked_graph.nodes.par_iter_mut().for_each(|shard| { + let zip = izip!( + src_vids.iter(), + dst_vids.iter(), + eid_col_resolved.iter(), + time_col.iter(), + secondary_index_col.iter(), + layer_col_resolved.iter(), + layer_eids_exist.iter().map(|a| a.load(Ordering::Relaxed)), + eids_exist.iter().map(|b| b.load(Ordering::Relaxed)) + ); + + add_outbound_edges(shard, zip); + }); + + drop(write_locked_graph); + + let mut write_locked_graph = graph.write_lock().map_err(into_graph_err)?; + + if !resolve_nodes { + write_locked_graph + .resize_chunks_to_num_edges(EID(max_edge_id.load(Ordering::Relaxed))); + } + + write_locked_graph.edges.par_iter_mut().for_each(|shard| { + let zip = izip!( + src_vids.iter(), + dst_vids.iter(), + time_col.iter(), + secondary_index_col.iter(), + eid_col_resolved.iter(), + layer_col_resolved.iter(), + eids_exist + .iter() + .map(|exists| exists.load(Ordering::Relaxed)) + ); + update_edge_properties(&shared_metadata, &prop_cols, &metadata_cols, shard, zip); + }); + + // #[cfg(feature = "python")] + let _ = pb.update(df.len()); + } + Ok::<_, GraphError>(()) + })?; + // set the type of the resolver; + + Ok(()) +} + +#[inline(never)] +#[allow(clippy::too_many_arguments, clippy::type_complexity)] +pub fn get_or_resolve_node_vids< + 'a: 'c, + 'b: 'c, + 'c, + G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps, +>( + graph: &G, + src_index: usize, + dst_index: usize, + src_col_resolved: &'a mut Vec, + dst_col_resolved: &'a mut Vec, + resolve_nodes: bool, + df: &'b DFChunk, + src_col: &'a NodeCol, + dst_col: &'a NodeCol, +) -> Result< + ( + &'c [VID], + &'c [VID], + FxDashMap, (Prop, MaybeNew)>, + ), + GraphError, +> { + let (src_vids, dst_vids, gid_str_cache) = if resolve_nodes { + src_col_resolved.resize_with(df.len(), Default::default); + dst_col_resolved.resize_with(df.len(), Default::default); + + let atomic_src_col = atomic_vid_from_mut_slice(src_col_resolved); + let atomic_dst_col = atomic_vid_from_mut_slice(dst_col_resolved); + + let gid_str_cache = resolve_nodes_with_cache::( + graph, + [(src_col), (dst_col)].as_ref(), + [atomic_src_col, atomic_dst_col].as_ref(), + )?; + ( + src_col_resolved.as_slice(), + dst_col_resolved.as_slice(), + gid_str_cache, + ) + } else { + let srcs = df.chunk[src_index] + .as_primitive_opt::() + .ok_or_else(|| LoadError::InvalidNodeIdType(df.chunk[src_index].data_type().clone()))? + .values() + .as_ref(); + let dsts = df.chunk[dst_index] + .as_primitive_opt::() + .ok_or_else(|| LoadError::InvalidNodeIdType(df.chunk[dst_index].data_type().clone()))? + .values() + .as_ref(); + ( + bytemuck::cast_slice(srcs), + bytemuck::cast_slice(dsts), + FxDashMap::default(), + ) + }; + Ok((src_vids, dst_vids, gid_str_cache)) +} + +#[inline(never)] +fn update_edge_properties<'a, ES: EdgeSegmentOps>( + shared_metadata: &[(usize, Prop)], + prop_cols: &PropCols, + metadata_cols: &PropCols, + shard: &mut LockedEdgePage<'_, ES>, + zip: impl Iterator, +) { + let mut t_props: Vec<(usize, Prop)> = vec![]; + let mut c_props: Vec<(usize, Prop)> = vec![]; + + for (row, (src, dst, time, secondary_index, eid, layer, exists)) in zip.enumerate() { + if let Some(eid_pos) = shard.resolve_pos(*eid) { + let t = TimeIndexEntry(time, secondary_index); + let mut writer = shard.writer(); + + t_props.clear(); + t_props.extend(prop_cols.iter_row(row)); + + c_props.clear(); + c_props.extend(metadata_cols.iter_row(row)); + c_props.extend_from_slice(shared_metadata); + + writer.bulk_add_edge( + t, + eid_pos, + *src, + *dst, + exists, + *layer, + c_props.drain(..), + t_props.drain(..), + 0, + ); + } + } +} + +#[inline(never)] +fn add_outbound_edges<'a, NS: NodeSegmentOps>( + shard: &mut LockedNodePage<'_, NS>, + zip: impl Iterator, +) { + for ( + src, + dst, + eid, + time, + secondary_index, + layer, + edge_exists_in_layer, + edge_exists_in_static_graph, + ) in zip + { + if let Some(dst_pos) = shard.resolve_pos(*dst) { + let t = TimeIndexEntry(time, secondary_index); + let mut writer = shard.writer(); + + if !edge_exists_in_static_graph { + writer.add_static_inbound_edge(dst_pos, *src, *eid, 0); + } + + if !edge_exists_in_layer { + writer.add_inbound_edge(Some(t), dst_pos, *src, eid.with_layer(*layer), 0); + } else { + writer.update_timestamp(t, dst_pos, eid.with_layer(*layer), 0); + } + } + } +} + +#[inline(never)] +fn add_and_resolve_outbound_edges< + 'a, + NS: NodeSegmentOps, + ES: EdgeSegmentOps, +>( + eids_exist: &[AtomicBool], + layer_eids_exist: &[AtomicBool], + eid_col_shared: &&mut [AtomicUsize], + next_edge_id: impl Fn(usize) -> EID, + edges: &WriteLockedEdgePages<'_, ES>, + locked_page: &mut LockedNodePage<'_, NS>, + zip: impl Iterator, +) { + for (row, (src, dst, time, secondary_index, layer)) in zip.enumerate() { + if let Some(src_pos) = locked_page.resolve_pos(*src) { + let mut writer = locked_page.writer(); + let t = TimeIndexEntry(time, secondary_index); + // find the original EID in the static graph if it exists + // otherwise create a new one + + let edge_id = if let Some(edge_id) = writer.get_out_edge(src_pos, *dst, 0) { + eid_col_shared[row].store(edge_id.0, Ordering::Relaxed); + eids_exist[row].store(true, Ordering::Relaxed); + edge_id.with_layer(*layer) + } else { + let edge_id = next_edge_id(row); + writer.add_static_outbound_edge(src_pos, *dst, edge_id, 0); + eid_col_shared[row].store(edge_id.0, Ordering::Relaxed); + eids_exist[row].store(false, Ordering::Relaxed); + edge_id.with_layer(*layer) + }; + + if edges.exists(edge_id) { + layer_eids_exist[row].store(true, Ordering::Relaxed); + // node additions + writer.update_timestamp(t, src_pos, edge_id, 0); + } else { + layer_eids_exist[row].store(false, Ordering::Relaxed); + // actually adds the edge + writer.add_outbound_edge(Some(t), src_pos, *dst, edge_id, 0); + } + } + } +} + +#[inline(never)] +pub fn store_node_ids>( + gid_str_cache: &FxDashMap)>, + locked_page: &mut LockedNodePage<'_, NS>, +) { + for entry in gid_str_cache.iter() { + let (src_gid, vid) = entry.value(); + + if let Some(src_pos) = locked_page.resolve_pos(vid.inner()) { + let mut writer = locked_page.writer(); + writer.store_node_id(src_pos, 0, src_gid.clone(), 0); + } + } +} diff --git a/raphtory/src/io/arrow/df_loaders/mod.rs b/raphtory/src/io/arrow/df_loaders/mod.rs new file mode 100644 index 0000000000..3080eceef5 --- /dev/null +++ b/raphtory/src/io/arrow/df_loaders/mod.rs @@ -0,0 +1,401 @@ +use crate::{ + core::entities::nodes::node_ref::AsNodeRef, + db::api::view::StaticGraphViewOps, + errors::{into_graph_err, GraphError, LoadError}, + io::arrow::{ + dataframe::{DFChunk, DFView, SecondaryIndexCol}, + df_loaders::edges::ColumnNames, + layer_col::{lift_layer_col, LayerCol}, + node_col::NodeCol, + prop_handler::*, + }, + prelude::*, +}; +use bytemuck::checked::cast_slice_mut; +use kdam::{Bar, BarBuilder, BarExt}; +use raphtory_api::{ + atomic_extra::atomic_usize_from_mut_slice, + core::{ + entities::{properties::prop::PropType, EID}, + storage::{dict_mapper::MaybeNew, timeindex::TimeIndexEntry, FxDashMap}, + }, +}; +use raphtory_core::entities::{GidRef, VID}; +use raphtory_storage::mutation::addition_ops::{InternalAdditionOps, SessionAdditionOps}; +use rayon::prelude::*; +use std::{ + collections::HashMap, + sync::atomic::{AtomicUsize, Ordering}, +}; + +pub mod edge_props; +pub mod edges; +pub mod nodes; + +fn build_progress_bar(des: String, num_rows: usize) -> Result { + BarBuilder::default() + .desc(des) + .animation(kdam::Animation::FillUp) + .total(num_rows) + .unit_scale(true) + .build() + .map_err(|_| GraphError::TqdmError) +} + +fn process_shared_properties( + props: Option<&HashMap>, + resolver: impl Fn(&str, PropType) -> Result, GraphError>, +) -> Result, GraphError> { + match props { + None => Ok(vec![]), + Some(props) => props + .iter() + .map(|(key, prop)| Ok((resolver(key, prop.dtype())?.inner(), prop.clone()))) + .collect(), + } +} + +pub(crate) fn load_edge_deletions_from_df< + G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps + DeletionOps, +>( + df_view: DFView>>, + time: &str, + secondary_index: Option<&str>, + src: &str, + dst: &str, + layer: Option<&str>, + layer_col: Option<&str>, + graph: &G, +) -> Result<(), GraphError> { + if df_view.is_empty() { + return Ok(()); + } + let src_index = df_view.get_index(src)?; + let dst_index = df_view.get_index(dst)?; + let time_index = df_view.get_index(time)?; + let secondary_index_index = secondary_index + .map(|col| df_view.get_index(col)) + .transpose()?; + let layer_index = layer_col.map(|layer_col| df_view.get_index(layer_col.as_ref())); + let layer_index = layer_index.transpose()?; + #[cfg(feature = "python")] + let mut pb = build_progress_bar("Loading edge deletions".to_string(), df_view.num_rows)?; + let session = graph.write_session().map_err(into_graph_err)?; + + for chunk in df_view.chunks { + let df = chunk?; + let layer = lift_layer_col(layer, layer_index, &df)?; + let src_col = df.node_col(src_index)?; + let dst_col = df.node_col(dst_index)?; + let time_col = df.time_col(time_index)?; + + // Load the secondary index column if it exists, otherwise generate from start_id. + let secondary_index_col = match secondary_index_index { + Some(col_index) => { + // Update the event_id to reflect ingesting new secondary indices. + let col = df.secondary_index_col(col_index)?; + session + .set_max_event_id(col.max()) + .map_err(into_graph_err)?; + col + } + None => { + let start_id = session + .reserve_event_ids(df.len()) + .map_err(into_graph_err)?; + SecondaryIndexCol::new_from_range(start_id, start_id + df.len()) + } + }; + + src_col + .iter() + .zip(dst_col.iter()) + .zip(time_col.iter()) + .zip(secondary_index_col.iter()) + .zip(layer.iter()) + .try_for_each(|((((src, dst), time), secondary_index), layer)| { + // let src = src.ok_or(LoadError::MissingSrcError)?; + // let dst = dst.ok_or(LoadError::MissingDstError)?; + graph.delete_edge((time, secondary_index), src, dst, layer)?; + Ok::<(), GraphError>(()) + })?; + + #[cfg(feature = "python")] + let _ = pb.update(df.len()); + } + + Ok(()) +} + +pub(crate) fn load_edges_props_from_df< + G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps, +>( + df_view: DFView> + Send>, + src: &str, + dst: &str, + metadata: &[&str], + shared_metadata: Option<&HashMap>, + layer: Option<&str>, + layer_col: Option<&str>, + graph: &G, + resolve_nodes: bool, +) -> Result<(), GraphError> { + edge_props::load_edges_from_df( + df_view, + ColumnNames::new("", None, src, dst, layer_col), + resolve_nodes, + metadata, + shared_metadata, + layer, + graph, + ) +} + +pub(crate) fn load_graph_props_from_df< + G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps, +>( + df_view: DFView>>, + time: &str, + secondary_index: Option<&str>, + properties: Option<&[&str]>, + metadata: Option<&[&str]>, + graph: &G, +) -> Result<(), GraphError> { + if df_view.is_empty() { + return Ok(()); + } + let properties = properties.unwrap_or(&[]); + let metadata = metadata.unwrap_or(&[]); + + let properties_indices = properties + .iter() + .map(|name| df_view.get_index(name)) + .collect::, GraphError>>()?; + let metadata_indices = metadata + .iter() + .map(|name| df_view.get_index(name)) + .collect::, GraphError>>()?; + + let time_index = df_view.get_index(time)?; + let secondary_index_index = secondary_index + .map(|col| df_view.get_index(col)) + .transpose()?; + + #[cfg(feature = "python")] + let mut pb = build_progress_bar("Loading graph properties".to_string(), df_view.num_rows)?; + let session = graph.write_session().map_err(into_graph_err)?; + + for chunk in df_view.chunks { + let df = chunk?; + let prop_cols = + combine_properties_arrow(properties, &properties_indices, &df, |key, dtype| { + session + .resolve_graph_property(key, dtype, false) + .map_err(into_graph_err) + })?; + let metadata_cols = + combine_properties_arrow(metadata, &metadata_indices, &df, |key, dtype| { + session + .resolve_graph_property(key, dtype, true) + .map_err(into_graph_err) + })?; + let time_col = df.time_col(time_index)?; + + // Load the secondary index column if it exists, otherwise generate from start_id. + let secondary_index_col = match secondary_index_index { + Some(col_index) => { + // Update the event_id to reflect ingesting new secondary indices. + let col = df.secondary_index_col(col_index)?; + session + .set_max_event_id(col.max()) + .map_err(into_graph_err)?; + col + } + None => { + let start_id = session + .reserve_event_ids(df.len()) + .map_err(into_graph_err)?; + let col = SecondaryIndexCol::new_from_range(start_id, start_id + df.len()); + col + } + }; + + time_col + .par_iter() + .zip(secondary_index_col.par_iter()) + .zip(prop_cols.par_rows()) + .zip(metadata_cols.par_rows()) + .try_for_each(|(((time, secondary_index), t_props), c_props)| { + let t = TimeIndexEntry(time, secondary_index); + let t_props: Vec<_> = t_props.collect(); + + if !t_props.is_empty() { + graph + .internal_add_properties(t, &t_props) + .map_err(into_graph_err)?; + } + + let c_props: Vec<_> = c_props.collect(); + + if !c_props.is_empty() { + graph + .internal_add_metadata(&c_props) + .map_err(into_graph_err)?; + } + + Ok::<(), GraphError>(()) + })?; + + #[cfg(feature = "python")] + let _ = pb.update(df.len()); + } + + Ok(()) +} + +#[inline(never)] +pub(crate) fn extract_secondary_index_col( + secondary_index_index: Option, + session: &::WS<'_>, + df: &DFChunk, +) -> Result { + let secondary_index_col = match secondary_index_index { + Some(col_index) => { + // Update the event_id to reflect ingesting new secondary indices. + let col = df.secondary_index_col(col_index)?; + session + .set_max_event_id(col.max()) + .map_err(into_graph_err)?; + col + } + None => { + let start_id = session + .reserve_event_ids(df.len()) + .map_err(into_graph_err)?; + SecondaryIndexCol::new_from_range(start_id, start_id + df.len()) + } + }; + Ok(secondary_index_col) +} + +#[inline(never)] +fn resolve_nodes_with_cache<'a, G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps>( + graph: &G, + cols_to_resolve: &[&'a NodeCol], + resolved_cols: &[&mut [AtomicUsize]], +) -> Result, (Prop, MaybeNew)>, GraphError> { + let node_type_col = vec![None; cols_to_resolve.len()]; + resolve_nodes_with_cache_generic( + &cols_to_resolve, + &node_type_col, + |v: &(Prop, MaybeNew), idx, col_idx| { + let (_, vid) = v; + resolved_cols[col_idx][idx].store(vid.inner().0, Ordering::Relaxed); + }, + |gid, _idx| { + let GidKey { gid, .. } = gid; + let vid = graph + .resolve_node(gid.as_node_ref()) + .map_err(into_graph_err)?; + Ok((Prop::from(gid), vid)) + }, + ) +} + +#[inline(never)] +fn resolve_nodes_and_type_with_cache< + 'a, + G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps, +>( + graph: &G, + cols_to_resolve: &[&'a NodeCol], + resolved_cols: &[&mut [AtomicUsize]], + node_type_col: LayerCol<'a>, +) -> Result, (VID, usize)>, GraphError> { + let node_type_cols = vec![Some(node_type_col); cols_to_resolve.len()]; + resolve_nodes_with_cache_generic( + cols_to_resolve, + &node_type_cols, + |v: &(VID, usize), row, col_idx| { + let (vid, _) = v; + resolved_cols[col_idx][row].store(vid.index(), Ordering::Relaxed); + }, + |gid, _| { + let GidKey { gid, node_type } = gid; + let (vid, node_type) = graph + .resolve_node_and_type(gid.as_node_ref(), node_type) + .map_err(into_graph_err)?; + Ok((vid, node_type)) + }, + ) +} + +#[derive(Clone, Debug, PartialEq, Eq, Hash, Copy)] +pub struct GidKey<'a> { + gid: GidRef<'a>, + node_type: Option<&'a str>, +} + +impl<'a> GidKey<'a> { + pub fn new(gid: GidRef<'a>, node_type: Option<&'a str>) -> Self { + Self { gid, node_type } + } +} + +#[inline(always)] +fn resolve_nodes_with_cache_generic<'a, V: Send + Sync>( + cols_to_resolve: &[&'a NodeCol], + node_type_cols: &[Option>], + update_fn: impl Fn(&V, usize, usize) + Send + Sync, + new_fn: impl Fn(GidKey<'a>, usize) -> Result + Send + Sync, +) -> Result, V>, GraphError> { + assert_eq!(cols_to_resolve.len(), node_type_cols.len()); + let gid_str_cache: dashmap::DashMap, V, _> = FxDashMap::default(); + let hasher_factory = gid_str_cache.hasher().clone(); + gid_str_cache + .shards() + .par_iter() + .enumerate() + .try_for_each(|(shard_idx, shard)| { + let mut shard_guard = shard.write(); + use dashmap::SharedValue; + use std::hash::BuildHasher; + + // Create hasher function for this shard + let hash_key = |key: &GidKey<'_>| -> u64 { hasher_factory.hash_one(key) }; + + let hasher_fn = + |tuple: &(GidKey<'_>, SharedValue)| -> u64 { hasher_factory.hash_one(tuple.0) }; + + for (col_id, (node_col, layer_col)) in + cols_to_resolve.iter().zip(node_type_cols).enumerate() + { + // Process src_col sequentially for this shard + for (idx, gid) in node_col.iter().enumerate() { + let node_type = layer_col.as_ref().and_then(|lc| lc.get(idx)); + let gid = GidKey::new(gid, node_type); + // Check if this key belongs to this shard + if gid_str_cache.determine_map(&gid) != shard_idx { + continue; // Skip, not our shard + } + + let hash = hash_key(&gid); + + // Check if exists in this shard + if let Some((_, value)) = shard_guard.get(hash, |(g, _)| g == &gid) { + let v = value.get(); + update_fn(&v, idx, col_id); + } else { + let v = new_fn(gid, idx)?; + + update_fn(&v, idx, col_id); + let data = (gid, SharedValue::new(v)); + shard_guard.insert(hash, data, hasher_fn); + } + } + } + + Ok::<(), GraphError>(()) + })?; + Ok(gid_str_cache) +} diff --git a/raphtory/src/io/arrow/df_loaders/nodes.rs b/raphtory/src/io/arrow/df_loaders/nodes.rs new file mode 100644 index 0000000000..c8bd068ba1 --- /dev/null +++ b/raphtory/src/io/arrow/df_loaders/nodes.rs @@ -0,0 +1,468 @@ +#[cfg(feature = "python")] +use crate::io::arrow::df_loaders::build_progress_bar; +use crate::{ + core::entities::nodes::node_ref::AsNodeRef, + db::api::view::StaticGraphViewOps, + errors::{into_graph_err, GraphError, LoadError}, + io::arrow::{ + dataframe::{DFChunk, DFView}, + df_loaders::{ + extract_secondary_index_col, process_shared_properties, + resolve_nodes_and_type_with_cache, GidKey, + }, + layer_col::{lift_node_type_col, LayerCol}, + node_col::NodeCol, + prop_handler::*, + }, + prelude::*, +}; +use arrow::{array::AsArray, datatypes::UInt64Type}; +use itertools::izip; +#[cfg(feature = "python")] +use kdam::BarExt; +use raphtory_api::{ + atomic_extra::atomic_vid_from_mut_slice, + core::{ + entities::properties::meta::STATIC_GRAPH_LAYER_ID, + storage::{timeindex::TimeIndexEntry, FxDashMap}, + }, +}; +use raphtory_core::{entities::VID, storage::timeindex::AsTime}; +use raphtory_storage::mutation::addition_ops::{InternalAdditionOps, SessionAdditionOps}; +use rayon::prelude::*; +use std::collections::HashMap; +use storage::{api::nodes::NodeSegmentOps, pages::locked::nodes::LockedNodePage, Extension}; + +pub fn load_nodes_from_df< + G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps + std::fmt::Debug, +>( + df_view: DFView>>, + time: &str, + secondary_index: Option<&str>, + node_id: &str, + properties: &[&str], + metadata: &[&str], + shared_metadata: Option<&HashMap>, + node_type: Option<&str>, + node_type_col: Option<&str>, + graph: &G, + resolve_nodes: bool, +) -> Result<(), GraphError> { + if df_view.is_empty() { + return Ok(()); + } + let properties_indices = properties + .iter() + .map(|name| df_view.get_index(name)) + .collect::, GraphError>>()?; + let metadata_indices = metadata + .iter() + .map(|name| df_view.get_index(name)) + .collect::, GraphError>>()?; + + let node_type_index = + node_type_col.map(|node_type_col| df_view.get_index(node_type_col.as_ref())); + let node_type_index = node_type_index.transpose()?; + + let node_id_index = df_view.get_index(node_id)?; + let time_index = df_view.get_index(time)?; + let secondary_index_index = secondary_index + .map(|col| df_view.get_index(col)) + .transpose()?; + + let session = graph.write_session().map_err(into_graph_err)?; + let shared_metadata = process_shared_properties(shared_metadata, |key, dtype| { + session + .resolve_node_property(key, dtype, true) + .map_err(into_graph_err) + })?; + + #[cfg(feature = "python")] + let mut pb = build_progress_bar("Loading nodes".to_string(), df_view.num_rows)?; + + let mut node_col_resolved = vec![]; + + for chunk in df_view.chunks { + let df = chunk?; + let prop_cols = + combine_properties_arrow(properties, &properties_indices, &df, |key, dtype| { + session + .resolve_node_property(key, dtype, false) + .map_err(into_graph_err) + })?; + let metadata_cols = + combine_properties_arrow(metadata, &metadata_indices, &df, |key, dtype| { + session + .resolve_node_property(key, dtype, true) + .map_err(into_graph_err) + })?; + let node_type_col = lift_node_type_col(node_type, node_type_index, &df)?; + + let time_col = df.time_col(time_index)?; + let node_col = df.node_col(node_id_index)?; + + // Load the secondary index column if it exists, otherwise generate from start_id. + let secondary_index_col = + extract_secondary_index_col::(secondary_index_index, &session, &df)?; + node_col_resolved.resize_with(df.len(), Default::default); + + let (src_vids, gid_str_cache) = get_or_resolve_node_vids::( + graph, + node_id_index, + &mut node_col_resolved, + resolve_nodes, + &df, + &node_col, + node_type_col, + )?; + + let mut write_locked_graph = graph.write_lock().map_err(into_graph_err)?; + let node_stats = write_locked_graph.node_stats().clone(); + let update_time = |time: TimeIndexEntry| { + let time = time.t(); + node_stats.update_time(time); + }; + + write_locked_graph + .nodes + .par_iter_mut() + .try_for_each(|shard| { + // Zip all columns for iteration. + let zip = izip!(src_vids.iter(), time_col.iter(), secondary_index_col.iter(),); + + // resolve_nodes=false + // assumes we are loading our own graph, via the parquet loaders, + // so previous calls have already stored the node ids and types + if resolve_nodes { + store_node_ids_and_type(&gid_str_cache, shard); + } + + for (row, (vid, time, secondary_index)) in zip.enumerate() { + if let Some(mut_node) = shard.resolve_pos(*vid) { + let mut writer = shard.writer(); + let t = TimeIndexEntry(time, secondary_index); + let layer_id = STATIC_GRAPH_LAYER_ID; + let lsn = 0; + + update_time(t); + + let t_props = prop_cols.iter_row(row); + let c_props = metadata_cols + .iter_row(row) + .chain(shared_metadata.iter().cloned()); + + writer.add_props(t, mut_node, layer_id, t_props, lsn); + writer.update_c_props(mut_node, layer_id, c_props, lsn); + }; + } + + Ok::<_, GraphError>(()) + })?; + + #[cfg(feature = "python")] + let _ = pb.update(df.len()); + } + + Ok(()) +} + +pub(crate) fn load_node_props_from_df< + 'a, + G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps + std::fmt::Debug, +>( + df_view: DFView>>, + node_id: &str, + node_type: Option<&str>, + node_type_col: Option<&str>, + node_id_col: Option<&str>, // provided by our parquet encoder + node_type_id_col: Option<&str>, // provided by our parquet encoder + metadata: &[&str], + shared_metadata: Option<&HashMap>, + graph: &G, +) -> Result<(), GraphError> { + if df_view.is_empty() { + return Ok(()); + } + let metadata_indices = metadata + .iter() + .map(|name| df_view.get_index(name)) + .collect::, GraphError>>()?; + + let node_type_index = + node_type_col.map(|node_type_col| df_view.get_index(node_type_col.as_ref())); + let node_type_index = node_type_index.transpose()?; + let node_type_ids_col = node_type_id_col + .map(|node_type_id_col| df_view.get_index(node_type_id_col.as_ref())) + .transpose()?; + + let node_id_index = node_id_col + .map(|node_col| df_view.get_index(node_col.as_ref())) + .transpose()?; + + let node_gid_index = df_view.get_index(node_id)?; + let session = graph.write_session().map_err(into_graph_err)?; + + let shared_metadata = process_shared_properties(shared_metadata, |key, dtype| { + session + .resolve_node_property(key, dtype, true) + .map_err(into_graph_err) + })?; + + let resolve_nodes = node_type_ids_col.is_some() && node_id_index.is_some(); + + #[cfg(feature = "python")] + let mut pb = build_progress_bar("Loading node properties".to_string(), df_view.num_rows)?; + + let mut node_col_resolved = vec![]; + let mut node_type_resolved = vec![]; + + for chunk in df_view.chunks { + let df = chunk?; + if df.is_empty() { + continue; + } + let metadata_cols = + combine_properties_arrow(metadata, &metadata_indices, &df, |key, dtype| { + session + .resolve_node_property(key, dtype, true) + .map_err(into_graph_err) + })?; + let node_type_col = lift_node_type_col(node_type, node_type_index, &df)?; + let node_col = df.node_col(node_gid_index)?; + + let (node_col_resolved, node_type_col_resolved) = get_or_resolve_node_vids_no_events::( + graph, + &session, + &mut node_col_resolved, + &mut node_type_resolved, + node_type_ids_col, + node_id_index, + &df, + &node_col, + node_type_col, + )?; + + // We assume this is fast enough + let max_id = node_col_resolved.iter().map(|VID(i)| *i).max().map(VID); + let mut write_locked_graph = graph.write_lock().map_err(into_graph_err)?; + write_locked_graph.resize_chunks_to_num_nodes(max_id); + + write_locked_graph.nodes.iter_mut().try_for_each(|shard| { + let mut c_props = vec![]; + + for (idx, ((vid, node_type), gid)) in node_col_resolved + .iter() + .zip(node_type_col_resolved.iter()) + .zip(node_col.iter()) + .enumerate() + { + if let Some(mut_node) = shard.resolve_pos(*vid) { + let mut writer = shard.writer(); + writer.store_node_id_and_node_type(mut_node, 0, gid, *node_type, 0); + + if resolve_nodes { + // because we don't call resolve_node above + writer.increment_seg_num_nodes() + } + + c_props.clear(); + c_props.extend(metadata_cols.iter_row(idx)); + c_props.extend_from_slice(&shared_metadata); + if !c_props.is_empty() { + writer.update_c_props(mut_node, 0, c_props.drain(..), 0); + } + }; + } + + Ok::<_, GraphError>(()) + })?; + + #[cfg(feature = "python")] + let _ = pb.update(df.len()); + } + Ok(()) +} + +#[allow(clippy::too_many_arguments, clippy::type_complexity)] +fn get_or_resolve_node_vids< + 'a: 'c, + 'b: 'c, + 'c, + G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps, +>( + graph: &G, + src_index: usize, + src_col_resolved: &'a mut Vec, + resolve_nodes: bool, + df: &'b DFChunk, + src_col: &'a NodeCol, + node_type_col: LayerCol<'a>, +) -> Result<(&'c [VID], FxDashMap, (VID, usize)>), GraphError> { + let (src_vids, gid_str_cache) = if resolve_nodes { + src_col_resolved.resize_with(df.len(), Default::default); + + let atomic_src_col = atomic_vid_from_mut_slice(src_col_resolved); + + let gid_str_cache = resolve_nodes_and_type_with_cache::( + graph, + [src_col].as_ref(), + [atomic_src_col].as_ref(), + node_type_col, + )?; + (src_col_resolved.as_slice(), gid_str_cache) + } else { + let srcs = df.chunk[src_index] + .as_primitive_opt::() + .ok_or_else(|| LoadError::InvalidNodeIdType(df.chunk[src_index].data_type().clone()))? + .values() + .as_ref(); + (bytemuck::cast_slice(srcs), FxDashMap::default()) + }; + Ok((src_vids, gid_str_cache)) +} + +#[allow(clippy::too_many_arguments, clippy::type_complexity)] +fn get_or_resolve_node_vids_no_events< + 'a: 'c, + 'b: 'c, + 'c, + G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps, +>( + graph: &G, + session: &::WS<'_>, + node_col_resolved: &'a mut Vec, + node_type_resolved: &'a mut Vec, + node_type_ids_col: Option, + node_id_col: Option, + df: &'b DFChunk, + src_col: &'a NodeCol, + node_type_col: LayerCol<'a>, +) -> Result<(&'c [VID], &'c [usize]), GraphError> { + assert!(!(node_type_ids_col.is_none() ^ node_id_col.is_none())); // both some or both none + if let Some((node_type_index, node_id_col)) = node_type_ids_col.zip(node_id_col) { + set_meta_for_pre_resolved_nodes_and_node_ids( + graph, + session, + df, + src_col, + node_type_col, + node_type_index, + node_id_col, + ) + } else { + resolve_node_and_meta_for_node_col( + graph, + node_col_resolved, + node_type_resolved, + df, + src_col, + node_type_col, + ) + } +} + +fn resolve_node_and_meta_for_node_col< + 'a, + G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps, +>( + graph: &G, + node_col_resolved: &'a mut Vec, + node_type_resolved: &'a mut Vec, + df: &DFChunk, + src_col: &NodeCol, + node_type_col: LayerCol<'a>, +) -> Result<(&'a [VID], &'a [usize]), GraphError> { + node_col_resolved.resize_with(df.len(), Default::default); + node_type_resolved.resize_with(df.len(), Default::default); + + let mut locked_mapper = graph.node_meta().node_type_meta().write(); + + let zip = izip!( + src_col.iter(), + node_type_col.iter(), + node_col_resolved.iter_mut(), + node_type_resolved.iter_mut() + ); + + let mut last_node_type: Option<&str> = None; + for (gid, node_type, vid, node_type_id) in zip { + if last_node_type != node_type { + if let Some(name) = node_type { + *node_type_id = locked_mapper.get_or_create_id(name).inner(); + } + } + + let res_vid = graph + .resolve_node(gid.as_node_ref()) + .map_err(into_graph_err)?; + *vid = res_vid.inner(); + last_node_type = node_type; + } + + Ok((node_col_resolved.as_slice(), node_type_resolved.as_slice())) +} + +fn set_meta_for_pre_resolved_nodes_and_node_ids< + 'b, + G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps, +>( + graph: &G, + session: &::WS<'_>, + df: &'b DFChunk, + src_col: &NodeCol, + node_type_col: LayerCol<'_>, + node_type_index: usize, + node_id_col: usize, +) -> Result<(&'b [VID], &'b [usize]), GraphError> { + let srcs = df.chunk[node_id_col] + .as_primitive_opt::() + .ok_or_else(|| LoadError::InvalidNodeIdType(df.chunk[node_id_col].data_type().clone()))? + .values() + .as_ref(); + + let node_types = df.chunk[node_type_index] + .as_primitive_opt::() + .ok_or_else(|| LoadError::InvalidNodeType(df.chunk[node_type_index].data_type().clone()))? + .values() + .as_ref(); + + let mut locked_mapper = graph.node_meta().node_type_meta().write(); + + let zip = izip!( + src_col.iter(), + srcs.iter(), + node_type_col.iter(), + node_types.iter() + ); + + let mut last_node_type: Option<&str> = None; + + for (gid, node_id, node_type, node_type_id) in zip { + if last_node_type != node_type { + let node_type_name = node_type.unwrap_or("_default"); + locked_mapper.set_id(node_type_name, *node_type_id as usize); + } + last_node_type = node_type; + session + .set_node(gid, VID(*node_id as usize)) + .map_err(into_graph_err)?; + } + + Ok((bytemuck::cast_slice(srcs), bytemuck::cast_slice(node_types))) +} + +#[inline(never)] +fn store_node_ids_and_type>( + gid_str_cache: &FxDashMap, (VID, usize)>, + locked_page: &mut LockedNodePage<'_, NS>, +) { + for entry in gid_str_cache.iter() { + let (vid, node_type) = entry.value(); + let GidKey { gid, .. } = entry.key(); + + if let Some(src_pos) = locked_page.resolve_pos(*vid) { + let mut writer = locked_page.writer(); + writer.store_node_id_and_node_type(src_pos, 0, *gid, *node_type, 0); + } + } +} diff --git a/raphtory/src/io/arrow/layer_col.rs b/raphtory/src/io/arrow/layer_col.rs index 05fa5aed1c..963cb77691 100644 --- a/raphtory/src/io/arrow/layer_col.rs +++ b/raphtory/src/io/arrow/layer_col.rs @@ -1,3 +1,5 @@ +use std::borrow::Cow; + use crate::{ errors::{into_graph_err, GraphError, LoadError}, io::arrow::dataframe::DFChunk, @@ -61,28 +63,92 @@ impl<'a> LayerCol<'a> { } } - pub fn resolve( + pub fn get(&self, row: usize) -> Option<&'a str> { + match self { + LayerCol::Name { name, .. } => *name, + LayerCol::Utf8 { col } => { + if col.is_valid(row) && row < col.len() { + Some(col.value(row)) + } else { + None + } + } + LayerCol::LargeUtf8 { col } => { + if col.is_valid(row) && row < col.len() { + Some(col.value(row)) + } else { + None + } + } + LayerCol::Utf8View { col } => { + if col.is_valid(row) && row < col.len() { + Some(col.value(row)) + } else { + None + } + } + } + } + + pub fn resolve_layer<'b>( self, + layer_id_col: Option<&'b [u64]>, graph: &(impl AdditionOps + Send + Sync), - ) -> Result, GraphError> { - match self { - LayerCol::Name { name, len } => { + ) -> Result, GraphError> { + match (self, layer_id_col) { + (LayerCol::Name { name, len }, _) => { let layer = graph.resolve_layer(name).map_err(into_graph_err)?.inner(); - Ok(vec![layer; len]) + Ok(Cow::Owned(vec![layer; len])) } - col => { - let iter = col.par_iter(); - let mut res = vec![0usize; iter.len()]; - iter.zip(res.par_iter_mut()) - .try_for_each(|(layer, entry)| { - let layer = graph.resolve_layer(layer).map_err(into_graph_err)?.inner(); - *entry = layer; - Ok::<(), GraphError>(()) - })?; - Ok(res) + (col, None) => { + let mut res = vec![0usize; col.len()]; + let mut last_name = None; + for (row, name) in col.iter().enumerate() { + if last_name == name { + continue; + } + + let layer = graph.resolve_layer(name).map_err(into_graph_err)?.inner(); + res[row] = layer; + last_name = name; + } + Ok(Cow::Owned(res)) + } + (col, Some(layer_ids)) => { + let mut last_pair = None; + + let edge_layer_mapper = graph.edge_meta().layer_meta(); + let node_layer_mapper = graph.node_meta().layer_meta(); + + let mut locked_edge_lm = edge_layer_mapper.write(); + let mut locked_node_lm = node_layer_mapper.write(); + + for pair @ (name, id) in col + .iter() + .map(|name| name.unwrap_or("_default")) + .zip(layer_ids) + { + if let Some(last_pair) = last_pair { + if last_pair != pair { + locked_edge_lm.set_id(name, *id as usize); + locked_node_lm.set_id(name, *id as usize); + } + } + last_pair = Some(pair); + } + Ok(Cow::Borrowed(bytemuck::cast_slice(layer_ids))) } } } + + pub fn len(&self) -> usize { + match self { + LayerCol::Name { len, .. } => *len, + LayerCol::Utf8 { col } => col.len(), + LayerCol::LargeUtf8 { col } => col.len(), + LayerCol::Utf8View { col } => col.len(), + } + } } pub(crate) fn lift_layer_col<'a>( diff --git a/raphtory/src/io/arrow/mod.rs b/raphtory/src/io/arrow/mod.rs index 9186b21f31..b56aef26c2 100644 --- a/raphtory/src/io/arrow/mod.rs +++ b/raphtory/src/io/arrow/mod.rs @@ -9,7 +9,10 @@ mod test { use crate::{ io::arrow::{ dataframe::{DFChunk, DFView}, - df_loaders::*, + df_loaders::{ + edges::{load_edges_from_df, ColumnNames}, + nodes::load_nodes_from_df, + }, }, prelude::*, }; @@ -54,15 +57,12 @@ mod test { load_edges_from_df( df, - "time", - secondary_index, - "src", - "dst", + ColumnNames::new("time", secondary_index, "src", "dst", layer_col), + true, &["prop1", "prop2"], &[], None, layer_name, - layer_col, &graph, ) .expect("failed to load edges from pretend df"); @@ -161,6 +161,7 @@ mod test { Some("node_type"), None, &graph, + true, ) .expect("failed to load nodes from pretend df"); diff --git a/raphtory/src/io/arrow/node_col.rs b/raphtory/src/io/arrow/node_col.rs index 3a4c64ef56..5a07666d8a 100644 --- a/raphtory/src/io/arrow/node_col.rs +++ b/raphtory/src/io/arrow/node_col.rs @@ -228,6 +228,14 @@ impl NodeCol { pub fn dtype(&self) -> GidType { self.0.dtype() } + + pub fn len(&self) -> usize { + self.0.len() + } + + pub fn get(&self, i: usize) -> Option> { + self.0.get(i) + } } pub fn lift_node_col(index: usize, df: &DFChunk) -> Result { diff --git a/raphtory/src/io/parquet_loaders.rs b/raphtory/src/io/parquet_loaders.rs index e2251ba160..b616036383 100644 --- a/raphtory/src/io/parquet_loaders.rs +++ b/raphtory/src/io/parquet_loaders.rs @@ -1,7 +1,14 @@ use crate::{ db::api::view::StaticGraphViewOps, errors::{GraphError, InvalidPathReason::PathDoesNotExist}, - io::arrow::{dataframe::*, df_loaders::*}, + io::arrow::{ + dataframe::*, + df_loaders::{ + edges::{load_edges_from_df, ColumnNames}, + nodes::{load_node_props_from_df, load_nodes_from_df}, + *, + }, + }, prelude::{AdditionOps, DeletionOps, PropertyAdditionOps}, }; use parquet::arrow::{arrow_reader::ParquetRecordBatchReaderBuilder, ProjectionMask}; @@ -27,6 +34,7 @@ pub fn load_nodes_from_parquet< metadata: &[&str], shared_metadata: Option<&HashMap>, batch_size: Option, + resolve_nodes: bool, ) -> Result<(), GraphError> { let mut cols_to_check = vec![id, time]; @@ -55,6 +63,7 @@ pub fn load_nodes_from_parquet< node_type, node_type_col, graph, + resolve_nodes, ) .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; } @@ -65,19 +74,31 @@ pub fn load_nodes_from_parquet< pub fn load_edges_from_parquet( graph: &G, parquet_path: impl AsRef, - time: &str, - secondary_index: Option<&str>, - src: &str, - dst: &str, + column_names: ColumnNames, + resolve_nodes: bool, properties: &[&str], metadata: &[&str], shared_metadata: Option<&HashMap>, layer: Option<&str>, - layer_col: Option<&str>, batch_size: Option, ) -> Result<(), GraphError> { + let ColumnNames { + time, + secondary_index, + src, + dst, + layer_col, + layer_id_col, + edge_id, + } = column_names; + let parquet_path = parquet_path.as_ref(); - let mut cols_to_check = vec![src, dst, time]; + let mut cols_to_check = [src, dst, time] + .into_iter() + .chain(layer_id_col) + .chain(edge_id) + .collect::>(); + cols_to_check.extend_from_slice(properties); cols_to_check.extend_from_slice(metadata); @@ -127,15 +148,12 @@ pub fn load_edges_from_parquet, node_type_col: Option<&str>, + node_id_col: Option<&str>, // for inner parquet use only + node_type_id_col: Option<&str>, // for inner parquet use only metadata_properties: &[&str], shared_metadata: Option<&HashMap>, batch_size: Option, ) -> Result<(), GraphError> { - let mut cols_to_check = vec![id]; - cols_to_check.extend_from_slice(metadata_properties); + let mut cols_to_check = std::iter::once(id) + .chain(node_type_id_col) + .chain(node_type_col) + .chain(node_id_col) + .collect::>(); - if let Some(ref node_type_col) = node_type_col { - cols_to_check.push(node_type_col.as_ref()); - } + cols_to_check.extend_from_slice(metadata_properties); for path in get_parquet_file_paths(parquet_path)? { let df_view = process_parquet_file_to_df(path.as_path(), Some(&cols_to_check), batch_size)?; @@ -171,6 +192,8 @@ pub fn load_node_props_from_parquet< id, node_type, node_type_col, + node_id_col, + node_type_id_col, metadata_properties, shared_metadata, graph, @@ -191,6 +214,7 @@ pub fn load_edge_props_from_parquet, layer_col: Option<&str>, batch_size: Option, + resolve_nodes: bool, ) -> Result<(), GraphError> { let mut cols_to_check = vec![src, dst]; if let Some(ref layer_col) = layer_col { @@ -211,6 +235,7 @@ pub fn load_edge_props_from_parquet( df: &Bound<'a, PyAny>, col_names: Vec<&str>, -) -> PyResult> + 'a>> { +) -> PyResult> + Send + 'a>> { let py = df.py(); is_jupyter(py); py.import("pandas")?; @@ -245,19 +261,28 @@ pub(crate) fn process_pandas_py_df<'a>( .collect(); let names_len = names.len(); - let chunks = rb.into_iter().map(move |rb| { - let chunk = (0..names_len) - .map(|i| { - let array = rb.call_method1("column", (i,)).map_err(GraphError::from)?; - let arr = array_to_rust(&array).map_err(GraphError::from)?; - Ok::<_, GraphError>(arr) - }) - .collect::, GraphError>>()?; - Ok(DFChunk { chunk }) - }); + // Convert all Python batches to Rust Arrow arrays while we have the GIL + // This makes the iterator Send-safe + let rust_batches: Vec> = rb + .into_iter() + .map(|rb| { + let chunk = (0..names_len) + .map(|i| { + let array = rb.call_method1("column", (i,)).map_err(GraphError::from)?; + let arr = array_to_rust(&array).map_err(GraphError::from)?; + Ok::<_, GraphError>(arr) + }) + .collect::, GraphError>>()?; + + Ok(DFChunk { chunk }) + }) + .collect(); + let num_rows: usize = dropped_df.call_method0("__len__")?.extract()?; + let chunks = rust_batches.into_iter(); + Ok(DFView { names, chunks, diff --git a/raphtory/src/python/packages/algorithms.rs b/raphtory/src/python/packages/algorithms.rs index 13bbb90019..e48b0e51fc 100644 --- a/raphtory/src/python/packages/algorithms.rs +++ b/raphtory/src/python/packages/algorithms.rs @@ -69,6 +69,7 @@ use crate::{ utils::{PyNodeRef, PyTime}, }, }; +use either::Either; use pyo3::{prelude::*, types::PyList}; use rand::{prelude::StdRng, SeedableRng}; use raphtory_api::core::Direction; @@ -772,9 +773,9 @@ pub fn k_core( ) -> Nodes<'static, DynamicGraph> { let v_set = k_core_set(&graph.graph, k, iter_count, threads); let index = if v_set.len() == graph.graph.unfiltered_num_nodes() { - None + Index::for_graph(graph.graph.clone()) } else { - Some(Index::from_iter(v_set)) + Index::from_iter(v_set) }; Nodes::new_filtered(graph.graph.clone(), graph.graph.clone(), index, None) } diff --git a/raphtory/src/serialise/graph_folder.rs b/raphtory/src/serialise/graph_folder.rs index 1d1c2913a4..9f66ba9451 100644 --- a/raphtory/src/serialise/graph_folder.rs +++ b/raphtory/src/serialise/graph_folder.rs @@ -317,6 +317,7 @@ mod tests { prelude::{AdditionOps, Graph, Prop, StableEncode, NO_PROPS}, }; use raphtory_api::{core::utils::logging::global_info_logger, GraphType}; + use raphtory_storage::core_ops::CoreGraphOps; /// Verify that the metadata is re-created if it does not exist. #[test] @@ -500,6 +501,12 @@ mod tests { .add_edge(4, 1, 3, [("test prop 4", true)], None) .unwrap(); + graph + .node(1) + .unwrap() + .add_updates(5, [("test node prop", 5i32)]) + .unwrap(); + let temp_folder = tempfile::TempDir::new().unwrap(); let folder = temp_folder.path().join("graph"); let graph_folder = GraphFolder::from(&folder); diff --git a/raphtory/src/serialise/parquet/edges.rs b/raphtory/src/serialise/parquet/edges.rs index ec631af7fb..48b1c44c21 100644 --- a/raphtory/src/serialise/parquet/edges.rs +++ b/raphtory/src/serialise/parquet/edges.rs @@ -22,18 +22,19 @@ pub(crate) fn encode_edge_tprop( g.edges().segmented_par_iter(), path, EDGES_T_PATH, - |id_type| { + |_| { vec![ Field::new(TIME_COL, DataType::Int64, false), Field::new(SECONDARY_INDEX_COL, DataType::UInt64, true), - Field::new(SRC_COL, id_type.clone(), false), - Field::new(DST_COL, id_type.clone(), false), + Field::new(SRC_COL_ID, DataType::UInt64, false), + Field::new(DST_COL_ID, DataType::UInt64, false), + Field::new(EDGE_COL_ID, DataType::UInt64, false), Field::new(LAYER_COL, DataType::Utf8, true), + Field::new(LAYER_ID_COL, DataType::UInt64, true), ] }, |edges, g, decoder, writer| { let row_group_size = 100_000; - let edges = edges.collect::>(); for edge_rows in edges .into_iter() @@ -67,12 +68,13 @@ pub(crate) fn encode_edge_deletions( g.edges().segmented_par_iter(), path, EDGES_D_PATH, - |id_type| { + |_| { vec![ Field::new(TIME_COL, DataType::Int64, false), Field::new(SECONDARY_INDEX_COL, DataType::UInt64, true), - Field::new(SRC_COL, id_type.clone(), false), - Field::new(DST_COL, id_type.clone(), false), + Field::new(SRC_COL_ID, DataType::UInt64, false), + Field::new(DST_COL_ID, DataType::UInt64, false), + Field::new(EDGE_COL_ID, DataType::UInt64, false), Field::new(LAYER_COL, DataType::Utf8, true), ] }, @@ -128,10 +130,11 @@ pub(crate) fn encode_edge_cprop( g.edges().segmented_par_iter(), path, EDGES_C_PATH, - |id_type| { + |_| { vec![ - Field::new(SRC_COL, id_type.clone(), false), - Field::new(DST_COL, id_type.clone(), false), + Field::new(SRC_COL_ID, DataType::UInt64, false), + Field::new(DST_COL_ID, DataType::UInt64, false), + Field::new(EDGE_COL_ID, DataType::UInt64, false), Field::new(LAYER_COL, DataType::Utf8, true), ] }, diff --git a/raphtory/src/serialise/parquet/mod.rs b/raphtory/src/serialise/parquet/mod.rs index 93eb4740fb..2cd1e29710 100644 --- a/raphtory/src/serialise/parquet/mod.rs +++ b/raphtory/src/serialise/parquet/mod.rs @@ -1,12 +1,19 @@ use crate::{ db::{ - api::{storage::storage::Storage, view::MaterializedGraph}, + api::{ + storage::storage::Storage, + view::{internal::InternalStorageOps, MaterializedGraph}, + }, graph::views::deletion_graph::PersistentGraph, }, errors::GraphError, - io::parquet_loaders::{ - load_edge_deletions_from_parquet, load_edge_props_from_parquet, load_edges_from_parquet, - load_graph_props_from_parquet, load_node_props_from_parquet, load_nodes_from_parquet, + io::{ + arrow::df_loaders::edges::ColumnNames, + parquet_loaders::{ + load_edge_deletions_from_parquet, load_edge_props_from_parquet, + load_edges_from_parquet, load_graph_props_from_parquet, load_node_props_from_parquet, + load_nodes_from_parquet, + }, }, prelude::*, serialise::{ @@ -181,12 +188,16 @@ pub trait ParquetDecoder: Sized { } const NODE_ID_COL: &str = "rap_node_id"; +const NODE_VID_COL: &str = "rap_node_vid"; const TYPE_COL: &str = "rap_node_type"; +const TYPE_ID_COL: &str = "rap_node_type_id"; const TIME_COL: &str = "rap_time"; const SECONDARY_INDEX_COL: &str = "rap_secondary_index"; -const SRC_COL: &str = "rap_src"; -const DST_COL: &str = "rap_dst"; +const SRC_COL_ID: &str = "rap_src_id"; +const DST_COL_ID: &str = "rap_dst_id"; +const EDGE_COL_ID: &str = "rap_edge_id"; const LAYER_COL: &str = "rap_layer"; +const LAYER_ID_COL: &str = "rap_layer_id"; const EDGES_T_PATH: &str = "edges_t"; const EDGES_D_PATH: &str = "edges_d"; // deletions const EDGES_C_PATH: &str = "edges_c"; @@ -478,57 +489,68 @@ fn decode_graph_storage( )?; } - let t_node_path = path.as_ref().join(NODES_T_PATH); + let c_node_path = path.as_ref().join(NODES_C_PATH); - if std::fs::exists(&t_node_path)? { - let exclude = vec![NODE_ID_COL, TIME_COL, SECONDARY_INDEX_COL, TYPE_COL]; - let (t_prop_columns, _) = collect_prop_columns(&t_node_path, &exclude)?; - let t_prop_columns = t_prop_columns + if std::fs::exists(&c_node_path)? { + let exclude = vec![NODE_ID_COL, NODE_VID_COL, TYPE_COL, TYPE_ID_COL]; + let (c_prop_columns, _) = collect_prop_columns(&c_node_path, &exclude)?; + let c_prop_columns = c_prop_columns .iter() .map(|s| s.as_str()) .collect::>(); - load_nodes_from_parquet( + load_node_props_from_parquet( &graph, - &t_node_path, - TIME_COL, - Some(SECONDARY_INDEX_COL), + &c_node_path, NODE_ID_COL, None, Some(TYPE_COL), - &t_prop_columns, - &[], + Some(NODE_VID_COL), + Some(TYPE_ID_COL), + &c_prop_columns, None, batch_size, )?; } - let c_node_path = path.as_ref().join(NODES_C_PATH); + let t_node_path = path.as_ref().join(NODES_T_PATH); - if std::fs::exists(&c_node_path)? { - let exclude = vec![NODE_ID_COL, TYPE_COL]; - let (c_prop_columns, _) = collect_prop_columns(&c_node_path, &exclude)?; - let c_prop_columns = c_prop_columns + if std::fs::exists(&t_node_path)? { + let exclude = vec![NODE_VID_COL, TIME_COL, SECONDARY_INDEX_COL]; + let (t_prop_columns, _) = collect_prop_columns(&t_node_path, &exclude)?; + let t_prop_columns = t_prop_columns .iter() .map(|s| s.as_str()) .collect::>(); - load_node_props_from_parquet( + load_nodes_from_parquet( &graph, - &c_node_path, - NODE_ID_COL, + &t_node_path, + TIME_COL, + Some(SECONDARY_INDEX_COL), + NODE_VID_COL, None, - Some(TYPE_COL), - &c_prop_columns, + None, + &t_prop_columns, + &[], None, batch_size, + false, )?; } let t_edge_path = path.as_ref().join(EDGES_T_PATH); if std::fs::exists(&t_edge_path)? { - let exclude = vec![TIME_COL, SECONDARY_INDEX_COL, SRC_COL, DST_COL, LAYER_COL]; + let exclude = vec![ + TIME_COL, + SECONDARY_INDEX_COL, + SRC_COL_ID, + DST_COL_ID, + LAYER_COL, + LAYER_ID_COL, + EDGE_COL_ID, + ]; let (t_prop_columns, _) = collect_prop_columns(&t_edge_path, &exclude)?; let t_prop_columns = t_prop_columns .iter() @@ -538,31 +560,20 @@ fn decode_graph_storage( load_edges_from_parquet( &graph, &t_edge_path, - TIME_COL, - Some(SECONDARY_INDEX_COL), - SRC_COL, - DST_COL, + ColumnNames::new( + TIME_COL, + Some(SECONDARY_INDEX_COL), + SRC_COL_ID, + DST_COL_ID, + Some(LAYER_COL), + ) + .with_layer_id_col(LAYER_ID_COL) + .with_edge_id_col(EDGE_COL_ID), + false, &t_prop_columns, &[], None, None, - Some(LAYER_COL), - batch_size, - )?; - } - - let d_edge_path = path.as_ref().join(EDGES_D_PATH); - - if std::fs::exists(&d_edge_path)? { - load_edge_deletions_from_parquet( - graph.core_graph(), - &d_edge_path, - TIME_COL, - Some(SECONDARY_INDEX_COL), - SRC_COL, - DST_COL, - None, - Some(LAYER_COL), batch_size, )?; } @@ -570,7 +581,7 @@ fn decode_graph_storage( let c_edge_path = path.as_ref().join(EDGES_C_PATH); if std::fs::exists(&c_edge_path)? { - let exclude = vec![SRC_COL, DST_COL, LAYER_COL]; + let exclude = vec![SRC_COL_ID, DST_COL_ID, LAYER_COL, EDGE_COL_ID]; let (c_prop_columns, _) = collect_prop_columns(&c_edge_path, &exclude)?; let metadata = c_prop_columns .iter() @@ -580,13 +591,30 @@ fn decode_graph_storage( load_edge_props_from_parquet( &graph, &c_edge_path, - SRC_COL, - DST_COL, + SRC_COL_ID, + DST_COL_ID, &metadata, None, None, Some(LAYER_COL), batch_size, + false, + )?; + } + + let d_edge_path = path.as_ref().join(EDGES_D_PATH); + + if std::fs::exists(&d_edge_path)? { + load_edge_deletions_from_parquet( + graph.core_graph(), + &d_edge_path, + TIME_COL, + Some(SECONDARY_INDEX_COL), + SRC_COL_ID, + DST_COL_ID, + None, + Some(LAYER_COL), + batch_size, )?; } diff --git a/raphtory/src/serialise/parquet/model.rs b/raphtory/src/serialise/parquet/model.rs index a34b661c25..b5772767d7 100644 --- a/raphtory/src/serialise/parquet/model.rs +++ b/raphtory/src/serialise/parquet/model.rs @@ -1,12 +1,13 @@ -use super::{ - Prop, DST_COL, LAYER_COL, NODE_ID_COL, SECONDARY_INDEX_COL, SRC_COL, TIME_COL, TYPE_COL, -}; +use super::{Prop, LAYER_COL, NODE_ID_COL, SECONDARY_INDEX_COL, TIME_COL, TYPE_COL}; use crate::{ db::{ api::view::StaticGraphViewOps, graph::{edge::EdgeView, node::NodeView}, }, prelude::*, + serialise::parquet::{ + DST_COL_ID, EDGE_COL_ID, LAYER_ID_COL, NODE_VID_COL, SRC_COL_ID, TYPE_ID_COL, + }, }; use arrow::datatypes::DataType; use raphtory_api::core::{ @@ -52,11 +53,18 @@ impl<'a, G: StaticGraphViewOps> Serialize for ParquetTEdge<'a, G> { .layer_name() .map_err(|_| S::Error::custom("Edge has no layer"))?; + let layer_id = edge + .edge + .layer() + .ok_or_else(|| S::Error::custom("Edge has no layer"))?; + state.serialize_entry(TIME_COL, &t.0)?; state.serialize_entry(SECONDARY_INDEX_COL, &t.1)?; - state.serialize_entry(SRC_COL, &ParquetGID(edge.src().id()))?; - state.serialize_entry(DST_COL, &ParquetGID(edge.dst().id()))?; + state.serialize_entry(SRC_COL_ID, &edge.src().node.0)?; + state.serialize_entry(DST_COL_ID, &edge.dst().node.0)?; + state.serialize_entry(EDGE_COL_ID, &edge.edge.pid())?; state.serialize_entry(LAYER_COL, &layer)?; + state.serialize_entry(LAYER_ID_COL, &layer_id)?; for (name, prop) in edge.properties().temporal().iter_latest() { state.serialize_entry(&name, &SerdeProp(&prop))?; @@ -80,8 +88,9 @@ impl<'a, G: StaticGraphViewOps> Serialize for ParquetCEdge<'a, G> { .layer_name() .map_err(|_| S::Error::custom("Edge has no layer"))?; - state.serialize_entry(SRC_COL, &ParquetGID(edge.src().id()))?; - state.serialize_entry(DST_COL, &ParquetGID(edge.dst().id()))?; + state.serialize_entry(SRC_COL_ID, &(edge.src().node.0))?; + state.serialize_entry(DST_COL_ID, &(edge.dst().node.0))?; + state.serialize_entry(EDGE_COL_ID, &(edge.edge.pid().0))?; state.serialize_entry(LAYER_COL, &layer)?; for (name, prop) in edge.metadata().iter_filtered() { @@ -108,8 +117,9 @@ impl<'a, G: StaticGraphViewOps> Serialize for ParquetDelEdge<'a, G> { state.serialize_entry(TIME_COL, &self.del.0)?; state.serialize_entry(SECONDARY_INDEX_COL, &self.del.1)?; - state.serialize_entry(SRC_COL, &ParquetGID(edge.src().id()))?; - state.serialize_entry(DST_COL, &ParquetGID(edge.dst().id()))?; + state.serialize_entry(SRC_COL_ID, &(edge.src().node.0))?; + state.serialize_entry(DST_COL_ID, &(edge.dst().node.0))?; + state.serialize_entry(EDGE_COL_ID, &(edge.edge.pid().0))?; state.serialize_entry(LAYER_COL, &self.layer)?; state.end() @@ -130,10 +140,9 @@ impl<'a> Serialize for ParquetTNode<'a> { { let mut state = serializer.serialize_map(None)?; - state.serialize_entry(NODE_ID_COL, &ParquetGID(self.node.id()))?; + state.serialize_entry(NODE_VID_COL, &self.node.node.0)?; state.serialize_entry(TIME_COL, &self.t.0)?; state.serialize_entry(SECONDARY_INDEX_COL, &self.t.1)?; - state.serialize_entry(TYPE_COL, &self.node.node_type())?; for (name, prop) in self.props.iter() { state.serialize_entry(&self.cols[*name], &SerdeProp(prop))?; @@ -155,7 +164,9 @@ impl<'a> Serialize for ParquetCNode<'a> { let mut state = serializer.serialize_map(None)?; state.serialize_entry(NODE_ID_COL, &ParquetGID(self.node.id()))?; + state.serialize_entry(NODE_VID_COL, &self.node.node.0)?; state.serialize_entry(TYPE_COL, &self.node.node_type())?; + state.serialize_entry(TYPE_ID_COL, &self.node.node_type_id())?; for (name, prop) in self.node.metadata().iter_filtered() { state.serialize_entry(&name, &SerdeProp(&prop))?; diff --git a/raphtory/src/serialise/parquet/nodes.rs b/raphtory/src/serialise/parquet/nodes.rs index 4669b5b9fc..d15764b19f 100644 --- a/raphtory/src/serialise/parquet/nodes.rs +++ b/raphtory/src/serialise/parquet/nodes.rs @@ -4,13 +4,13 @@ use crate::{ errors::GraphError, serialise::parquet::{ model::{ParquetCNode, ParquetTNode}, - run_encode, NODES_C_PATH, NODES_T_PATH, NODE_ID_COL, SECONDARY_INDEX_COL, TIME_COL, - TYPE_COL, + run_encode_indexed, NODES_C_PATH, NODES_T_PATH, NODE_ID_COL, NODE_VID_COL, + SECONDARY_INDEX_COL, TIME_COL, TYPE_COL, TYPE_ID_COL, }, }; use arrow::datatypes::{DataType, Field}; use itertools::Itertools; -use raphtory_api::{core::entities::VID, iter::IntoDynBoxed}; +use raphtory_api::iter::IntoDynBoxed; use raphtory_storage::graph::graph::GraphStorage; use std::path::Path; @@ -18,28 +18,28 @@ pub(crate) fn encode_nodes_tprop( g: &GraphStorage, path: impl AsRef, ) -> Result<(), GraphError> { - run_encode( + run_encode_indexed( g, g.node_meta().temporal_prop_mapper(), - g.unfiltered_num_nodes(), + g.nodes().row_groups_par_iter(), path, NODES_T_PATH, - |id_type| { + |_| { vec![ - Field::new(NODE_ID_COL, id_type.clone(), false), + Field::new(NODE_VID_COL, DataType::UInt64, false), Field::new(TIME_COL, DataType::Int64, false), Field::new(SECONDARY_INDEX_COL, DataType::UInt64, true), - Field::new(TYPE_COL, DataType::Utf8, true), ] }, |nodes, g, decoder, writer| { let row_group_size = 100_000; + let nodes = nodes.collect::>(); + + let nodes = nodes.into_iter(); let cols = g.node_meta().temporal_prop_mapper().all_keys(); let cols = &cols; for node_rows in nodes - .into_iter() - .map(VID) .map(|vid| NodeView::new_internal(g, vid)) .flat_map(move |node| { GenLockedIter::from(node, |node| { @@ -72,24 +72,24 @@ pub(crate) fn encode_nodes_cprop( g: &GraphStorage, path: impl AsRef, ) -> Result<(), GraphError> { - run_encode( + run_encode_indexed( g, g.node_meta().metadata_mapper(), - g.unfiltered_num_nodes(), + g.nodes().row_groups_par_iter(), path, NODES_C_PATH, |id_type| { vec![ Field::new(NODE_ID_COL, id_type.clone(), false), + Field::new(NODE_VID_COL, DataType::UInt64, false), Field::new(TYPE_COL, DataType::Utf8, true), + Field::new(TYPE_ID_COL, DataType::UInt64, true), ] }, |nodes, g, decoder, writer| { let row_group_size = 100_000; for node_rows in nodes - .into_iter() - .map(VID) .map(|vid| NodeView::new_internal(g, vid)) .map(move |node| ParquetCNode { node }) .chunks(row_group_size) diff --git a/raphtory/src/serialise/serialise.rs b/raphtory/src/serialise/serialise.rs index 88ff739193..4ccb8d60dd 100644 --- a/raphtory/src/serialise/serialise.rs +++ b/raphtory/src/serialise/serialise.rs @@ -93,15 +93,14 @@ impl StableDecode for T { path: impl Into, path_for_decoded_graph: Option<&Path>, ) -> Result { - let graph; let folder: GraphFolder = path.into(); - if folder.is_zip() { - let reader = std::fs::File::open(&folder.get_base_path())?; - graph = Self::decode_parquet_from_zip(reader, path_for_decoded_graph)?; + let graph = if folder.is_zip() { + let reader = std::fs::File::open(folder.get_base_path())?; + Self::decode_parquet_from_zip(reader, path_for_decoded_graph)? } else { - graph = Self::decode_parquet(&folder.get_graph_path(), path_for_decoded_graph)?; - } + Self::decode_parquet(folder.get_graph_path(), path_for_decoded_graph)? + }; #[cfg(feature = "search")] graph.load_index(&folder)?; diff --git a/raphtory/tests/df_loaders.rs b/raphtory/tests/df_loaders.rs index 5066aa004d..185ea3d0cb 100644 --- a/raphtory/tests/df_loaders.rs +++ b/raphtory/tests/df_loaders.rs @@ -10,7 +10,10 @@ mod io_tests { errors::GraphError, io::arrow::{ dataframe::{DFChunk, DFView}, - df_loaders::{load_edges_from_df, load_nodes_from_df}, + df_loaders::{ + edges::{load_edges_from_df, ColumnNames}, + nodes::load_nodes_from_df, + }, }, prelude::*, test_utils::{build_edge_list, build_edge_list_str, build_edge_list_with_secondary_index}, @@ -208,7 +211,10 @@ mod io_tests { let g = Graph::new(); let props = ["str_prop", "int_prop"]; let secondary_index = None; - load_edges_from_df(df_view, "time", secondary_index,"src", "dst", &props, &[], None, None, None, &g).unwrap(); + load_edges_from_df(df_view, + ColumnNames::new("time", secondary_index, "src", "dst", None), + true, + &props, &[], None, None, &g).unwrap(); let g2 = Graph::new(); @@ -243,15 +249,12 @@ mod io_tests { load_edges_from_df( df_view, - "time", - secondary_index, - "src", - "dst", + ColumnNames::new("time", secondary_index, "src", "dst", None), + true, &props, &[], None, None, - None, &g, ) .unwrap(); @@ -285,8 +288,7 @@ mod io_tests { let df_view = build_df_str(chunk_size, &edges); let g = Graph::new(); let props = ["str_prop", "int_prop"]; - let secondary_index = None; - load_edges_from_df(df_view, "time", secondary_index, "src", "dst", &props, &[], None, None, None, &g).unwrap(); + load_edges_from_df(df_view, ColumnNames::new("time", None, "src", "dst", None), true, &props, &[], None, None, &g).unwrap(); let g2 = Graph::new(); @@ -306,19 +308,14 @@ mod io_tests { let df_view = build_df_str(1, &edges); let g = Graph::new(); let props = ["str_prop", "int_prop"]; - let secondary_index = None; - load_edges_from_df( df_view, - "time", - secondary_index, - "src", - "dst", + ColumnNames::new("time", None, "src", "dst", None), + true, &props, &[], None, None, - None, &g, ) .unwrap(); @@ -351,15 +348,12 @@ mod io_tests { // Load edges from DataFrame with secondary_index load_edges_from_df( df_view, - "time", - secondary_index, - "src", - "dst", + ColumnNames::new("time", secondary_index, "src", "dst", None), + true, &props, &[], None, None, - None, &g, ) .unwrap(); @@ -412,15 +406,12 @@ mod io_tests { load_edges_from_df( df_view, - "time", - secondary_index, - "src", - "dst", + ColumnNames::new("time", secondary_index, "src", "dst", None), + true, &props, &[], None, None, - None, &g, ).unwrap(); @@ -500,6 +491,7 @@ mod io_tests { None, None, &g, + true, ) .unwrap(); @@ -562,15 +554,12 @@ mod io_tests { let secondary_index = None; load_edges_from_df( df_view, - "time", - secondary_index, - "src", - "dst", + ColumnNames::new("time", secondary_index, "src", "dst", None), + true, &props, &[], None, layer.as_deref(), - None, &g, ) .unwrap();