From 9b2748e8c8d9bdb80f7628fe1bc8dc69eb362440 Mon Sep 17 00:00:00 2001 From: Fabian Murariu Date: Fri, 28 Nov 2025 18:09:58 +0000 Subject: [PATCH 01/24] remove unused --- Cargo.toml | 2 +- db4-storage/src/pages/edge_page/writer.rs | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 14cd14d775..936043adb6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -57,7 +57,7 @@ raphtory-core = { version = "0.16.2", path = "raphtory-core", default-features = raphtory-graphql = { version = "0.16.2", path = "raphtory-graphql", default-features = false } raphtory-storage = { version = "0.16.2", path = "raphtory-storage", default-features = false } async-graphql = { version = "7.0.16", features = ["dynamic-schema"] } -bincode = "1.3.3" +bincode = {version = "2", features = ["serde"]} async-graphql-poem = "7.0.16" dynamic-graphql = "0.10.1" derive_more = "2.0.1" diff --git a/db4-storage/src/pages/edge_page/writer.rs b/db4-storage/src/pages/edge_page/writer.rs index cde4d7fdb3..ab9a2a4670 100644 --- a/db4-storage/src/pages/edge_page/writer.rs +++ b/db4-storage/src/pages/edge_page/writer.rs @@ -2,10 +2,8 @@ use crate::{ LocalPOS, api::edges::EdgeSegmentOps, error::StorageError, pages::layer_counter::GraphStats, segments::edge::segment::MemEdgeSegment, }; -use arrow_array::{ArrayRef, BooleanArray}; use raphtory_api::core::entities::{VID, properties::prop::Prop}; use raphtory_core::{ - entities::EID, storage::timeindex::{AsTime, TimeIndexEntry}, }; use std::ops::DerefMut; From 7bde59cca95f29ef41781bcc9d1481de87252095 Mon Sep 17 00:00:00 2001 From: Fabian Murariu Date: Tue, 2 Dec 2025 13:11:37 +0000 Subject: [PATCH 02/24] rename pages to segments and refactor some APIs for counting # Conflicts: # db4-graph/src/lib.rs # db4-storage/src/pages/locked/nodes.rs --- Cargo.lock | 38 ++- db4-graph/src/lib.rs | 4 - db4-storage/src/api/edges.rs | 6 +- db4-storage/src/api/nodes.rs | 27 +- db4-storage/src/pages/layer_counter.rs | 4 + 
db4-storage/src/pages/locked/nodes.rs | 6 +- db4-storage/src/pages/node_page/writer.rs | 6 +- db4-storage/src/pages/node_store.rs | 155 +++++++++-- db4-storage/src/segments/node/segment.rs | 26 +- .../entities/properties/prop/prop_enum.rs | 12 +- .../src/entities/properties/tcell.rs | 2 +- raphtory-core/src/lib.rs | 16 -- raphtory-core/src/storage/lazy_vec.rs | 2 +- .../src/mutation/addition_ops_ext.rs | 10 +- raphtory/src/db/api/view/graph.rs | 2 +- raphtory/src/io/arrow/df_loaders.rs | 260 ++++++++++-------- 16 files changed, 353 insertions(+), 223 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5e55b0c231..6f9620bdcb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -768,6 +768,26 @@ dependencies = [ "serde", ] +[[package]] +name = "bincode" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36eaf5d7b090263e8150820482d5d93cd964a81e4019913c972f4edcc6edb740" +dependencies = [ + "bincode_derive", + "serde", + "unty", +] + +[[package]] +name = "bincode_derive" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf95709a440f45e986983918d0e8a1f30a9b1df04918fc828670606804ac3c09" +dependencies = [ + "virtue", +] + [[package]] name = "bit-set" version = "0.8.0" @@ -2114,7 +2134,7 @@ dependencies = [ "arrow-csv", "arrow-schema", "bigdecimal", - "bincode", + "bincode 2.0.1", "bitvec", "boxcar", "bytemuck", @@ -2884,7 +2904,7 @@ version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c255bdf46e07fb840d120a36dcc81f385140d7191c76a7391672675c01a55d" dependencies = [ - "bincode", + "bincode 1.3.3", "byteorder", "heed-traits", "serde", @@ -5066,7 +5086,7 @@ dependencies = [ "async-openai", "async-trait", "bigdecimal", - "bincode", + "bincode 2.0.1", "bytemuck", "bzip2 0.4.4", "chrono", @@ -6992,6 +7012,12 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" +[[package]] +name = "unty" +version = "0.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d49784317cd0d1ee7ec5c716dd598ec5b4483ea832a2dced265471cc0f690ae" + [[package]] name = "url" version = "2.5.7" @@ -7052,6 +7078,12 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "virtue" +version = "0.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "051eb1abcf10076295e815102942cc58f9d5e3b4560e46e53c21e8ff6f3af7b1" + [[package]] name = "wait-timeout" version = "0.2.1" diff --git a/db4-graph/src/lib.rs b/db4-graph/src/lib.rs index 7b7e8b7fa2..382ba1f688 100644 --- a/db4-graph/src/lib.rs +++ b/db4-graph/src/lib.rs @@ -35,7 +35,6 @@ use tempfile::TempDir; pub struct TemporalGraph { // mapping between logical and physical ids pub logical_to_physical: Arc, - pub node_count: AtomicUsize, storage: Arc>, graph_dir: Option, pub transaction_manager: Arc, @@ -152,14 +151,12 @@ impl, ES = ES, GS = GS>> Temporal let gid_resolver_dir = path.join("gid_resolver"); let resolver = GIDResolver::new_with_path(&gid_resolver_dir)?; - let node_count = AtomicUsize::new(storage.nodes().num_nodes()); let wal_dir = path.join("wal"); let wal = Arc::new(WalImpl::new(Some(wal_dir))?); Ok(Self { graph_dir: Some(path.into()), logical_to_physical: resolver.into(), - node_count, storage: Arc::new(storage), transaction_manager: Arc::new(TransactionManager::new(wal.clone())), wal, @@ -205,7 +202,6 @@ impl, ES = ES, GS = GS>> Temporal Ok(Self { graph_dir, logical_to_physical, - node_count: AtomicUsize::new(0), storage: Arc::new(storage), transaction_manager: Arc::new(TransactionManager::new(wal.clone())), wal, diff --git a/db4-storage/src/api/edges.rs b/db4-storage/src/api/edges.rs index 61136444cd..96a810db34 100644 --- 
a/db4-storage/src/api/edges.rs +++ b/db4-storage/src/api/edges.rs @@ -27,6 +27,7 @@ pub trait EdgeSegmentOps: Send + Sync + std::fmt::Debug + 'static { fn t_len(&self) -> usize; fn num_layers(&self) -> usize; + // Persistent layer count, not used for up to date counts fn layer_count(&self, layer_id: usize) -> u32; fn load( @@ -67,7 +68,10 @@ pub trait EdgeSegmentOps: Send + Sync + std::fmt::Debug + 'static { head_lock: impl DerefMut, ) -> Result<(), StorageError>; - fn increment_num_edges(&self) -> u32; + fn increment_num_edges(&self) -> u32 { + self.edges_counter() + .fetch_add(1, std::sync::atomic::Ordering::Relaxed) + } fn contains_edge( &self, diff --git a/db4-storage/src/api/nodes.rs b/db4-storage/src/api/nodes.rs index ebea776c8a..c674c1bce9 100644 --- a/db4-storage/src/api/nodes.rs +++ b/db4-storage/src/api/nodes.rs @@ -21,7 +21,7 @@ use std::{ borrow::Cow, ops::{Deref, DerefMut, Range}, path::{Path, PathBuf}, - sync::Arc, + sync::{Arc, atomic::AtomicU32}, }; use crate::{ @@ -47,12 +47,6 @@ pub trait NodeSegmentOps: Send + Sync + std::fmt::Debug + 'static { fn t_len(&self) -> usize; - fn event_id(&self) -> i64; - - fn increment_event_id(&self, i: i64); - - fn decrement_event_id(&self) -> i64; - fn load( page_id: usize, node_meta: Arc, @@ -81,14 +75,6 @@ pub trait NodeSegmentOps: Send + Sync + std::fmt::Debug + 'static { fn try_head_mut(&self) -> Option>; - fn num_nodes(&self) -> u32 { - self.layer_count(0) - } - - fn num_layers(&self) -> usize; - - fn layer_count(&self, layer_id: usize) -> u32; - fn notify_write( &self, head_lock: impl DerefMut, @@ -128,6 +114,17 @@ pub trait NodeSegmentOps: Send + Sync + std::fmt::Debug + 'static { &self, locked_head: impl DerefMut, ) -> Result<(), StorageError>; + + fn nodes_counter(&self) -> &AtomicU32; + + fn num_nodes(&self) -> u32 { + self.nodes_counter() + .load(std::sync::atomic::Ordering::Relaxed) + } + + fn num_layers(&self) -> usize; + + fn layer_count(&self, layer_id: usize) -> u32; } pub trait LockedNSSegment: 
std::fmt::Debug + Send + Sync { diff --git a/db4-storage/src/pages/layer_counter.rs b/db4-storage/src/pages/layer_counter.rs index b3865ba0b1..5574c11f64 100644 --- a/db4-storage/src/pages/layer_counter.rs +++ b/db4-storage/src/pages/layer_counter.rs @@ -79,6 +79,10 @@ impl GraphStats { counter.load(std::sync::atomic::Ordering::Acquire) } + pub fn get_counter(&self, layer_id: usize) -> &AtomicUsize { + self.get_or_create_layer(layer_id) + } + fn get_or_create_layer(&self, layer_id: usize) -> &AtomicUsize { if let Some(counter) = self.layers.get(layer_id) { return counter; diff --git a/db4-storage/src/pages/locked/nodes.rs b/db4-storage/src/pages/locked/nodes.rs index 48b4fd7f10..a43d89fa2b 100644 --- a/db4-storage/src/pages/locked/nodes.rs +++ b/db4-storage/src/pages/locked/nodes.rs @@ -69,11 +69,11 @@ impl<'a, NS: NodeSegmentOps> LockedNodePage<'a, NS> { } } -pub struct WriteLockedNodePages<'a, NS> { +pub struct WriteLockedNodeSegments<'a, NS> { writers: Vec>, } -impl Default for WriteLockedNodePages<'_, NS> { +impl Default for WriteLockedNodeSegments<'_, NS> { fn default() -> Self { Self { writers: Vec::new(), @@ -81,7 +81,7 @@ impl Default for WriteLockedNodePages<'_, NS> { } } -impl<'a, NS: NodeSegmentOps> WriteLockedNodePages<'a, NS> { +impl<'a, EXT, NS: NodeSegmentOps> WriteLockedNodeSegments<'a, NS> { pub fn new(writers: Vec>) -> Self { Self { writers } } diff --git a/db4-storage/src/pages/node_page/writer.rs b/db4-storage/src/pages/node_page/writer.rs index 882c97d05e..bb87225a0c 100644 --- a/db4-storage/src/pages/node_page/writer.rs +++ b/db4-storage/src/pages/node_page/writer.rs @@ -178,6 +178,7 @@ impl<'a, MP: DerefMut + 'a, NS: NodeSegmentOps> NodeWri self.page.increment_est_size(add); } + #[inline(always)] pub fn get_out_edge(&self, pos: LocalPOS, dst: VID, layer_id: usize) -> Option { self.page .get_out_edge(pos, dst, layer_id, self.mut_segment.deref()) @@ -200,8 +201,8 @@ impl<'a, MP: DerefMut + 'a, NS: NodeSegmentOps> NodeWri 
self.update_c_props(pos, layer_id, node_info_as_props(Some(gid), node_type), lsn); } - pub fn store_node_id(&mut self, pos: LocalPOS, layer_id: usize, gid: GidRef<'_>, lsn: u64) { - self.update_c_props(pos, layer_id, node_info_as_props(Some(gid), None), lsn); + pub fn store_node_id(&mut self, pos: LocalPOS, layer_id: usize, gid: Prop, lsn: u64) { + self.update_c_props(pos, layer_id, [(NODE_ID_IDX, gid)], lsn); } pub fn update_deletion_time(&mut self, t: T, node: LocalPOS, e_id: ELID, lsn: u64) { @@ -224,7 +225,6 @@ impl<'a, MP: DerefMut + 'a, NS: NodeSegmentOps> Drop for NodeWriter<'a, MP, NS> { fn drop(&mut self) { - self.page.increment_event_id(1); self.page .notify_write(self.mut_segment.deref_mut()) .expect("Failed to persist node page"); diff --git a/db4-storage/src/pages/node_store.rs b/db4-storage/src/pages/node_store.rs index 113112a77a..5e4750ecc4 100644 --- a/db4-storage/src/pages/node_store.rs +++ b/db4-storage/src/pages/node_store.rs @@ -5,12 +5,12 @@ use crate::{ error::StorageError, pages::{ layer_counter::GraphStats, - locked::nodes::{LockedNodePage, WriteLockedNodePages}, + locked::nodes::{LockedNodePage, WriteLockedNodeSegments}, }, persist::strategy::Config, segments::node::segment::MemNodeSegment, }; -use parking_lot::RwLockWriteGuard; +use parking_lot::{RwLock, RwLockWriteGuard}; use raphtory_api::core::entities::properties::meta::Meta; use raphtory_core::{ entities::{EID, VID}, @@ -24,11 +24,13 @@ use std::{ }; // graph // (nodes|edges) // graph segments // layers // chunks +const N: usize = 32; #[derive(Debug)] pub struct NodeStorageInner { - pages: boxcar::Vec>, + segments: boxcar::Vec>, stats: Arc, + free_segments: Box<[RwLock; N]>, nodes_path: Option, node_meta: Arc, edge_meta: Arc, @@ -46,18 +48,18 @@ impl, EXT: Config> ReadLockedNodeStorage, ) -> <::ArcLockedSegment as LockedNSSegment>::EntryRef<'_> { - let (page_id, pos) = self.storage.resolve_pos(node); - let locked_page = &self.locked_segments[page_id]; - locked_page.entry_ref(pos) + 
let (segment_id, pos) = self.storage.resolve_pos(node); + let locked_segment = &self.locked_segments[segment_id]; + locked_segment.entry_ref(pos) } pub fn try_node_ref( &self, node: VID, ) -> Option<<::ArcLockedSegment as LockedNSSegment>::EntryRef<'_>> { - let (page_id, pos) = self.storage.resolve_pos(node); - let locked_page = &self.locked_segments.get(page_id)?; - Some(locked_page.entry_ref(pos)) + let (segment_id, pos) = self.storage.resolve_pos(node); + let locked_segment = &self.locked_segments.get(segment_id)?; + Some(locked_segment.entry_ref(pos)) } pub fn len(&self) -> usize { @@ -104,6 +106,7 @@ impl NodeStorageInner { self.stats.get(0) } + // FIXME: this should be called by the high level APIs on layer filter pub fn layer_num_nodes(&self, layer_id: usize) -> usize { self.stats.get(layer_id) } @@ -113,7 +116,7 @@ impl NodeStorageInner { } pub fn segments(&self) -> &boxcar::Vec> { - &self.pages + &self.segments } pub fn nodes_path(&self) -> Option<&Path> { @@ -122,10 +125,10 @@ impl NodeStorageInner { /// Return the position of the chunk and the position within the chunk pub fn resolve_pos(&self, i: impl Into) -> (usize, LocalPOS) { - resolve_pos(i.into(), self.max_page_len()) + resolve_pos(i.into(), self.max_segment_len()) } - pub fn max_page_len(&self) -> u32 { + pub fn max_segment_len(&self) -> u32 { self.ext.max_node_page_len() } } @@ -137,9 +140,11 @@ impl, EXT: Config> NodeStorageInner edge_meta: Arc, ext: EXT, ) -> Self { + let free_segments = (0..N).map(RwLock::new).collect::>(); let empty = Self { - pages: boxcar::Vec::new(), + segments: boxcar::Vec::new(), stats: GraphStats::new().into(), + free_segments: free_segments.try_into().unwrap(), nodes_path, node_meta, edge_meta, @@ -165,7 +170,7 @@ impl, EXT: Config> NodeStorageInner } pub fn locked(self: &Arc) -> ReadLockedNodeStorage { let locked_segments = self - .pages + .segments .iter() .map(|(_, segment)| segment.locked()) .collect::>(); @@ -175,15 +180,15 @@ impl, EXT: Config> NodeStorageInner 
} } - pub fn write_locked<'a>(&'a self) -> WriteLockedNodePages<'a, NS> { - WriteLockedNodePages::new( - self.pages + pub fn write_locked<'a>(&'a self) -> WriteLockedNodeSegments<'a, NS> { + WriteLockedNodeSegments::new( + self.segments .iter() .map(|(page_id, page)| { LockedNodePage::new( page_id, &self.stats, - self.max_page_len(), + self.max_segment_len(), page.as_ref(), page.head_mut(), ) @@ -191,11 +196,75 @@ impl, EXT: Config> NodeStorageInner .collect(), ) } + pub fn reserve_free_pos(&self, row: usize) -> (usize, LocalPOS) { + let slot_idx = row % N; + let maybe_free_page = { + let lock_slot = self.free_segments[slot_idx].read_recursive(); + let page_id = *lock_slot; + let page = self.segments.get(page_id); + page.and_then(|page| { + self.reserve_segment_row(page) + .map(|pos| (page.segment_id(), LocalPOS(pos))) + }) + }; + + if let Some(reserved_pos) = maybe_free_page { + reserved_pos + } else { + // not lucky, go wait on your slot + let mut slot = self.free_segments[slot_idx].write(); + loop { + if let Some(page) = self.segments.get(*slot) + && let Some(pos) = self.reserve_segment_row(page) + { + return (page.segment_id(), LocalPOS(pos)); + } + *slot = self.push_new_segment(); + } + } + } + + fn reserve_segment_row(&self, segment: &Arc) -> Option { + // TODO: if this becomes a hotspot, we can switch to a fetch_add followed by a fetch_min + // this means when we read the counter we need to clamp it to max_page_len so the iterators don't break + segment + .nodes_counter() + .fetch_update( + std::sync::atomic::Ordering::Relaxed, + std::sync::atomic::Ordering::Relaxed, + |current| { + if current < self.max_segment_len() { + Some(current + 1) + } else { + None + } + }, + ) + .ok() + } + + fn push_new_segment(&self) -> usize { + let segment_id = self.segments.push_with(|segment_id| { + Arc::new(NS::new( + segment_id, + self.node_meta.clone(), + self.edge_meta.clone(), + self.nodes_path.clone(), + self.ext.clone(), + )) + }); + + while 
self.segments.get(segment_id).is_none() { + std::thread::yield_now(); + } + + segment_id + } pub fn node<'a>(&'a self, node: impl Into) -> NS::Entry<'a> { let (page_id, pos) = self.resolve_pos(node); let node_page = self - .pages + .segments .get(page_id) .expect("Internal error: page not found"); node_page.entry(pos) @@ -203,7 +272,7 @@ impl, EXT: Config> NodeStorageInner pub fn try_node(&self, node: VID) -> Option> { let (page_id, pos) = self.resolve_pos(node); - let node_page = self.pages.get(page_id)?; + let node_page = self.segments.get(page_id)?; Some(node_page.entry(pos)) } @@ -221,7 +290,7 @@ impl, EXT: Config> NodeStorageInner &'a self, segment_id: usize, ) -> Option, NS>> { - let segment = &self.pages[segment_id]; + let segment = &self.segments[segment_id]; let head = segment.try_head_mut()?; Some(NodeWriter::new(segment, &self.stats, head)) } @@ -232,6 +301,7 @@ impl, EXT: Config> NodeStorageInner ext: EXT, ) -> Result { let nodes_path = nodes_path.as_ref(); + let max_page_len = ext.max_node_page_len(); let node_meta = Arc::new(Meta::new_for_nodes()); if !nodes_path.exists() { @@ -325,10 +395,35 @@ impl, EXT: Config> NodeStorageInner .max() .unwrap_or(i64::MIN); + let mut free_pages = pages + .iter() + .filter_map(|(_, page)| { + let len = page.num_nodes(); + if len < max_page_len { + Some(RwLock::new(page.segment_id())) + } else { + None + } + }) + .collect::>(); + + let mut next_free_page = free_pages + .last() + .map(|page| *(page.read())) + .map(|last| last + 1) + .unwrap_or_else(|| pages.count()); + + free_pages.resize_with(N, || { + let lock = RwLock::new(next_free_page); + next_free_page += 1; + lock + }); + let stats = GraphStats::load(layer_counts, earliest, latest); Ok(Self { - pages, + segments: pages, + free_segments: free_pages.try_into().unwrap(), nodes_path: Some(nodes_path.to_path_buf()), stats: stats.into(), node_meta, @@ -339,10 +434,10 @@ impl, EXT: Config> NodeStorageInner pub fn get_edge(&self, src: VID, dst: VID, layer_id: usize) -> 
Option { let (src_chunk, src_pos) = self.resolve_pos(src); - if src_chunk >= self.pages.count() { + if src_chunk >= self.segments.count() { return None; } - let src_page = &self.pages[src_chunk]; + let src_page = &self.segments[src_chunk]; src_page.get_out_edge(src_pos, dst, layer_id, src_page.head()) } @@ -351,14 +446,14 @@ impl, EXT: Config> NodeStorageInner } pub fn get_or_create_segment(&self, segment_id: usize) -> &Arc { - if let Some(segment) = self.pages.get(segment_id) { + if let Some(segment) = self.segments.get(segment_id) { return segment; } - let count = self.pages.count(); + let count = self.segments.count(); if count > segment_id { // something has allocated the segment, wait for it to be added loop { - if let Some(segment) = self.pages.get(segment_id) { + if let Some(segment) = self.segments.get(segment_id) { return segment; } else { // wait for the segment to be created @@ -367,10 +462,10 @@ impl, EXT: Config> NodeStorageInner } } else { // we need to create the segment - self.pages.reserve(segment_id + 1 - count); + self.segments.reserve(segment_id + 1 - count); loop { - let new_segment_id = self.pages.push_with(|segment_id| { + let new_segment_id = self.segments.push_with(|segment_id| { Arc::new(NS::new( segment_id, self.node_meta.clone(), @@ -382,7 +477,7 @@ impl, EXT: Config> NodeStorageInner if new_segment_id >= segment_id { loop { - if let Some(segment) = self.pages.get(segment_id) { + if let Some(segment) = self.segments.get(segment_id) { return segment; } else { // wait for the segment to be created diff --git a/db4-storage/src/segments/node/segment.rs b/db4-storage/src/segments/node/segment.rs index bc7c5bcdd6..4af9aaa4d0 100644 --- a/db4-storage/src/segments/node/segment.rs +++ b/db4-storage/src/segments/node/segment.rs @@ -27,7 +27,7 @@ use std::{ path::PathBuf, sync::{ Arc, - atomic::{AtomicI64, AtomicUsize, Ordering}, + atomic::{AtomicU32, AtomicUsize, Ordering}, }, }; @@ -365,8 +365,8 @@ impl MemNodeSegment { pub struct NodeSegmentView 
{ inner: Arc>, segment_id: usize, - event_id: AtomicI64, est_size: AtomicUsize, + max_num_node: AtomicU32, _ext: EXT, } @@ -403,22 +403,6 @@ impl>> NodeSegmentOps for NodeSegm self.head().t_len() } - fn event_id(&self) -> i64 { - self.event_id.load(Ordering::Relaxed) - } - - fn increment_event_id(&self, i: i64) { - self.event_id.fetch_add(i, Ordering::Relaxed); - } - - fn decrement_event_id(&self) -> i64 { - self.event_id - .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |x| { - if x > 0 { Some(x - 1) } else { None } - }) - .unwrap_or_default() - } - fn load( _page_id: usize, _node_meta: Arc, @@ -447,7 +431,7 @@ impl>> NodeSegmentOps for NodeSegm .into(), segment_id: page_id, _ext: ext, - event_id: Default::default(), + max_num_node: AtomicU32::new(0), est_size: AtomicUsize::new(0), } } @@ -546,6 +530,10 @@ impl>> NodeSegmentOps for NodeSegm ) -> Result<(), StorageError> { Ok(()) } + + fn nodes_counter(&self) -> &AtomicU32 { + &self.max_num_node + } } #[cfg(test)] diff --git a/raphtory-api/src/core/entities/properties/prop/prop_enum.rs b/raphtory-api/src/core/entities/properties/prop/prop_enum.rs index acde96f75d..a076cd4b52 100644 --- a/raphtory-api/src/core/entities/properties/prop/prop_enum.rs +++ b/raphtory-api/src/core/entities/properties/prop/prop_enum.rs @@ -26,6 +26,7 @@ use thiserror::Error; use crate::core::entities::properties::prop::prop_array::*; use arrow_array::{cast::AsArray, ArrayRef, LargeListArray, StructArray}; use arrow_schema::{DataType, Field, FieldRef}; +use crate::core::entities::GID; pub const DECIMAL_MAX: i128 = 99999999999999999999999999999999999999i128; // equivalent to parquet decimal(38, 0) @@ -57,7 +58,16 @@ impl From> for Prop { fn from(value: GidRef<'_>) -> Self { match value { GidRef::U64(n) => Prop::U64(n), - GidRef::Str(s) => Prop::str(s), + GidRef::Str(s) => Prop::Str(ArcStr(s.into())), + } + } +} + +impl From for Prop { + fn from(value: GID) -> Self { + match value { + GID::U64(n) => Prop::U64(n), + GID::Str(s) => 
Prop::Str(ArcStr(s.into())), } } } diff --git a/raphtory-core/src/entities/properties/tcell.rs b/raphtory-core/src/entities/properties/tcell.rs index c81474e9f3..e51b8ae77c 100644 --- a/raphtory-core/src/entities/properties/tcell.rs +++ b/raphtory-core/src/entities/properties/tcell.rs @@ -23,7 +23,7 @@ enum TCellVariants { TCellN(TCellN), } -const BTREE_CUTOFF: usize = 128; +const BTREE_CUTOFF: usize = 16; impl TCell { pub fn new(t: TimeIndexEntry, value: A) -> Self { diff --git a/raphtory-core/src/lib.rs b/raphtory-core/src/lib.rs index 791b0765ae..c754214f76 100644 --- a/raphtory-core/src/lib.rs +++ b/raphtory-core/src/lib.rs @@ -24,24 +24,8 @@ //! * `macOS` //! -use std::{thread, time::Duration}; - -use parking_lot::RwLock; - pub mod entities; #[cfg(feature = "python")] mod python; pub mod storage; pub mod utils; - -pub(crate) fn loop_lock_write(l: &RwLock) -> parking_lot::RwLockWriteGuard<'_, A> { - const MAX_BACKOFF_US: u64 = 1000; // 1ms max - let mut backoff_us = 1; - loop { - if let Some(guard) = l.try_write_for(Duration::from_micros(50)) { - return guard; - } - thread::park_timeout(Duration::from_micros(backoff_us)); - backoff_us = (backoff_us * 2).min(MAX_BACKOFF_US); - } -} diff --git a/raphtory-core/src/storage/lazy_vec.rs b/raphtory-core/src/storage/lazy_vec.rs index b5f5cfe5ad..d8da2041e4 100644 --- a/raphtory-core/src/storage/lazy_vec.rs +++ b/raphtory-core/src/storage/lazy_vec.rs @@ -1,6 +1,6 @@ use arrow_array::BooleanArray; use serde::{Deserialize, Serialize}; -use std::{fmt::Debug, iter}; +use std::fmt::Debug; #[derive(thiserror::Error, Debug, PartialEq)] #[error("Cannot set previous value '{previous_value:?}' to '{new_value:?}' in position '{index}'")] diff --git a/raphtory-storage/src/mutation/addition_ops_ext.rs b/raphtory-storage/src/mutation/addition_ops_ext.rs index 70cba75036..428290d044 100644 --- a/raphtory-storage/src/mutation/addition_ops_ext.rs +++ b/raphtory-storage/src/mutation/addition_ops_ext.rs @@ -23,7 +23,7 @@ use storage::{ 
persist::strategy::PersistentStrategy, properties::props_meta_writer::PropsMetaWriter, resolver::GIDResolverOps, - Extension, WalImpl, ES, GS, NS, + Config, Extension, WalImpl, ES, GS, NS, }; pub struct WriteS<'a, EXT: PersistentStrategy, ES = ES, GS = GS>> { @@ -234,9 +234,11 @@ impl InternalAdditionOps for TemporalGraph { match id { NodeRef::External(id) => { let id = self.logical_to_physical.get_or_init(id, || { - self.node_count - .fetch_add(1, std::sync::atomic::Ordering::Relaxed) - .into() + let (seg, pos) = self + .storage() + .nodes() + .reserve_free_pos(self.storage().nodes().stats().get(0)); + pos.as_vid(seg, self.extension().max_node_page_len()) })?; Ok(id) diff --git a/raphtory/src/db/api/view/graph.rs b/raphtory/src/db/api/view/graph.rs index f8fc5d3387..f52adefbdc 100644 --- a/raphtory/src/db/api/view/graph.rs +++ b/raphtory/src/db/api/view/graph.rs @@ -360,7 +360,7 @@ impl<'graph, G: GraphView + 'graph> GraphViewOps<'graph> for G { 0, ); } else { - writer.store_node_id(node_pos, 0, gid.as_ref(), 0); + writer.store_node_id(node_pos, 0, gid.clone().into(), 0); } graph_storage .write_session()? 
diff --git a/raphtory/src/io/arrow/df_loaders.rs b/raphtory/src/io/arrow/df_loaders.rs index 7ddeb6cae1..3d4b73df58 100644 --- a/raphtory/src/io/arrow/df_loaders.rs +++ b/raphtory/src/io/arrow/df_loaders.rs @@ -15,6 +15,7 @@ use db4_graph::WriteLockedGraph; use either::Either; use itertools::izip; use kdam::{Bar, BarBuilder, BarExt}; +use raphtory_api::core::storage::FxHashMap; use raphtory_api::{ atomic_extra::atomic_usize_from_mut_slice, core::{ @@ -22,7 +23,7 @@ use raphtory_api::{ properties::{meta::STATIC_GRAPH_LAYER_ID, prop::PropType}, EID, }, - storage::{dict_mapper::MaybeNew, timeindex::TimeIndexEntry}, + storage::{dict_mapper::MaybeNew, timeindex::TimeIndexEntry, FxDashMap}, }, }; use raphtory_core::{ @@ -42,6 +43,7 @@ use std::{ atomic::{AtomicBool, AtomicUsize, Ordering}, Arc, }, + usize, }; fn build_progress_bar(des: String, num_rows: usize) -> Result { @@ -273,11 +275,11 @@ pub fn load_edges_from_df = vec![]; + let mut dst_col_resolved: Vec<(VID, Prop)> = vec![]; let mut eid_col_resolved: Vec = vec![]; let mut eids_exist: Vec = vec![]; // exists or needs to be created let mut layer_eids_exist: Vec = vec![]; // exists or needs to be created @@ -304,8 +306,8 @@ pub fn load_edges_from_df = FxDashMap::default(); // It's our graph, no one else can change it src_col .par_iter() .zip(src_col_resolved.par_iter_mut()) .try_for_each(|(gid, resolved)| { let gid = gid.ok_or(LoadError::FatalError)?; + let gid_prop = if let Some(gid_prop) = gid_str_cache.get(&gid) { + gid_prop.value().clone() + } else { + let gid_prop = gid_str_cache.entry(gid).or_insert_with(|| Prop::from(gid)); + gid_prop.value().clone() + }; let vid = write_locked_graph .graph() .resolve_node(gid.as_node_ref()) @@ -334,7 +344,7 @@ pub fn load_edges_from_df(()) })?; @@ -343,6 +353,13 @@ pub fn load_edges_from_df(()) })?; @@ -404,18 +421,19 @@ pub fn load_edges_from_df = vec![]; - let mut c_props: Vec<(usize, Prop)> = vec![]; - - for (row, (src, dst, time, secondary_index, eid, layer, exists)) in 
- zip.enumerate() - { - if let Some(eid_pos) = shard.resolve_pos(*eid) { - let t = TimeIndexEntry(time, secondary_index); - let mut writer = shard.writer(); - - t_props.clear(); - t_props.extend(prop_cols.iter_row(row)); - - c_props.clear(); - c_props.extend(metadata_cols.iter_row(row)); - c_props.extend_from_slice(&shared_metadata); - - writer.bulk_add_edge( - t, - eid_pos, - *src, - *dst, - exists, - *layer, - c_props.drain(..), - t_props.drain(..), - 0, - ); - } - } - }); - }); - }); - - #[cfg(feature = "python")] + // rayon::scope(|sc| { + // // Add inbound edges + // sc.spawn(|_| { + // write_locked_graph + // .nodes + // .par_iter_mut() + // .enumerate() + // .for_each(|(page_id, shard)| { + // let zip = izip!( + // src_col_resolved.iter(), + // dst_col_resolved.iter(), + // dst_col.iter(), + // eid_col_resolved.iter(), + // time_col.iter(), + // secondary_index_col.iter(), + // layer_col_resolved.iter(), + // layer_eids_exist.iter().map(|a| a.load(Ordering::Relaxed)), + // eids_exist.iter().map(|b| b.load(Ordering::Relaxed)) + // ); + + // for ( + // src, + // dst, + // dst_gid, + // eid, + // time, + // secondary_index, + // layer, + // edge_exists_in_layer, + // edge_exists_in_static_graph, + // ) in zip + // { + // if let Some(dst_pos) = shard.resolve_pos(*dst) { + // let t = TimeIndexEntry(time, secondary_index); + // let mut writer = shard.writer(); + + // writer.store_node_id(dst_pos, 0, dst_gid, 0); + + // if !edge_exists_in_static_graph { + // writer.add_static_inbound_edge(dst_pos, *src, *eid, 0); + // } + + // if !edge_exists_in_layer { + // writer.add_inbound_edge( + // Some(t), + // dst_pos, + // *src, + // eid.with_layer(*layer), + // 0, + // ); + // } else { + // writer.update_timestamp(t, dst_pos, eid.with_layer(*layer), 0); + // } + + // per_segment_edge_count[page_id].fetch_add(1, Ordering::Relaxed); + // } + // } + // }); + // }); + + // // Add temporal & constant properties to edges + // sc.spawn(|_| { + // 
write_locked_graph.edges.par_iter_mut().for_each(|shard| { + // let zip = izip!( + // src_col_resolved.iter(), + // dst_col_resolved.iter(), + // time_col.iter(), + // secondary_index_col.iter(), + // eid_col_resolved.iter(), + // layer_col_resolved.iter(), + // eids_exist + // .iter() + // .map(|exists| exists.load(Ordering::Relaxed)) + // ); + // let mut t_props: Vec<(usize, Prop)> = vec![]; + // let mut c_props: Vec<(usize, Prop)> = vec![]; + + // for (row, (src, dst, time, secondary_index, eid, layer, exists)) in + // zip.enumerate() + // { + // if let Some(eid_pos) = shard.resolve_pos(*eid) { + // let t = TimeIndexEntry(time, secondary_index); + // let mut writer = shard.writer(); + + // t_props.clear(); + // t_props.extend(prop_cols.iter_row(row)); + + // c_props.clear(); + // c_props.extend(metadata_cols.iter_row(row)); + // c_props.extend_from_slice(&shared_metadata); + + // writer.bulk_add_edge( + // t, + // eid_pos, + // *src, + // *dst, + // exists, + // *layer, + // c_props.drain(..), + // t_props.drain(..), + // 0, + // ); + // } + // } + // }); + // }); + // }); + + // #[cfg(feature = "python")] let _ = pb.update(df.len()); } From 2b7c9f7d2d5b6acdb839d1ac83b644f4489d7d12 Mon Sep 17 00:00:00 2001 From: Fabian Murariu Date: Wed, 3 Dec 2025 12:38:10 +0000 Subject: [PATCH 03/24] added better iterators for new node layout --- Cargo.toml | 1 + db4-graph/src/lib.rs | 3 + db4-storage/Cargo.toml | 1 + db4-storage/src/api/nodes.rs | 18 + db4-storage/src/lib.rs | 1 + db4-storage/src/pages/edge_store.rs | 8 + db4-storage/src/pages/locked/edges.rs | 8 + db4-storage/src/pages/locked/nodes.rs | 1 + db4-storage/src/pages/mod.rs | 48 ++ db4-storage/src/pages/node_store.rs | 26 +- db4-storage/src/segments/node/segment.rs | 18 +- raphtory-storage/src/graph/graph.rs | 16 +- .../src/mutation/addition_ops_ext.rs | 8 +- raphtory/src/db/api/view/internal/list_ops.rs | 46 +- raphtory/src/db/graph/nodes.rs | 4 +- raphtory/src/io/arrow/df_loaders.rs | 556 +++++++++--------- 
.../src/python/graph/io/pandas_loaders.rs | 2 +- 17 files changed, 441 insertions(+), 324 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 936043adb6..2c4826268e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -100,6 +100,7 @@ num-integer = "0.1" rand_distr = "0.5.1" rustc-hash = "2.0.0" twox-hash = "2.1.0" +tinyvec = { version = "1.10", features = ["serde", "alloc"] } lock_api = { version = "0.4.11", features = ["arc_lock", "serde"] } dashmap = { version = "6.0.1", features = ["serde", "rayon"] } glam = "0.29.0" diff --git a/db4-graph/src/lib.rs b/db4-graph/src/lib.rs index 382ba1f688..90363daf43 100644 --- a/db4-graph/src/lib.rs +++ b/db4-graph/src/lib.rs @@ -35,6 +35,7 @@ use tempfile::TempDir; pub struct TemporalGraph { // mapping between logical and physical ids pub logical_to_physical: Arc, + pub event_counter: AtomicUsize, storage: Arc>, graph_dir: Option, pub transaction_manager: Arc, @@ -156,6 +157,7 @@ impl, ES = ES, GS = GS>> Temporal Ok(Self { graph_dir: Some(path.into()), + event_counter: AtomicUsize::new(resolver.len()), logical_to_physical: resolver.into(), storage: Arc::new(storage), transaction_manager: Arc::new(TransactionManager::new(wal.clone())), @@ -204,6 +206,7 @@ impl, ES = ES, GS = GS>> Temporal logical_to_physical, storage: Arc::new(storage), transaction_manager: Arc::new(TransactionManager::new(wal.clone())), + event_counter: AtomicUsize::new(0), wal, }) } diff --git a/db4-storage/Cargo.toml b/db4-storage/Cargo.toml index c3db7ff3c3..8649d81c15 100644 --- a/db4-storage/Cargo.toml +++ b/db4-storage/Cargo.toml @@ -36,6 +36,7 @@ itertools.workspace = true thiserror.workspace = true roaring.workspace = true sysinfo.workspace = true +tinyvec.workspace = true proptest = { workspace = true, optional = true } tempfile = { workspace = true, optional = true } iter-enum = { workspace = true, features = ["rayon"] } diff --git a/db4-storage/src/api/nodes.rs b/db4-storage/src/api/nodes.rs index c674c1bce9..f4af23344b 100644 --- 
a/db4-storage/src/api/nodes.rs +++ b/db4-storage/src/api/nodes.rs @@ -24,6 +24,8 @@ use std::{ sync::{Arc, atomic::AtomicU32}, }; +use rayon::prelude::*; + use crate::{ LocalPOS, error::StorageError, @@ -132,7 +134,23 @@ pub trait LockedNSSegment: std::fmt::Debug + Send + Sync { where Self: 'a; + fn num_nodes(&self) -> u32; + fn entry_ref<'a>(&'a self, pos: impl Into) -> Self::EntryRef<'a>; + + fn iter_entries<'a>(&'a self) -> impl Iterator> + Send + Sync + 'a { + let num_nodes = self.num_nodes(); + (0..num_nodes).map(move |vid| self.entry_ref(LocalPOS(vid))) + } + + fn par_iter_entries<'a>( + &'a self, + ) -> impl ParallelIterator> + Send + Sync + 'a { + let num_nodes = self.num_nodes(); + (0..num_nodes) + .into_par_iter() + .map(move |vid| self.entry_ref(LocalPOS(vid))) + } } pub trait NodeEntryOps<'a>: Send + Sync + 'a { diff --git a/db4-storage/src/lib.rs b/db4-storage/src/lib.rs index ebe5bf708b..0b1291e02a 100644 --- a/db4-storage/src/lib.rs +++ b/db4-storage/src/lib.rs @@ -43,6 +43,7 @@ pub mod persist; pub mod properties; pub mod resolver; pub mod segments; +pub mod state; pub mod utils; pub mod wal; diff --git a/db4-storage/src/pages/edge_store.rs b/db4-storage/src/pages/edge_store.rs index 71ed0d1be3..65fa64ce86 100644 --- a/db4-storage/src/pages/edge_store.rs +++ b/db4-storage/src/pages/edge_store.rs @@ -10,6 +10,7 @@ use crate::{ api::edges::{EdgeRefOps, EdgeSegmentOps, LockedESegment}, error::StorageError, pages::{ + SegmentCounts, layer_counter::GraphStats, locked::edges::{LockedEdgePage, WriteLockedEdgePages}, }, @@ -573,4 +574,11 @@ impl, EXT: Config> EdgeStorageInner }) }) } + + pub(crate) fn segment_counts(&self) -> SegmentCounts { + SegmentCounts::new( + self.max_page_len(), + self.pages().iter().map(|(_, seg)| seg.num_edges()), + ) + } } diff --git a/db4-storage/src/pages/locked/edges.rs b/db4-storage/src/pages/locked/edges.rs index a07f03147b..1bfe0005d3 100644 --- a/db4-storage/src/pages/locked/edges.rs +++ 
b/db4-storage/src/pages/locked/edges.rs @@ -119,4 +119,12 @@ impl<'a, ES: EdgeSegmentOps> WriteLockedEdgePages<'a, ES> { } Ok(()) } + + pub fn len(&self) -> usize { + self.writers.len() + } + + pub fn is_empty(&self) -> bool { + self.writers.is_empty() + } } diff --git a/db4-storage/src/pages/locked/nodes.rs b/db4-storage/src/pages/locked/nodes.rs index a43d89fa2b..faaa782108 100644 --- a/db4-storage/src/pages/locked/nodes.rs +++ b/db4-storage/src/pages/locked/nodes.rs @@ -10,6 +10,7 @@ use raphtory_core::entities::VID; use rayon::prelude::*; use std::ops::DerefMut; +#[derive(Debug)] pub struct LockedNodePage<'a, NS> { page_id: usize, max_page_len: u32, diff --git a/db4-storage/src/pages/mod.rs b/db4-storage/src/pages/mod.rs index 6be52d59e5..afe4195f99 100644 --- a/db4-storage/src/pages/mod.rs +++ b/db4-storage/src/pages/mod.rs @@ -17,6 +17,7 @@ use raphtory_api::core::{ entities::properties::{meta::Meta, prop::Prop}, storage::dict_mapper::MaybeNew, }; +use rayon::prelude::*; use raphtory_core::{ entities::{EID, ELID, VID}, @@ -31,6 +32,7 @@ use std::{ atomic::{self, AtomicUsize}, }, }; +use tinyvec::TinyVec; pub mod edge_page; pub mod edge_store; @@ -126,6 +128,14 @@ impl< self.nodes.stats().latest().max(self.edges.stats().latest()) } + pub fn node_segment_counts(&self) -> SegmentCounts { + self.nodes.segment_counts() + } + + pub fn edge_segment_counts(&self) -> SegmentCounts { + self.edges.segment_counts() + } + pub fn load(graph_dir: impl AsRef) -> Result { let nodes_path = graph_dir.as_ref().join("nodes"); let edges_path = graph_dir.as_ref().join("edges"); @@ -433,6 +443,44 @@ impl< } } +#[derive(Debug)] +pub struct SegmentCounts { + max_seg_len: u32, + counts: TinyVec<[u32; node_store::N]>, // this might come to be a problem + _marker: std::marker::PhantomData, +} + +impl + Send> SegmentCounts { + pub fn new(max_seg_len: u32, counts: impl IntoIterator) -> Self { + let counts: TinyVec<[u32; node_store::N]> = counts.into_iter().collect(); + + Self { + 
max_seg_len, + counts, + _marker: std::marker::PhantomData, + } + } + + pub fn into_iter(self) -> impl Iterator { + let max_seg_len = self.max_seg_len as usize; + self.counts.into_iter().enumerate().flat_map(move |(i, c)| { + let g_pos = i * max_seg_len as usize; + (0..c).map(move |offset| I::from(g_pos + offset as usize)) + }) + } + + pub fn into_par_iter(self) -> impl ParallelIterator { + let max_seg_len = self.max_seg_len as usize; + (0..self.counts.len()).into_par_iter().flat_map(move |i| { + let c = self.counts[i]; + let g_pos = i * max_seg_len as usize; + (0..c) + .into_par_iter() + .map(move |offset| I::from(g_pos + offset as usize)) + }) + } +} + impl Drop for GraphStore { fn drop(&mut self) { let node_types = self.nodes.prop_meta().get_all_node_types(); diff --git a/db4-storage/src/pages/node_store.rs b/db4-storage/src/pages/node_store.rs index 5e4750ecc4..401b71991a 100644 --- a/db4-storage/src/pages/node_store.rs +++ b/db4-storage/src/pages/node_store.rs @@ -4,6 +4,7 @@ use crate::{ api::nodes::{LockedNSSegment, NodeSegmentOps}, error::StorageError, pages::{ + SegmentCounts, layer_counter::GraphStats, locked::nodes::{LockedNodePage, WriteLockedNodeSegments}, }, @@ -24,7 +25,7 @@ use std::{ }; // graph // (nodes|edges) // graph segments // layers // chunks -const N: usize = 32; +pub const N: usize = 32; #[derive(Debug)] pub struct NodeStorageInner { @@ -75,10 +76,9 @@ impl, EXT: Config> ReadLockedNodeStorage impl Iterator< Item = <::ArcLockedSegment as LockedNSSegment>::EntryRef<'_>, > + '_ { - (0..self.len()).map(move |i| { - let vid = VID(i); - self.node_ref(vid) - }) + self.locked_segments + .iter() + .flat_map(move |segment| segment.iter_entries()) } pub fn par_iter( @@ -86,10 +86,9 @@ impl, EXT: Config> ReadLockedNodeStorage impl rayon::iter::ParallelIterator< Item = <::ArcLockedSegment as LockedNSSegment>::EntryRef<'_>, > + '_ { - (0..self.len()).into_par_iter().map(move |i| { - let vid = VID(i); - self.node_ref(vid) - }) + self.locked_segments + 
.par_iter() + .flat_map(move |segment| segment.par_iter_entries()) } } @@ -168,6 +167,7 @@ impl, EXT: Config> NodeStorageInner } empty } + pub fn locked(self: &Arc) -> ReadLockedNodeStorage { let locked_segments = self .segments @@ -196,6 +196,7 @@ impl, EXT: Config> NodeStorageInner .collect(), ) } + pub fn reserve_free_pos(&self, row: usize) -> (usize, LocalPOS) { let slot_idx = row % N; let maybe_free_page = { @@ -488,4 +489,11 @@ impl, EXT: Config> NodeStorageInner } } } + + pub(crate) fn segment_counts(&self) -> SegmentCounts { + SegmentCounts::new( + self.max_segment_len(), + self.segments().iter().map(|(_, seg)| seg.num_nodes()), + ) + } } diff --git a/db4-storage/src/segments/node/segment.rs b/db4-storage/src/segments/node/segment.rs index 4af9aaa4d0..72df28f69e 100644 --- a/db4-storage/src/segments/node/segment.rs +++ b/db4-storage/src/segments/node/segment.rs @@ -373,6 +373,16 @@ pub struct NodeSegmentView { #[derive(Debug)] pub struct ArcLockedSegmentView { inner: ArcRwLockReadGuard, + num_nodes: u32, +} + +impl ArcLockedSegmentView { + pub fn new( + inner: ArcRwLockReadGuard, + num_nodes: u32, + ) -> Self { + Self { inner, num_nodes } + } } impl LockedNSSegment for ArcLockedSegmentView { @@ -382,6 +392,10 @@ impl LockedNSSegment for ArcLockedSegmentView { let pos = pos.into(); MemNodeRef::new(pos, &self.inner) } + + fn num_nodes(&self) -> u32 { + self.num_nodes + } } impl>> NodeSegmentOps for NodeSegmentView

{ @@ -499,9 +513,7 @@ impl>> NodeSegmentOps for NodeSegm } fn locked(self: &Arc) -> Self::ArcLockedSegment { - ArcLockedSegmentView { - inner: self.inner.read_arc(), - } + ArcLockedSegmentView::new(self.inner.read_arc(), self.num_nodes()) } fn num_layers(&self) -> usize { diff --git a/raphtory-storage/src/graph/graph.rs b/raphtory-storage/src/graph/graph.rs index 38176450d2..baa40b5a2b 100644 --- a/raphtory-storage/src/graph/graph.rs +++ b/raphtory-storage/src/graph/graph.rs @@ -14,7 +14,7 @@ use db4_graph::TemporalGraph; use raphtory_api::core::entities::{properties::meta::Meta, LayerIds, LayerVariants, EID, VID}; use raphtory_core::entities::{nodes::node_ref::NodeRef, properties::graph_meta::GraphMeta}; use std::{fmt::Debug, iter, sync::Arc}; -use storage::{Extension, GraphPropEntry}; +use storage::{pages::SegmentCounts, Extension, GraphPropEntry}; use thiserror::Error; #[derive(Clone, Debug)] @@ -259,4 +259,18 @@ impl GraphStorage { GraphStorage::Unlocked(storage) => storage.extension(), } } + + pub fn node_segment_counts(&self) -> SegmentCounts { + match self { + GraphStorage::Mem(storage) => storage.graph.storage().node_segment_counts(), + GraphStorage::Unlocked(storage) => storage.storage().node_segment_counts(), + } + } + + pub fn edge_segment_counts(&self) -> SegmentCounts { + match self { + GraphStorage::Mem(storage) => storage.graph.storage().edge_segment_counts(), + GraphStorage::Unlocked(storage) => storage.storage().edge_segment_counts(), + } + } } diff --git a/raphtory-storage/src/mutation/addition_ops_ext.rs b/raphtory-storage/src/mutation/addition_ops_ext.rs index 428290d044..db63cda054 100644 --- a/raphtory-storage/src/mutation/addition_ops_ext.rs +++ b/raphtory-storage/src/mutation/addition_ops_ext.rs @@ -234,10 +234,10 @@ impl InternalAdditionOps for TemporalGraph { match id { NodeRef::External(id) => { let id = self.logical_to_physical.get_or_init(id, || { - let (seg, pos) = self - .storage() - .nodes() - 
.reserve_free_pos(self.storage().nodes().stats().get(0)); + let (seg, pos) = self.storage().nodes().reserve_free_pos( + self.event_counter + .fetch_add(1, std::sync::atomic::Ordering::Relaxed), + ); pos.as_vid(seg, self.extension().max_node_page_len()) })?; diff --git a/raphtory/src/db/api/view/internal/list_ops.rs b/raphtory/src/db/api/view/internal/list_ops.rs index 41d63aee49..a3da49bbb0 100644 --- a/raphtory/src/db/api/view/internal/list_ops.rs +++ b/raphtory/src/db/api/view/internal/list_ops.rs @@ -2,6 +2,7 @@ use crate::{ core::entities::{EID, VID}, db::api::{state::Index, view::Base}, }; +use raphtory_storage::graph::graph::GraphStorage; use rayon::{iter::Either, prelude::*}; use std::hash::Hash; @@ -62,27 +63,6 @@ impl + From + Send + Sync> List { } } - pub fn par_iter(&self) -> impl IndexedParallelIterator + '_ { - match self { - List::All { len } => Either::Left((0..*len).into_par_iter().map(From::from)), - List::List { elems } => Either::Right(elems.par_iter()), - } - } - - pub fn into_par_iter(self) -> impl IndexedParallelIterator { - match self { - List::All { len } => Either::Left((0..len).into_par_iter().map(From::from)), - List::List { elems } => Either::Right(elems.into_par_iter()), - } - } - - pub fn iter(&self) -> impl Iterator + '_ { - match self { - List::All { len } => Either::Left((0..*len).map(From::from)), - List::List { elems } => Either::Right(elems.iter()), - } - } - pub fn len(&self) -> usize { match self { List::All { len } => *len, @@ -95,16 +75,24 @@ impl + From + Send + Sync> List { } } -impl + From + Send + Sync + 'static> IntoIterator - for List -{ - type Item = I; - type IntoIter = Box + Send + Sync>; +impl List { + pub fn nodes_iter(self, g: &GraphStorage) -> impl Iterator { + match self { + List::All { .. 
} => { + let sc = g.node_segment_counts(); + Either::Left(sc.into_iter()) + } + List::List { elems } => Either::Right(elems.into_iter()), + } + } - fn into_iter(self) -> Self::IntoIter { + pub fn nodes_par_iter(self, g: &GraphStorage) -> impl ParallelIterator { match self { - List::All { len } => Box::new((0..len).map(From::from)), - List::List { elems } => Box::new(elems.into_iter()), + List::All { .. } => { + let sc = g.node_segment_counts(); + Either::Left(sc.into_par_iter()) + } + List::List { elems } => Either::Right(elems.into_par_iter()), } } } diff --git a/raphtory/src/db/graph/nodes.rs b/raphtory/src/db/graph/nodes.rs index 0682300881..9669a66f14 100644 --- a/raphtory/src/db/graph/nodes.rs +++ b/raphtory/src/db/graph/nodes.rs @@ -171,7 +171,7 @@ where let g = self.graph.core_graph().lock(); let view = self.graph.clone(); let node_types_filter = self.node_types_filter.clone(); - self.node_list().into_par_iter().filter(move |&vid| { + self.node_list().nodes_par_iter(&g).filter(move |&vid| { g.try_core_node(vid).is_some_and(|node| { node_types_filter .as_ref() @@ -199,7 +199,7 @@ where fn iter_vids(&self, g: GraphStorage) -> impl Iterator + Send + Sync + 'graph { let node_types_filter = self.node_types_filter.clone(); let view = self.graph.clone(); - self.node_list().into_iter().filter(move |&vid| { + self.node_list().nodes_iter(&g).filter(move |&vid| { g.try_core_node(vid).is_some_and(|node| { node_types_filter .as_ref() diff --git a/raphtory/src/io/arrow/df_loaders.rs b/raphtory/src/io/arrow/df_loaders.rs index 3d4b73df58..87ab30e9dc 100644 --- a/raphtory/src/io/arrow/df_loaders.rs +++ b/raphtory/src/io/arrow/df_loaders.rs @@ -41,7 +41,7 @@ use std::{ collections::HashMap, sync::{ atomic::{AtomicBool, AtomicUsize, Ordering}, - Arc, + mpsc, Arc, }, usize, }; @@ -232,7 +232,7 @@ pub fn load_nodes_from_df< } pub fn load_edges_from_df( - df_view: DFView>>, + df_view: DFView> + Send>, time: &str, secondary_index: Option<&str>, src: &str, @@ -278,310 +278,316 @@ 
pub fn load_edges_from_df = vec![]; - let mut dst_col_resolved: Vec<(VID, Prop)> = vec![]; + let mut src_col_resolved: Vec = vec![]; + let mut dst_col_resolved: Vec = vec![]; let mut eid_col_resolved: Vec = vec![]; let mut eids_exist: Vec = vec![]; // exists or needs to be created let mut layer_eids_exist: Vec = vec![]; // exists or needs to be created - let mut write_locked_graph = graph.write_lock().map_err(into_graph_err)?; - - // set the type of the resolver; - let chunks = df_view.chunks.peekable(); + rayon::scope(|s| { + let (tx, rx) = mpsc::sync_channel(2); - let num_nodes = AtomicUsize::new(write_locked_graph.graph().internal_num_nodes()); - - for chunk in chunks { - let df = chunk?; - let prop_cols = - combine_properties_arrow(properties, &properties_indices, &df, |key, dtype| { - session - .resolve_edge_property(key, dtype, false) - .map_err(into_graph_err) - })?; - let metadata_cols = - combine_properties_arrow(metadata, &metadata_indices, &df, |key, dtype| { - session - .resolve_edge_property(key, dtype, true) - .map_err(into_graph_err) - })?; + s.spawn(move |_| { + let sender = tx; + for chunk in df_view.chunks { + sender.send(chunk).unwrap() + } + }); + + for chunk in rx.iter() { + let df = chunk?; + let prop_cols = + combine_properties_arrow(properties, &properties_indices, &df, |key, dtype| { + session + .resolve_edge_property(key, dtype, false) + .map_err(into_graph_err) + })?; + let metadata_cols = + combine_properties_arrow(metadata, &metadata_indices, &df, |key, dtype| { + session + .resolve_edge_property(key, dtype, true) + .map_err(into_graph_err) + })?; + + src_col_resolved.resize_with(df.len(), Default::default); + dst_col_resolved.resize_with(df.len(), Default::default); + + // let src_col_shared = atomic_usize_from_mut_slice(cast_slice_mut(&mut src_col_resolved)); + // let dst_col_shared = atomic_usize_from_mut_slice(cast_slice_mut(&mut dst_col_resolved)); + + let layer = lift_layer_col(layer, layer_index, &df)?; + let layer_col_resolved = 
layer.resolve(graph)?; + + let src_col = df.node_col(src_index)?; + src_col.validate(graph, LoadError::MissingSrcError)?; + + let dst_col = df.node_col(dst_index)?; + dst_col.validate(graph, LoadError::MissingDstError)?; + + // avoid allocation of ArcStr + let gid_str_cache = FxDashMap::default(); + // It's our graph, no one else can change it + src_col + .par_iter() + .zip(src_col_resolved.par_iter_mut()) + .try_for_each(|(gid, resolved)| { + let gid = gid.ok_or(LoadError::FatalError)?; + let vid = graph + .resolve_node(gid.as_node_ref()) + .map_err(|_| LoadError::FatalError)?; + + if vid.is_new() && gid_str_cache.get(&gid).is_none() { + gid_str_cache + .entry(gid) + .or_insert_with(|| (Prop::from(gid), vid)); + }; - src_col_resolved.resize_with(df.len(), || (VID(0), Prop::Bool(false))); - dst_col_resolved.resize_with(df.len(), || (VID(0), Prop::Bool(false))); + *resolved = vid.inner(); + Ok::<(), LoadError>(()) + })?; - // let src_col_shared = atomic_usize_from_mut_slice(cast_slice_mut(&mut src_col_resolved)); - // let dst_col_shared = atomic_usize_from_mut_slice(cast_slice_mut(&mut dst_col_resolved)); + dst_col + .par_iter() + .zip(dst_col_resolved.par_iter_mut()) + .try_for_each(|(gid, resolved)| { + let gid = gid.ok_or(LoadError::FatalError)?; - let layer = lift_layer_col(layer, layer_index, &df)?; - let layer_col_resolved = layer.resolve(graph)?; + let vid = graph + .resolve_node(gid.as_node_ref()) + .map_err(|_| LoadError::FatalError)?; - let src_col = df.node_col(src_index)?; - src_col.validate(graph, LoadError::MissingSrcError)?; + if vid.is_new() && gid_str_cache.get(&gid).is_none() { + gid_str_cache + .entry(gid) + .or_insert_with(|| (Prop::from(gid), vid)); + }; - let dst_col = df.node_col(dst_index)?; - dst_col.validate(graph, LoadError::MissingDstError)?; + *resolved = vid.inner(); + Ok::<(), LoadError>(()) + })?; - // avoid allocation of ArcStr - let gid_str_cache: FxDashMap = FxDashMap::default(); - // It's our graph, no one else can change it - 
src_col - .par_iter() - .zip(src_col_resolved.par_iter_mut()) - .try_for_each(|(gid, resolved)| { - let gid = gid.ok_or(LoadError::FatalError)?; - let gid_prop = if let Some(gid_prop) = gid_str_cache.get(&gid) { - gid_prop.value().clone() - } else { - let gid_prop = gid_str_cache.entry(gid).or_insert_with(|| Prop::from(gid)); - gid_prop.value().clone() - }; - let vid = write_locked_graph - .graph() - .resolve_node(gid.as_node_ref()) - .map_err(|_| LoadError::FatalError)?; + let time_col = df.time_col(time_index)?; - if vid.is_new() { - num_nodes.fetch_add(1, Ordering::Relaxed); + // Load the secondary index column if it exists, otherwise generate from start_id. + let secondary_index_col = match secondary_index_index { + Some(col_index) => { + // Update the event_id to reflect ingesting new secondary indices. + let col = df.secondary_index_col(col_index)?; + session + .set_max_event_id(col.max()) + .map_err(into_graph_err)?; + col } - - *resolved = (vid.inner(), gid_prop); - Ok::<(), LoadError>(()) - })?; - - dst_col - .par_iter() - .zip(dst_col_resolved.par_iter_mut()) - .try_for_each(|(gid, resolved)| { - let gid = gid.ok_or(LoadError::FatalError)?; - - let gid_prop = if let Some(gid_prop) = gid_str_cache.get(&gid) { - gid_prop.value().clone() - } else { - let gid_prop = gid_str_cache.entry(gid).or_insert_with(|| Prop::from(gid)); - gid_prop.value().clone() - }; - let vid = write_locked_graph - .graph() - .resolve_node(gid.as_node_ref()) - .map_err(|_| LoadError::FatalError)?; - - if vid.is_new() { - num_nodes.fetch_add(1, Ordering::Relaxed); + None => { + let start_id = session + .reserve_event_ids(df.len()) + .map_err(into_graph_err)?; + SecondaryIndexCol::new_from_range(start_id, start_id + df.len()) } + }; + + let mut write_locked_graph = graph.write_lock().map_err(into_graph_err)?; + + eid_col_resolved.resize_with(df.len(), Default::default); + eids_exist.resize_with(df.len(), Default::default); + layer_eids_exist.resize_with(df.len(), Default::default); + 
let eid_col_shared = atomic_usize_from_mut_slice(cast_slice_mut(&mut eid_col_resolved)); + + let edges = write_locked_graph.graph().storage().edges().clone(); + let next_edge_id = |row: usize| { + let (page, pos) = edges.reserve_free_pos(row); + pos.as_eid(page, edges.max_page_len()) + }; + + let mut per_segment_edge_count = Vec::with_capacity(write_locked_graph.nodes.len()); + per_segment_edge_count + .resize_with(write_locked_graph.nodes.len(), || AtomicUsize::new(0)); + + let WriteLockedGraph { + nodes, ref edges, .. + } = &mut write_locked_graph; + + // Generate all edge_ids + add outbound edges + nodes + .par_iter_mut() + .enumerate() // TODO: change to par_iter_mut but preserve edge_id order + .for_each(|(p_id, locked_page)| { + // Zip all columns for iteration. + let zip = izip!( + src_col_resolved.iter(), + dst_col_resolved.iter(), + time_col.iter(), + secondary_index_col.iter(), + layer_col_resolved.iter() + ); + + for entry in gid_str_cache.iter() { + let (src_gid, vid) = entry.value(); + + if let Some(src_pos) = locked_page.resolve_pos(vid.inner()) { + let mut writer = locked_page.writer(); + writer.store_node_id(src_pos, 0, src_gid.clone(), 0); + } + } - *resolved = (vid.inner(), gid_prop); - Ok::<(), LoadError>(()) - })?; - - let time_col = df.time_col(time_index)?; - - // Load the secondary index column if it exists, otherwise generate from start_id. - let secondary_index_col = match secondary_index_index { - Some(col_index) => { - // Update the event_id to reflect ingesting new secondary indices. 
- let col = df.secondary_index_col(col_index)?; - session - .set_max_event_id(col.max()) - .map_err(into_graph_err)?; - col - } - None => { - let start_id = session - .reserve_event_ids(df.len()) - .map_err(into_graph_err)?; - SecondaryIndexCol::new_from_range(start_id, start_id + df.len()) - } - }; - - write_locked_graph.resize_chunks_to_num_nodes(num_nodes.load(Ordering::Relaxed)); - - eid_col_resolved.resize_with(df.len(), Default::default); - eids_exist.resize_with(df.len(), Default::default); - layer_eids_exist.resize_with(df.len(), Default::default); - let eid_col_shared = atomic_usize_from_mut_slice(cast_slice_mut(&mut eid_col_resolved)); - - let edges = write_locked_graph.graph().storage().edges().clone(); - let next_edge_id = |row: usize| { - let (page, pos) = edges.reserve_free_pos(row); - pos.as_eid(page, edges.max_page_len()) - }; - - let mut per_segment_edge_count = Vec::with_capacity(write_locked_graph.nodes.len()); - per_segment_edge_count.resize_with(write_locked_graph.nodes.len(), || AtomicUsize::new(0)); - - let WriteLockedGraph { - nodes, ref edges, .. 
- } = &mut write_locked_graph; + for (row, (src, dst, time, secondary_index, layer)) in zip.enumerate() { + if let Some(src_pos) = locked_page.resolve_pos(*src) { + let mut writer = locked_page.writer(); + let t = TimeIndexEntry(time, secondary_index); + // find the original EID in the static graph if it exists + // otherwise create a new one + + let edge_id = + if let Some(edge_id) = writer.get_out_edge(src_pos, *dst, 0) { + eid_col_shared[row].store(edge_id.0, Ordering::Relaxed); + eids_exist[row].store(true, Ordering::Relaxed); + edge_id.with_layer(*layer) + } else { + let edge_id = next_edge_id(row); + writer.add_static_outbound_edge(src_pos, *dst, edge_id, 0); + eid_col_shared[row].store(edge_id.0, Ordering::Relaxed); + eids_exist[row].store(false, Ordering::Relaxed); + edge_id.with_layer(*layer) + }; + + if edges.exists(edge_id) + // || writer.get_out_edge(src_pos, *dst, *layer).is_some() + { + layer_eids_exist[row].store(true, Ordering::Relaxed); + // node additions + writer.update_timestamp(t, src_pos, edge_id, 0); + } else { + layer_eids_exist[row].store(false, Ordering::Relaxed); + // actually adds the edge + writer.add_outbound_edge(Some(t), src_pos, *dst, edge_id, 0); + } + } + } + }); - // Generate all edge_ids + add outbound edges - nodes - .iter_mut() // TODO: change to par_iter_mut but preserve edge_id order - .enumerate() - .for_each(|(page_id, locked_page)| { - // Zip all columns for iteration. 
+ // s.spawn(|_| { + write_locked_graph.nodes.par_iter_mut().for_each(|shard| { let zip = izip!( src_col_resolved.iter(), dst_col_resolved.iter(), + eid_col_resolved.iter(), time_col.iter(), secondary_index_col.iter(), - layer_col_resolved.iter() + layer_col_resolved.iter(), + layer_eids_exist.iter().map(|a| a.load(Ordering::Relaxed)), + eids_exist.iter().map(|b| b.load(Ordering::Relaxed)) ); - for (row, ((src, src_gid), (dst, _), time, secondary_index, layer)) in - zip.enumerate() + for ( + src, + dst, + eid, + time, + secondary_index, + layer, + edge_exists_in_layer, + edge_exists_in_static_graph, + ) in zip { - if let Some(src_pos) = locked_page.resolve_pos(*src) { - let mut writer = locked_page.writer(); + if let Some(dst_pos) = shard.resolve_pos(*dst) { let t = TimeIndexEntry(time, secondary_index); - // writer.store_node_id(src_pos, 0, src_gid.clone(), 0); - // find the original EID in the static graph if it exists - // otherwise create a new one - - let edge_id = if let Some(edge_id) = writer.get_out_edge(src_pos, *dst, 0) { - eid_col_shared[row].store(edge_id.0, Ordering::Relaxed); - eids_exist[row].store(true, Ordering::Relaxed); - edge_id.with_layer(*layer) - } else { - let edge_id = next_edge_id(row); - - writer.add_static_outbound_edge(src_pos, *dst, edge_id, 0); - eid_col_shared[row].store(edge_id.0, Ordering::Relaxed); - eids_exist[row].store(false, Ordering::Relaxed); - edge_id.with_layer(*layer) - }; - - if edges.exists(edge_id) - // || writer.get_out_edge(src_pos, *dst, *layer).is_some() - { - layer_eids_exist[row].store(true, Ordering::Relaxed); - // node additions - writer.update_timestamp(t, src_pos, edge_id, 0); - } else { - layer_eids_exist[row].store(false, Ordering::Relaxed); - // actually adds the edge - writer.add_outbound_edge(Some(t), src_pos, *dst, edge_id, 0); + let mut writer = shard.writer(); + + if !edge_exists_in_static_graph { + writer.add_static_inbound_edge(dst_pos, *src, *eid, 0); } - 
per_segment_edge_count[page_id].fetch_add(1, Ordering::Relaxed); + if !edge_exists_in_layer { + writer.add_inbound_edge( + Some(t), + dst_pos, + *src, + eid.with_layer(*layer), + 0, + ); + } else { + writer.update_timestamp(t, dst_pos, eid.with_layer(*layer), 0); + } } } }); + // }); - let aprox_num_edges = write_locked_graph.graph().internal_num_edges() + df.len(); - - write_locked_graph.resize_chunks_to_num_edges(aprox_num_edges); - - // rayon::scope(|sc| { - // // Add inbound edges - // sc.spawn(|_| { - // write_locked_graph - // .nodes - // .par_iter_mut() - // .enumerate() - // .for_each(|(page_id, shard)| { - // let zip = izip!( - // src_col_resolved.iter(), - // dst_col_resolved.iter(), - // dst_col.iter(), - // eid_col_resolved.iter(), - // time_col.iter(), - // secondary_index_col.iter(), - // layer_col_resolved.iter(), - // layer_eids_exist.iter().map(|a| a.load(Ordering::Relaxed)), - // eids_exist.iter().map(|b| b.load(Ordering::Relaxed)) - // ); - - // for ( - // src, - // dst, - // dst_gid, - // eid, - // time, - // secondary_index, - // layer, - // edge_exists_in_layer, - // edge_exists_in_static_graph, - // ) in zip - // { - // if let Some(dst_pos) = shard.resolve_pos(*dst) { - // let t = TimeIndexEntry(time, secondary_index); - // let mut writer = shard.writer(); - - // writer.store_node_id(dst_pos, 0, dst_gid, 0); - - // if !edge_exists_in_static_graph { - // writer.add_static_inbound_edge(dst_pos, *src, *eid, 0); - // } - - // if !edge_exists_in_layer { - // writer.add_inbound_edge( - // Some(t), - // dst_pos, - // *src, - // eid.with_layer(*layer), - // 0, - // ); - // } else { - // writer.update_timestamp(t, dst_pos, eid.with_layer(*layer), 0); - // } - - // per_segment_edge_count[page_id].fetch_add(1, Ordering::Relaxed); - // } - // } - // }); - // }); - - // // Add temporal & constant properties to edges - // sc.spawn(|_| { - // write_locked_graph.edges.par_iter_mut().for_each(|shard| { - // let zip = izip!( - // src_col_resolved.iter(), - // 
dst_col_resolved.iter(), - // time_col.iter(), - // secondary_index_col.iter(), - // eid_col_resolved.iter(), - // layer_col_resolved.iter(), - // eids_exist - // .iter() - // .map(|exists| exists.load(Ordering::Relaxed)) - // ); - // let mut t_props: Vec<(usize, Prop)> = vec![]; - // let mut c_props: Vec<(usize, Prop)> = vec![]; - - // for (row, (src, dst, time, secondary_index, eid, layer, exists)) in - // zip.enumerate() - // { - // if let Some(eid_pos) = shard.resolve_pos(*eid) { - // let t = TimeIndexEntry(time, secondary_index); - // let mut writer = shard.writer(); - - // t_props.clear(); - // t_props.extend(prop_cols.iter_row(row)); - - // c_props.clear(); - // c_props.extend(metadata_cols.iter_row(row)); - // c_props.extend_from_slice(&shared_metadata); - - // writer.bulk_add_edge( - // t, - // eid_pos, - // *src, - // *dst, - // exists, - // *layer, - // c_props.drain(..), - // t_props.drain(..), - // 0, - // ); - // } - // } - // }); - // }); - // }); - - // #[cfg(feature = "python")] - let _ = pb.update(df.len()); - } + drop(write_locked_graph); + + let mut write_locked_graph = graph.write_lock().map_err(into_graph_err)?; + + let per_edge_segment_event_count = (0..write_locked_graph.edges.len()) + .map(|_| AtomicUsize::new(0)) + .collect::>(); + + // // Add temporal & constant properties to edges + // sc.spawn(|_| { + let now = std::time::Instant::now(); + write_locked_graph + .edges + .par_iter_mut() + .enumerate() + .for_each(|(seg_id, shard)| { + let zip = izip!( + src_col_resolved.iter(), + dst_col_resolved.iter(), + time_col.iter(), + secondary_index_col.iter(), + eid_col_resolved.iter(), + layer_col_resolved.iter(), + eids_exist + .iter() + .map(|exists| exists.load(Ordering::Relaxed)) + ); + let mut t_props: Vec<(usize, Prop)> = vec![]; + let mut c_props: Vec<(usize, Prop)> = vec![]; + + for (row, (src, dst, time, secondary_index, eid, layer, exists)) in + zip.enumerate() + { + if let Some(eid_pos) = shard.resolve_pos(*eid) { + 
per_edge_segment_event_count[seg_id].fetch_add(1, Ordering::Relaxed); + let t = TimeIndexEntry(time, secondary_index); + let mut writer = shard.writer(); + + t_props.clear(); + t_props.extend(prop_cols.iter_row(row)); + + c_props.clear(); + c_props.extend(metadata_cols.iter_row(row)); + c_props.extend_from_slice(&shared_metadata); + + writer.bulk_add_edge( + t, + eid_pos, + *src, + *dst, + exists, + *layer, + c_props.drain(..), + t_props.drain(..), + 0, + ); + } + } + }); + + println!("Loading edge events took {:?}", now.elapsed()); + // }); + // }); + // + // println!( + // "Per edge segment event count: {:?}", + // per_edge_segment_event_count + // ); + + // #[cfg(feature = "python")] + let _ = pb.update(df.len()); + } + Ok::<_, GraphError>(()) + })?; + // set the type of the resolver; Ok(()) } diff --git a/raphtory/src/python/graph/io/pandas_loaders.rs b/raphtory/src/python/graph/io/pandas_loaders.rs index 6a8b6abfca..68792567f7 100644 --- a/raphtory/src/python/graph/io/pandas_loaders.rs +++ b/raphtory/src/python/graph/io/pandas_loaders.rs @@ -207,7 +207,7 @@ pub fn load_edge_deletions_from_pandas< pub(crate) fn process_pandas_py_df<'a>( df: &Bound<'a, PyAny>, col_names: Vec<&str>, -) -> PyResult> + 'a>> { +) -> PyResult> + Send + 'a>> { let py = df.py(); is_jupyter(py); py.import("pandas")?; From b5df1bb27a76bebc21983b9b3aba2af9aba2033f Mon Sep 17 00:00:00 2001 From: Fabian Murariu Date: Wed, 3 Dec 2025 14:09:02 +0000 Subject: [PATCH 04/24] added first draft of state mapping VID -> value --- Cargo.lock | 2 + db4-storage/src/state.rs | 391 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 393 insertions(+) create mode 100644 db4-storage/src/state.rs diff --git a/Cargo.lock b/Cargo.lock index 6f9620bdcb..2bc455c67a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2157,6 +2157,7 @@ dependencies = [ "sysinfo", "tempfile", "thiserror 2.0.17", + "tinyvec", ] [[package]] @@ -6585,6 +6586,7 @@ version = "1.10.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa" dependencies = [ + "serde", "tinyvec_macros", ] diff --git a/db4-storage/src/state.rs b/db4-storage/src/state.rs new file mode 100644 index 0000000000..d5d4c06014 --- /dev/null +++ b/db4-storage/src/state.rs @@ -0,0 +1,391 @@ +use std::ops::{Index, IndexMut}; + +/// Address resolver for sharded storage with fixed-size chunks +/// +/// Given a sharding scheme where items are distributed across chunks: +/// - chunk_id = index / max_page_len +/// - local_pos = index % max_page_len +/// +/// This struct provides O(1) lookup to map any index to a cell in a flat array, +/// accounting for partially filled chunks. +/// +/// # Example +/// With max_page_len = 1000: +/// - Chunk 0: 1000 items (offsets[0] = 0, offsets[1] = 1000) +/// - Chunk 1: 500 items (offsets[1] = 1000, offsets[2] = 1500) +/// - Chunk 2: 1000 items (offsets[2] = 1500, offsets[3] = 2500) +/// - state: Array of length 2500 +/// +/// To access index 1200: +/// - chunk = 1200 / 1000 = 1 +/// - local_pos = 1200 % 1000 = 200 +/// - cell_index = offsets[1] + 200 = 1000 + 200 = 1200 +#[derive(Debug)] +pub struct State { + /// Cumulative offsets: offsets[chunk_id] = starting position in `state` for that chunk + /// Length is equal to number of chunks + 1 (includes final cumulative value) + offsets: Box<[usize]>, + /// Flat array of state cells + state: Box<[A]>, + /// Maximum items per chunk + max_page_len: u32, +} + +impl State { + /// Create a new State with the given chunk configuration + /// + /// # Arguments + /// * `chunk_sizes` - The actual size of each chunk (can be <= max_page_len) + /// * `max_page_len` - Maximum capacity of each chunk + /// + /// # Example + /// ``` + /// use db4_storage::state::State; + /// use std::sync::atomic::AtomicUsize; + /// + /// // 3 chunks with sizes 1000, 500, 1000 and max capacity 1000 + /// let state: State = State::new(vec![1000, 500, 
1000], 1000); + /// ``` + pub fn new(chunk_sizes: Vec, max_page_len: u32) -> Self { + let num_chunks = chunk_sizes.len(); + let total_size: usize = chunk_sizes.iter().sum(); + + // Build cumulative offsets (includes final cumulative value) + let mut offsets = Vec::with_capacity(num_chunks + 1); + let mut cumulative = 0; + for size in chunk_sizes { + offsets.push(cumulative); + cumulative += size; + } + offsets.push(cumulative); // Add final cumulative value + + // Initialize state array with default values + let state: Box<[A]> = (0..total_size) + .map(|_| A::default()) + .collect::>() + .into_boxed_slice(); + + Self { + offsets: offsets.into_boxed_slice(), + state, + max_page_len, + } + } + + /// Get a reference to the cell for the given global index + /// + /// # Arguments + /// * `index` - Global index across all chunks + /// + /// # Returns + /// Some(&A) if the index is valid, None otherwise + #[inline(always)] + pub fn get(&self, index: usize) -> Option<&A> { + let chunk = index / self.max_page_len as usize; + let local_pos = index % self.max_page_len as usize; + + let offset = *self.offsets.get(chunk)?; + let cell_index = offset + local_pos; + + self.state.get(cell_index) + } + + /// Get a mutable reference to the cell for the given global index + /// + /// # Arguments + /// * `index` - Global index across all chunks + /// + /// # Returns + /// Some(&mut A) if the index is valid, None otherwise + #[inline(always)] + pub fn get_mut(&mut self, index: usize) -> Option<&mut A> { + let chunk = index / self.max_page_len as usize; + let local_pos = index % self.max_page_len as usize; + + let offset = *self.offsets.get(chunk)?; + let cell_index = offset + local_pos; + + self.state.get_mut(cell_index) + } + + /// Get a reference to the cell for the given global index without bounds checking + /// + /// # Arguments + /// * `index` - Global index across all chunks + /// + /// # Returns + /// Reference to the corresponding cell + /// + /// # Safety + /// Panics if the 
index is out of bounds + #[inline(always)] + pub fn get_unchecked(&self, index: usize) -> &A { + let chunk = index / self.max_page_len as usize; + let local_pos = index % self.max_page_len as usize; + + let offset = self.offsets[chunk]; + let cell_index = offset + local_pos; + + &self.state[cell_index] + } + + /// Get a mutable reference to the cell for the given global index without bounds checking + /// + /// # Arguments + /// * `index` - Global index across all chunks + /// + /// # Returns + /// Mutable reference to the corresponding cell + /// + /// # Safety + /// Panics if the index is out of bounds + #[inline(always)] + pub fn get_mut_unchecked(&mut self, index: usize) -> &mut A { + let chunk = index / self.max_page_len as usize; + let local_pos = index % self.max_page_len as usize; + + let offset = self.offsets[chunk]; + let cell_index = offset + local_pos; + + &mut self.state[cell_index] + } + + /// Get the number of chunks + #[inline] + pub fn num_chunks(&self) -> usize { + self.offsets.len().saturating_sub(1) + } + + /// Get the total number of state cells + #[inline] + pub fn len(&self) -> usize { + self.state.len() + } + + /// Check if the state is empty + #[inline] + pub fn is_empty(&self) -> bool { + self.state.is_empty() + } + + /// Get the maximum page length + #[inline] + pub fn max_page_len(&self) -> u32 { + self.max_page_len + } +} + +impl Index for State { + type Output = A; + + #[inline(always)] + fn index(&self, index: usize) -> &Self::Output { + self.get(index) + .unwrap_or_else(|| panic!("index out of bounds: {}", index)) + } +} + +impl IndexMut for State { + #[inline(always)] + fn index_mut(&mut self, index: usize) -> &mut Self::Output { + self.get_mut(index) + .unwrap_or_else(|| panic!("index out of bounds: {}", index)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::atomic::{AtomicUsize, Ordering}; + + #[test] + fn test_basic_get() { + let state: State = State::new(vec![1000, 500, 1000], 1000); + + // Test chunk 0 + 
state.get_unchecked(0).store(42, Ordering::Relaxed); + assert_eq!(state.get_unchecked(0).load(Ordering::Relaxed), 42); + + state.get_unchecked(999).store(123, Ordering::Relaxed); + assert_eq!(state.get_unchecked(999).load(Ordering::Relaxed), 123); + + // Test chunk 1 (offset should be 1000) + state.get_unchecked(1000).store(77, Ordering::Relaxed); + assert_eq!(state.get_unchecked(1000).load(Ordering::Relaxed), 77); + + state.get_unchecked(1499).store(88, Ordering::Relaxed); + assert_eq!(state.get_unchecked(1499).load(Ordering::Relaxed), 88); + + // Test chunk 2 (offset should be 1500) + state.get_unchecked(2000).store(99, Ordering::Relaxed); + assert_eq!(state.get_unchecked(2000).load(Ordering::Relaxed), 99); + + state.get_unchecked(2999).store(111, Ordering::Relaxed); + assert_eq!(state.get_unchecked(2999).load(Ordering::Relaxed), 111); + } + + #[test] + fn test_get_option() { + let state: State = State::new(vec![100, 50], 100); + + assert!(state.get(0).is_some()); + assert!(state.get(99).is_some()); + assert!(state.get(100).is_some()); + assert!(state.get(149).is_some()); + + // Out of bounds chunk + assert!(state.get(200).is_none()); + assert!(state.get(1000).is_none()); + } + + #[test] + #[should_panic] + fn test_out_of_bounds_chunk() { + let state: State = State::new(vec![100], 100); + state.get_unchecked(200); // Should panic + } + + #[test] + fn test_partially_filled_chunks() { + // Simulate real scenario: chunks with varying fill levels + let state: State = State::new(vec![1000, 300, 1000, 50], 1000); + + // First chunk - fully filled + state.get_unchecked(0).store(1, Ordering::Relaxed); + state.get_unchecked(999).store(2, Ordering::Relaxed); + assert_eq!(state.get_unchecked(0).load(Ordering::Relaxed), 1); + assert_eq!(state.get_unchecked(999).load(Ordering::Relaxed), 2); + + // Second chunk - partially filled (300 items) + // Global indices: 1000-1299 + state.get_unchecked(1000).store(3, Ordering::Relaxed); + state.get_unchecked(1299).store(4, 
Ordering::Relaxed); + assert_eq!(state.get_unchecked(1000).load(Ordering::Relaxed), 3); + assert_eq!(state.get_unchecked(1299).load(Ordering::Relaxed), 4); + + // Third chunk - fully filled + // Global indices: 2000-2999 + state.get_unchecked(2000).store(5, Ordering::Relaxed); + state.get_unchecked(2999).store(6, Ordering::Relaxed); + assert_eq!(state.get_unchecked(2000).load(Ordering::Relaxed), 5); + assert_eq!(state.get_unchecked(2999).load(Ordering::Relaxed), 6); + + // Fourth chunk - minimally filled (50 items) + // Global indices: 3000-3049 + state.get_unchecked(3000).store(7, Ordering::Relaxed); + state.get_unchecked(3049).store(8, Ordering::Relaxed); + assert_eq!(state.get_unchecked(3000).load(Ordering::Relaxed), 7); + assert_eq!(state.get_unchecked(3049).load(Ordering::Relaxed), 8); + + assert_eq!(state.len(), 2350); // 1000 + 300 + 1000 + 50 + assert_eq!(state.num_chunks(), 4); + } + + #[test] + fn test_resolve_pos_consistency() { + // Test that our addressing matches the resolve_pos function + let max_page_len = 1000u32; + let state: State = State::new(vec![1000, 500, 1000], max_page_len); + + // Helper to simulate resolve_pos + let resolve_pos = |i: usize| -> (usize, u32) { + let chunk = i / max_page_len as usize; + let pos = (i % max_page_len as usize) as u32; + (chunk, pos) + }; + + for index in [0, 500, 999, 1000, 1250, 1499, 2000, 2500, 2999] { + let (chunk, local_pos) = resolve_pos(index); + + // Verify our addressing scheme matches + let computed_chunk = index / max_page_len as usize; + let computed_local = index % max_page_len as usize; + + assert_eq!(chunk, computed_chunk); + assert_eq!(local_pos, computed_local as u32); + + // Verify we can access the cell + state.get_unchecked(index).store(index, Ordering::Relaxed); + assert_eq!(state.get_unchecked(index).load(Ordering::Relaxed), index); + } + } + + #[test] + fn test_generic_over_different_types() { + // Test with usize + let state_usize: State = State::new(vec![10, 5], 10); + 
assert_eq!(*state_usize.get_unchecked(0), 0); + assert_eq!(*state_usize.get_unchecked(10), 0); + + // Test with Option + let state_option: State> = State::new(vec![10, 5], 10); + assert_eq!(*state_option.get_unchecked(0), None); + assert_eq!(*state_option.get_unchecked(10), None); + + // Test with AtomicUsize + let state_atomic: State = State::new(vec![10, 5], 10); + state_atomic.get_unchecked(0).store(42, Ordering::Relaxed); + assert_eq!(state_atomic.get_unchecked(0).load(Ordering::Relaxed), 42); + } + + #[test] + fn test_mutable_access() { + let mut state: State = State::new(vec![100, 50], 100); + + // Test get_mut + *state.get_mut(0).unwrap() = 42; + assert_eq!(*state.get(0).unwrap(), 42); + + *state.get_mut(50).unwrap() = 99; + assert_eq!(*state.get(50).unwrap(), 99); + + // Test get_mut in second chunk + *state.get_mut(100).unwrap() = 123; + assert_eq!(*state.get(100).unwrap(), 123); + + // Test get_mut_unchecked + *state.get_mut_unchecked(10) = 77; + assert_eq!(*state.get_unchecked(10), 77); + + // Test out of bounds returns None + assert!(state.get_mut(200).is_none()); + } + + #[test] + fn test_index_trait() { + let mut state: State = State::new(vec![100, 50], 100); + + // Test Index trait + state[0] = 42; + assert_eq!(state[0], 42); + + state[99] = 100; + assert_eq!(state[99], 100); + + // Test in second chunk + state[100] = 200; + assert_eq!(state[100], 200); + + state[149] = 300; + assert_eq!(state[149], 300); + } + + #[test] + #[should_panic(expected = "index out of bounds")] + fn test_index_out_of_bounds() { + let state: State = State::new(vec![100], 100); + let _ = state[200]; + } + + #[test] + fn test_offsets_include_final_cumulative() { + let state: State = State::new(vec![1000, 500, 1000], 1000); + + // offsets should be [0, 1000, 1500, 2500] + assert_eq!(state.num_chunks(), 3); + assert_eq!(state.len(), 2500); + + // Verify the final offset equals total length + assert_eq!(state.offsets[state.num_chunks()], state.len()); + } +} From 
045ecb472a5aedf040fb7408482b7a7994bfc718 Mon Sep 17 00:00:00 2001 From: Fabian Murariu Date: Thu, 4 Dec 2025 11:42:01 +0000 Subject: [PATCH 05/24] use state index for node state when necessary --- db4-storage/src/pages/mod.rs | 21 +- db4-storage/src/state.rs | 532 +++++++++++++++--- raphtory-storage/src/graph/graph.rs | 6 +- .../algorithms/components/in_components.rs | 5 +- .../algorithms/components/out_components.rs | 5 +- .../algorithms/dynamics/temporal/epidemics.rs | 3 +- .../local_clustering_coefficient_batch.rs | 9 +- raphtory/src/algorithms/pathing/dijkstra.rs | 6 +- .../pathing/single_source_shortest_path.rs | 9 +- raphtory/src/db/api/state/group_by.rs | 5 +- raphtory/src/db/api/state/lazy_node_state.rs | 34 +- raphtory/src/db/api/state/node_state.rs | 112 ++-- raphtory/src/db/api/state/node_state_ops.rs | 13 +- raphtory/src/db/graph/nodes.rs | 20 +- .../src/python/graph/io/pandas_loaders.rs | 29 +- raphtory/src/python/packages/algorithms.rs | 5 +- 16 files changed, 600 insertions(+), 214 deletions(-) diff --git a/db4-storage/src/pages/mod.rs b/db4-storage/src/pages/mod.rs index afe4195f99..ed2fdd9ff7 100644 --- a/db4-storage/src/pages/mod.rs +++ b/db4-storage/src/pages/mod.rs @@ -450,7 +450,7 @@ pub struct SegmentCounts { _marker: std::marker::PhantomData, } -impl + Send> SegmentCounts { +impl> SegmentCounts { pub fn new(max_seg_len: u32, counts: impl IntoIterator) -> Self { let counts: TinyVec<[u32; node_store::N]> = counts.into_iter().collect(); @@ -469,11 +469,20 @@ impl + Send> SegmentCounts { }) } + pub(crate) fn counts(&self) -> &[u32] { + &self.counts + } + + pub(crate) fn max_seg_len(&self) -> u32 { + self.max_seg_len + } +} +impl + Send> SegmentCounts { pub fn into_par_iter(self) -> impl ParallelIterator { let max_seg_len = self.max_seg_len as usize; (0..self.counts.len()).into_par_iter().flat_map(move |i| { let c = self.counts[i]; - let g_pos = i * max_seg_len as usize; + let g_pos = i * max_seg_len; (0..c) .into_par_iter() .map(move |offset| 
I::from(g_pos + offset as usize)) @@ -485,10 +494,10 @@ impl Drop for GraphStore { fn drop(&mut self) { let node_types = self.nodes.prop_meta().get_all_node_types(); self._ext.set_node_types(node_types); - if let Some(graph_dir) = self.graph_dir.as_ref() { - if write_graph_config(graph_dir, &self._ext).is_err() { - eprintln!("Unrecoverable! Failed to write graph meta"); - } + if let Some(graph_dir) = self.graph_dir.as_ref() + && write_graph_config(graph_dir, &self._ext).is_err() + { + eprintln!("Unrecoverable! Failed to write graph meta"); } } } diff --git a/db4-storage/src/state.rs b/db4-storage/src/state.rs index d5d4c06014..1596b95867 100644 --- a/db4-storage/src/state.rs +++ b/db4-storage/src/state.rs @@ -1,12 +1,15 @@ +use rayon::prelude::*; use std::ops::{Index, IndexMut}; -/// Address resolver for sharded storage with fixed-size chunks +use crate::pages::SegmentCounts; + +/// Index resolver for sharded storage with fixed-size chunks /// /// Given a sharding scheme where items are distributed across chunks: /// - chunk_id = index / max_page_len /// - local_pos = index % max_page_len /// -/// This struct provides O(1) lookup to map any index to a cell in a flat array, +/// This struct provides O(1) lookup to map any global index to a flat array position, /// accounting for partially filled chunks. 
/// /// # Example @@ -14,24 +17,235 @@ use std::ops::{Index, IndexMut}; /// - Chunk 0: 1000 items (offsets[0] = 0, offsets[1] = 1000) /// - Chunk 1: 500 items (offsets[1] = 1000, offsets[2] = 1500) /// - Chunk 2: 1000 items (offsets[2] = 1500, offsets[3] = 2500) -/// - state: Array of length 2500 /// -/// To access index 1200: +/// To resolve index 1200: /// - chunk = 1200 / 1000 = 1 /// - local_pos = 1200 % 1000 = 200 -/// - cell_index = offsets[1] + 200 = 1000 + 200 = 1200 -#[derive(Debug)] -pub struct State { - /// Cumulative offsets: offsets[chunk_id] = starting position in `state` for that chunk +/// - flat_index = offsets[1] + 200 = 1000 + 200 = 1200 +#[derive(Debug, Clone)] +pub struct StateIndex { + /// Cumulative offsets: offsets[chunk_id] = starting position in flat array for that chunk /// Length is equal to number of chunks + 1 (includes final cumulative value) offsets: Box<[usize]>, - /// Flat array of state cells - state: Box<[A]>, /// Maximum items per chunk max_page_len: u32, + /// Phantom data for index type + _marker: std::marker::PhantomData, +} + +impl From> for StateIndex +where + I: From + Into, +{ + fn from(counts: SegmentCounts) -> Self { + Self::new( + counts.counts().iter().map(|c| *c as usize), + counts.max_seg_len(), + ) + } +} + +impl + Into> StateIndex { + /// Create a new StateIndex with the given chunk configuration + /// + /// # Arguments + /// * `chunk_sizes` - The actual size of each chunk (can be <= max_page_len) + /// * `max_page_len` - Maximum capacity of each chunk + pub fn new(chunk_sizes: impl IntoIterator, max_page_len: u32) -> Self { + // Build cumulative offsets (includes final cumulative value) + let mut offsets = Vec::new(); + let mut cumulative = 0; + for size in chunk_sizes { + offsets.push(cumulative); + cumulative += size; + } + offsets.push(cumulative); // Add final cumulative value + + Self { + offsets: offsets.into_boxed_slice(), + max_page_len, + _marker: std::marker::PhantomData, + } + } + + /// Resolve a 
global index to a flat array index + /// + /// # Arguments + /// * `index` - Global index across all chunks + /// + /// # Returns + /// Some(flat_index) if the index is valid, None otherwise + #[inline(always)] + pub fn resolve(&self, index: I) -> Option { + let index: usize = index.into(); + let chunk = index / self.max_page_len as usize; + let local_pos = index % self.max_page_len as usize; + + let offset = *self.offsets.get(chunk)?; + let flat_index = offset + local_pos; + + // Verify the flat_index is within bounds of this chunk + let next_offset = *self.offsets.get(chunk + 1)?; + if flat_index < next_offset { + Some(flat_index) + } else { + None + } + } + + /// Resolve a global index to a flat array index without bounds checking + /// + /// # Arguments + /// * `index` - Global index across all chunks + /// + /// # Returns + /// The flat array index + /// + /// # Safety + /// Panics if the index is out of bounds + #[inline(always)] + pub fn resolve_unchecked(&self, index: I) -> usize { + let index: usize = index.into(); + let chunk = index / self.max_page_len as usize; + let local_pos = index % self.max_page_len as usize; + + let offset = self.offsets[chunk]; + offset + local_pos + } + + /// Get the number of chunks + #[inline] + pub fn num_chunks(&self) -> usize { + self.offsets.len().saturating_sub(1) + } + + /// Get the total number of items across all chunks + #[inline] + pub fn total_len(&self) -> usize { + self.offsets[self.num_chunks()] + } + + /// Get the maximum page length + #[inline] + pub fn max_page_len(&self) -> u32 { + self.max_page_len + } + + /// Create an iterator over all valid global indices + /// + /// This iterates through all chunks and yields the global indices for each item. 
+ /// For example, with chunk_sizes [10, 1, 5] and max_page_len 10: + /// - Chunk 0: yields 0..10 + /// - Chunk 1: yields 10..11 + /// - Chunk 2: yields 20..25 + pub fn iter(&self) -> StateIndexIter<'_, I> { + StateIndexIter { + index: self, + current_chunk: 0, + current_local: 0, + } + } + + /// Create a parallel iterator over all valid global indices with their flat indices + /// + /// This iterates through all chunks in parallel and yields tuples of (flat_index, global_index). + /// The flat_index starts at 0 and increments for each item in iteration order. + /// + /// For example, with chunk_sizes [10, 1, 5] and max_page_len 10: + /// - Chunk 0: yields (0, 0)..(9, 9) + /// - Chunk 1: yields (10, 10) + /// - Chunk 2: yields (11, 20)..(15, 24) + pub fn par_iter(&self) -> impl ParallelIterator + '_ + where + I: Send + Sync, + { + let max_page_len = self.max_page_len as usize; + let num_chunks = self.num_chunks(); + (0..num_chunks).into_par_iter().flat_map(move |chunk_idx| { + let chunk_start = self.offsets[chunk_idx]; + let chunk_end = self.offsets[chunk_idx + 1]; + let chunk_size = chunk_end - chunk_start; + let global_base = chunk_idx * max_page_len; + (0..chunk_size).into_par_iter().map(move |local_offset| { + let flat_idx = chunk_start + local_offset; + let global_idx = I::from(global_base + local_offset); + (flat_idx, global_idx) + }) + }) + } +} + +/// Iterator over global indices in a StateIndex +#[derive(Debug)] +pub struct StateIndexIter<'a, I> { + index: &'a StateIndex, + current_chunk: usize, + current_local: usize, +} + +impl<'a, I: From + Into> Iterator for StateIndexIter<'a, I> { + type Item = I; + + fn next(&mut self) -> Option { + loop { + if self.current_chunk >= self.index.num_chunks() { + return None; + } + + let chunk_start = self.index.offsets[self.current_chunk]; + let chunk_end = self.index.offsets[self.current_chunk + 1]; + let chunk_size = chunk_end - chunk_start; + + if self.current_local < chunk_size { + let global_idx = + 
self.current_chunk * self.index.max_page_len as usize + self.current_local; + self.current_local += 1; + return Some(I::from(global_idx)); + } + + // Move to next chunk + self.current_chunk += 1; + self.current_local = 0; + } + } + + fn size_hint(&self) -> (usize, Option) { + let total = self.index.total_len(); + let consumed = if self.current_chunk < self.index.num_chunks() { + self.index.offsets[self.current_chunk] + self.current_local + } else { + total + }; + let remaining = total.saturating_sub(consumed); + (remaining, Some(remaining)) + } } -impl State { +impl<'a, I: From + Into> ExactSizeIterator for StateIndexIter<'a, I> { + fn len(&self) -> usize { + let total = self.index.total_len(); + let consumed = if self.current_chunk < self.index.num_chunks() { + self.index.offsets[self.current_chunk] + self.current_local + } else { + total + }; + total.saturating_sub(consumed) + } +} + +/// Address resolver for sharded storage with fixed-size chunks +/// +/// This struct combines a StateIndex with a flat array to provide O(1) access +/// to elements in a sharded storage scheme with partially filled chunks. 
+#[derive(Debug)] +pub struct State { + /// Index resolver + index: StateIndex, + /// Flat array of state cells + state: Box<[A]>, +} + +impl + Into> State { /// Create a new State with the given chunk configuration /// /// # Arguments @@ -47,17 +261,8 @@ impl State { /// let state: State = State::new(vec![1000, 500, 1000], 1000); /// ``` pub fn new(chunk_sizes: Vec, max_page_len: u32) -> Self { - let num_chunks = chunk_sizes.len(); - let total_size: usize = chunk_sizes.iter().sum(); - - // Build cumulative offsets (includes final cumulative value) - let mut offsets = Vec::with_capacity(num_chunks + 1); - let mut cumulative = 0; - for size in chunk_sizes { - offsets.push(cumulative); - cumulative += size; - } - offsets.push(cumulative); // Add final cumulative value + let index = StateIndex::::new(chunk_sizes, max_page_len); + let total_size = index.total_len(); // Initialize state array with default values let state: Box<[A]> = (0..total_size) @@ -65,11 +270,13 @@ impl State { .collect::>() .into_boxed_slice(); - Self { - offsets: offsets.into_boxed_slice(), - state, - max_page_len, - } + Self { index, state } + } + + /// Get a reference to the StateIndex + #[inline] + pub fn index(&self) -> &StateIndex { + &self.index } /// Get a reference to the cell for the given global index @@ -80,14 +287,9 @@ impl State { /// # Returns /// Some(&A) if the index is valid, None otherwise #[inline(always)] - pub fn get(&self, index: usize) -> Option<&A> { - let chunk = index / self.max_page_len as usize; - let local_pos = index % self.max_page_len as usize; - - let offset = *self.offsets.get(chunk)?; - let cell_index = offset + local_pos; - - self.state.get(cell_index) + pub fn get(&self, index: I) -> Option<&A> { + let flat_index = self.index.resolve(index)?; + self.state.get(flat_index) } /// Get a mutable reference to the cell for the given global index @@ -98,14 +300,9 @@ impl State { /// # Returns /// Some(&mut A) if the index is valid, None otherwise #[inline(always)] - 
pub fn get_mut(&mut self, index: usize) -> Option<&mut A> { - let chunk = index / self.max_page_len as usize; - let local_pos = index % self.max_page_len as usize; - - let offset = *self.offsets.get(chunk)?; - let cell_index = offset + local_pos; - - self.state.get_mut(cell_index) + pub fn get_mut(&mut self, index: I) -> Option<&mut A> { + let flat_index = self.index.resolve(index)?; + self.state.get_mut(flat_index) } /// Get a reference to the cell for the given global index without bounds checking @@ -119,14 +316,9 @@ impl State { /// # Safety /// Panics if the index is out of bounds #[inline(always)] - pub fn get_unchecked(&self, index: usize) -> &A { - let chunk = index / self.max_page_len as usize; - let local_pos = index % self.max_page_len as usize; - - let offset = self.offsets[chunk]; - let cell_index = offset + local_pos; - - &self.state[cell_index] + pub fn get_unchecked(&self, index: I) -> &A { + let flat_index = self.index.resolve_unchecked(index); + &self.state[flat_index] } /// Get a mutable reference to the cell for the given global index without bounds checking @@ -140,20 +332,15 @@ impl State { /// # Safety /// Panics if the index is out of bounds #[inline(always)] - pub fn get_mut_unchecked(&mut self, index: usize) -> &mut A { - let chunk = index / self.max_page_len as usize; - let local_pos = index % self.max_page_len as usize; - - let offset = self.offsets[chunk]; - let cell_index = offset + local_pos; - - &mut self.state[cell_index] + pub fn get_mut_unchecked(&mut self, index: I) -> &mut A { + let flat_index = self.index.resolve_unchecked(index); + &mut self.state[flat_index] } /// Get the number of chunks #[inline] pub fn num_chunks(&self) -> usize { - self.offsets.len().saturating_sub(1) + self.index.num_chunks() } /// Get the total number of state cells @@ -171,25 +358,63 @@ impl State { /// Get the maximum page length #[inline] pub fn max_page_len(&self) -> u32 { - self.max_page_len + self.index.max_page_len() + } + + /// Create an 
iterator over all elements in the state + /// + /// Yields references to each element in order of their global indices. + pub fn iter(&self) -> StateIter<'_, A, I> { + StateIter { + state: self, + inner: self.index.iter(), + } + } +} + +/// Iterator over elements in a State +#[derive(Debug)] +pub struct StateIter<'a, A, I> { + state: &'a State, + inner: StateIndexIter<'a, I>, +} + +impl<'a, A: Default, I: From + Into> Iterator for StateIter<'a, A, I> { + type Item = &'a A; + + fn next(&mut self) -> Option { + let global_idx = self.inner.next()?; + Some(self.state.get_unchecked(global_idx)) + } + + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } +} + +impl<'a, A: Default, I: From + Into> ExactSizeIterator for StateIter<'a, A, I> { + fn len(&self) -> usize { + self.inner.len() } } -impl Index for State { +impl + Into + std::fmt::Debug + Copy> Index for State { type Output = A; #[inline(always)] - fn index(&self, index: usize) -> &Self::Output { + fn index(&self, index: I) -> &Self::Output { self.get(index) - .unwrap_or_else(|| panic!("index out of bounds: {}", index)) + .unwrap_or_else(|| panic!("index out of bounds: {:?}", index)) } } -impl IndexMut for State { +impl + Into + std::fmt::Debug + Copy> IndexMut + for State +{ #[inline(always)] - fn index_mut(&mut self, index: usize) -> &mut Self::Output { + fn index_mut(&mut self, index: I) -> &mut Self::Output { self.get_mut(index) - .unwrap_or_else(|| panic!("index out of bounds: {}", index)) + .unwrap_or_else(|| panic!("index out of bounds: {:?}", index)) } } @@ -198,6 +423,31 @@ mod tests { use super::*; use std::sync::atomic::{AtomicUsize, Ordering}; + #[test] + fn test_state_index_resolve() { + let index: StateIndex = StateIndex::new(vec![1000, 500, 1000], 1000); + + assert_eq!(index.num_chunks(), 3); + assert_eq!(index.total_len(), 2500); + assert_eq!(index.max_page_len(), 1000); + + // Test chunk 0 + assert_eq!(index.resolve(0), Some(0)); + assert_eq!(index.resolve(999), Some(999)); + + // 
Test chunk 1 + assert_eq!(index.resolve(1000), Some(1000)); + assert_eq!(index.resolve(1499), Some(1499)); + + // Test chunk 2 + assert_eq!(index.resolve(2000), Some(1500)); + assert_eq!(index.resolve(2999), Some(2499)); + + // Test out of bounds + assert_eq!(index.resolve(3000), None); + assert_eq!(index.resolve(1500), None); // In chunk 1 but beyond its actual size + } + #[test] fn test_basic_get() { let state: State = State::new(vec![1000, 500, 1000], 1000); @@ -236,6 +486,9 @@ mod tests { // Out of bounds chunk assert!(state.get(200).is_none()); assert!(state.get(1000).is_none()); + + // In bounds chunk but beyond chunk's actual size + assert!(state.get(150).is_none()); } #[test] @@ -385,7 +638,148 @@ mod tests { assert_eq!(state.num_chunks(), 3); assert_eq!(state.len(), 2500); - // Verify the final offset equals total length - assert_eq!(state.offsets[state.num_chunks()], state.len()); + // Verify via StateIndex API + assert_eq!(state.index().total_len(), state.len()); + } + + #[test] + fn test_state_index_can_be_used_independently() { + // StateIndex can be used independently of State + let index: StateIndex = StateIndex::new(vec![1000, 500, 1000], 1000); + + // Create your own array + let mut data = vec![0usize; index.total_len()]; + + // Use the index to access elements + if let Some(flat_idx) = index.resolve(1200) { + data[flat_idx] = 42; + } + + if let Some(flat_idx) = index.resolve(1200) { + assert_eq!(data[flat_idx], 42); + } + } + + #[test] + fn test_state_index_iter() { + let index: StateIndex = StateIndex::new(vec![10, 1, 5], 10); + + let global_indices: Vec = index.iter().collect(); + + // Chunk 0: global indices 0-9 (10 items) + // Chunk 1: global index 10 (1 item) + // Chunk 2: global indices 20-24 (5 items) + let expected = vec![ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, // Chunk 0 + 10, // Chunk 1 + 20, 21, 22, 23, 24, // Chunk 2 + ]; + + assert_eq!(global_indices, expected); + assert_eq!(index.iter().len(), 16); + } + + #[test] + fn 
test_state_index_par_iter() { + let index: StateIndex = StateIndex::new(vec![10, 1, 5], 10); + + let mut results: Vec<(usize, usize)> = index.par_iter().collect(); + results.sort_by_key(|(flat_idx, _)| *flat_idx); // Sort by flat index + + // Expected: (flat_idx, global_idx) tuples + // Chunk 0: flat indices 0-9, global indices 0-9 + // Chunk 1: flat index 10, global index 10 + // Chunk 2: flat indices 11-15, global indices 20-24 + let expected = vec![ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), // Chunk 0 + (10, 10), // Chunk 1 + (11, 20), + (12, 21), + (13, 22), + (14, 23), + (15, 24), // Chunk 2 + ]; + + assert_eq!(results, expected); + + // Verify count matches + assert_eq!(index.par_iter().count(), 16); + + // Verify flat indices are sequential + let flat_indices: Vec = results.iter().map(|(flat_idx, _)| *flat_idx).collect(); + assert_eq!(flat_indices, (0..16).collect::>()); + } + + #[test] + fn test_state_iter() { + let mut state: State = State::new(vec![10, 1, 5], 10); + + // Collect global indices first to avoid borrow checker issues + let global_indices: Vec = state.index().iter().collect(); + + // Initialize state with global indices + for global_idx in global_indices { + state[global_idx] = global_idx * 10; + } + + // Collect values via iter + let values: Vec = state.iter().copied().collect(); + + let expected = vec![ + 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, // Chunk 0 + 100, // Chunk 1 + 200, 210, 220, 230, 240, // Chunk 2 + ]; + + assert_eq!(values, expected); + assert_eq!(state.iter().len(), 16); + } + + #[test] + fn test_state_iter_mut() { + let mut state: State = State::new(vec![5, 2], 10); + + // Modify all elements via iter_mut + for (i, value) in state.iter_mut().enumerate() { + *value = i * 2; + } + + // Verify via iter + let values: Vec = state.iter().copied().collect(); + assert_eq!(values, vec![0, 2, 4, 6, 8, 10, 12]); + } + + #[test] + fn test_state_iter_with_atomics() { + let state: 
State = State::new(vec![10, 5], 10); + + // Collect global indices first to avoid borrow checker issues + let global_indices: Vec = state.index().iter().collect(); + + // Set values via global indices + for global_idx in global_indices { + state + .get_unchecked(global_idx) + .store(global_idx, Ordering::Relaxed); + } + + // Read via iterator + let values: Vec = state.iter().map(|a| a.load(Ordering::Relaxed)).collect(); + + let expected = vec![ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, // Chunk 0 + 10, 11, 12, 13, 14, // Chunk 1 + ]; + + assert_eq!(values, expected); } } diff --git a/raphtory-storage/src/graph/graph.rs b/raphtory-storage/src/graph/graph.rs index baa40b5a2b..e047b0f0ce 100644 --- a/raphtory-storage/src/graph/graph.rs +++ b/raphtory-storage/src/graph/graph.rs @@ -14,7 +14,7 @@ use db4_graph::TemporalGraph; use raphtory_api::core::entities::{properties::meta::Meta, LayerIds, LayerVariants, EID, VID}; use raphtory_core::entities::{nodes::node_ref::NodeRef, properties::graph_meta::GraphMeta}; use std::{fmt::Debug, iter, sync::Arc}; -use storage::{pages::SegmentCounts, Extension, GraphPropEntry}; +use storage::{pages::SegmentCounts, state::StateIndex, Extension, GraphPropEntry}; use thiserror::Error; #[derive(Clone, Debug)] @@ -267,6 +267,10 @@ impl GraphStorage { } } + pub fn node_state_index(&self) -> StateIndex { + self.node_segment_counts().into() + } + pub fn edge_segment_counts(&self) -> SegmentCounts { match self { GraphStorage::Mem(storage) => storage.graph.storage().edge_segment_counts(), diff --git a/raphtory/src/algorithms/components/in_components.rs b/raphtory/src/algorithms/components/in_components.rs index d2e6925575..4856d51da0 100644 --- a/raphtory/src/algorithms/components/in_components.rs +++ b/raphtory/src/algorithms/components/in_components.rs @@ -15,6 +15,7 @@ use crate::{ }, prelude::GraphViewOps, }; +use either::Either; use indexmap::IndexSet; use itertools::Itertools; use std::collections::{hash_map::Entry, HashMap, HashSet, VecDeque}; @@ 
-76,7 +77,7 @@ where Nodes::new_filtered( g.clone(), g.clone(), - Some(Index::from_iter(v.in_components)), + Either::Right(Index::from_iter(v.in_components)), None, ) }) @@ -127,6 +128,6 @@ pub fn in_component<'graph, G: GraphViewOps<'graph>, GH: GraphViewOps<'graph>>( node.base_graph.clone(), node.base_graph.clone(), distances.into(), - Some(Index::new(nodes)), + Either::Right(Index::new(nodes)), ) } diff --git a/raphtory/src/algorithms/components/out_components.rs b/raphtory/src/algorithms/components/out_components.rs index 3709764c7a..d5a4b633ef 100644 --- a/raphtory/src/algorithms/components/out_components.rs +++ b/raphtory/src/algorithms/components/out_components.rs @@ -15,6 +15,7 @@ use crate::{ }, prelude::GraphViewOps, }; +use either::Either; use indexmap::IndexSet; use itertools::Itertools; use std::collections::{hash_map::Entry, HashMap, HashSet, VecDeque}; @@ -76,7 +77,7 @@ where Nodes::new_filtered( g.clone(), g.clone(), - Some(Index::from_iter(v.out_components)), + Either::Right(Index::from_iter(v.out_components)), None, ) }) @@ -127,6 +128,6 @@ pub fn out_component<'graph, G: GraphViewOps<'graph>, GH: GraphViewOps<'graph>>( node.base_graph.clone(), node.base_graph.clone(), distances.into(), - Some(Index::new(nodes)), + Either::Right(Index::new(nodes)), ) } diff --git a/raphtory/src/algorithms/dynamics/temporal/epidemics.rs b/raphtory/src/algorithms/dynamics/temporal/epidemics.rs index e3d6cd3c50..39154bcf69 100644 --- a/raphtory/src/algorithms/dynamics/temporal/epidemics.rs +++ b/raphtory/src/algorithms/dynamics/temporal/epidemics.rs @@ -9,6 +9,7 @@ use crate::{ }, prelude::*, }; +use either::Either; use indexmap::IndexSet; use rand::{ distr::{Bernoulli, Distribution}, @@ -252,7 +253,7 @@ where g.clone(), g.clone(), values.into(), - Some(Index::new(index)), + Either::Right(Index::new(index)), )) } diff --git a/raphtory/src/algorithms/metrics/clustering_coefficient/local_clustering_coefficient_batch.rs 
b/raphtory/src/algorithms/metrics/clustering_coefficient/local_clustering_coefficient_batch.rs index 8c26674d67..0cc37f34cb 100644 --- a/raphtory/src/algorithms/metrics/clustering_coefficient/local_clustering_coefficient_batch.rs +++ b/raphtory/src/algorithms/metrics/clustering_coefficient/local_clustering_coefficient_batch.rs @@ -5,6 +5,7 @@ use crate::{ view::*, }, }; +use either::Either; use indexmap::IndexSet; use itertools::Itertools; use rayon::prelude::*; @@ -47,6 +48,10 @@ pub fn local_clustering_coefficient_batch( )) }) .unzip(); - let result: Option<_> = Some(Index::new(index)); - NodeState::new(graph.clone(), graph.clone(), values.into(), result) + NodeState::new( + graph.clone(), + graph.clone(), + values.into(), + Either::Right(Index::new(index)), + ) } diff --git a/raphtory/src/algorithms/pathing/dijkstra.rs b/raphtory/src/algorithms/pathing/dijkstra.rs index 27e93a13c5..0315d344d4 100644 --- a/raphtory/src/algorithms/pathing/dijkstra.rs +++ b/raphtory/src/algorithms/pathing/dijkstra.rs @@ -9,6 +9,7 @@ use crate::{ errors::GraphError, prelude::*, }; +use either::Either; use indexmap::IndexSet; use raphtory_api::core::{ entities::{ @@ -189,7 +190,8 @@ pub fn dijkstra_single_source_shortest_paths, Vec<_>) = paths .into_iter() .map(|(id, (cost, path))| { - let nodes = Nodes::new_filtered(g.clone(), g.clone(), Some(Index::new(path)), None); + let nodes = + Nodes::new_filtered(g.clone(), g.clone(), Either::Right(Index::new(path)), None); (id, (cost, nodes)) }) .unzip(); @@ -197,6 +199,6 @@ pub fn dijkstra_single_source_shortest_paths, T: AsNodeRef } } NodeState::new_from_map(g.clone(), paths, |v| { - Nodes::new_filtered(g.clone(), g.clone(), Some(Index::from_iter(v)), None) + Nodes::new_filtered( + g.clone(), + g.clone(), + Either::Right(Index::from_iter(v)), + None, + ) }) } diff --git a/raphtory/src/db/api/state/group_by.rs b/raphtory/src/db/api/state/group_by.rs index e2b2361c4c..2ac9b10887 100644 --- a/raphtory/src/db/api/state/group_by.rs +++ 
b/raphtory/src/db/api/state/group_by.rs @@ -6,6 +6,7 @@ use crate::{ prelude::{GraphViewOps, NodeStateOps}, }; use dashmap::DashMap; +use either::Either; use indexmap::IndexSet; use raphtory_api::core::entities::VID; use rayon::prelude::*; @@ -40,7 +41,7 @@ impl<'graph, V: Hash + Eq + Send + Sync + Clone, G: GraphViewOps<'graph>> NodeGr Nodes::new_filtered( self.graph.clone(), self.graph.clone(), - Some(nodes.clone()), + Either::Right(nodes.clone()), None, ), ) @@ -86,7 +87,7 @@ impl<'graph, V: Hash + Eq + Send + Sync + Clone, G: GraphViewOps<'graph>> NodeGr Nodes::new_filtered( self.graph.clone(), self.graph.clone(), - Some(nodes.clone()), + Either::Right(nodes.clone()), None, ), ) diff --git a/raphtory/src/db/api/state/lazy_node_state.rs b/raphtory/src/db/api/state/lazy_node_state.rs index 9ab1632ec8..7a28473227 100644 --- a/raphtory/src/db/api/state/lazy_node_state.rs +++ b/raphtory/src/db/api/state/lazy_node_state.rs @@ -15,12 +15,14 @@ use crate::{ }, prelude::*, }; +use either::Either; use indexmap::IndexSet; use rayon::prelude::*; use std::{ borrow::Borrow, fmt::{Debug, Formatter}, }; +use storage::state::StateIndex; #[derive(Clone)] pub struct LazyNodeState<'graph, Op, G, GH = G> { @@ -162,7 +164,7 @@ impl<'graph, Op: NodeOp + 'graph, G: GraphViewOps<'graph>, GH: GraphViewOps<'gra self.nodes.base_graph.clone(), self.nodes.graph.clone(), values.into(), - Some(Index::new(keys)), + Either::Right(Index::new(keys)), ) } else { let values = self.collect_vec(); @@ -170,7 +172,7 @@ impl<'graph, Op: NodeOp + 'graph, G: GraphViewOps<'graph>, GH: GraphViewOps<'gra self.nodes.base_graph.clone(), self.nodes.graph.clone(), values.into(), - None, + Either::Left(self.nodes.graph.core_graph().node_state_index().into()), ) } } @@ -268,34 +270,6 @@ impl<'graph, Op: NodeOp + 'graph, G: GraphViewOps<'graph>, GH: GraphViewOps<'gra .map(move |node| (node, self.op.apply(&storage, node.node))) } - fn get_by_index( - &self, - index: usize, - ) -> Option<( - NodeView<'_, 
&Self::BaseGraph, &Self::Graph>, - Self::Value<'_>, - )> { - if self.graph().filtered() { - self.iter().nth(index) - } else { - let vid = match self.graph().node_list() { - NodeList::All { len } => { - if index < len { - VID(index) - } else { - return None; - } - } - NodeList::List { elems } => elems.key(index)?, - }; - let cg = self.graph().core_graph(); - Some(( - NodeView::new_one_hop_filtered(self.base_graph(), self.graph(), vid), - self.op.apply(cg, vid), - )) - } - } - fn get_by_node(&self, node: N) -> Option> { let node = (&self.graph()).node(node); node.map(|node| self.op.apply(self.graph().core_graph(), node.node)) diff --git a/raphtory/src/db/api/state/node_state.rs b/raphtory/src/db/api/state/node_state.rs index c399f12913..19cf75d623 100644 --- a/raphtory/src/db/api/state/node_state.rs +++ b/raphtory/src/db/api/state/node_state.rs @@ -22,6 +22,7 @@ use std::{ marker::PhantomData, sync::Arc, }; +use storage::state::StateIndex; #[derive(Debug, Default)] pub struct Index { @@ -120,7 +121,7 @@ pub struct NodeState<'graph, V, G, GH = G> { base_graph: G, graph: GH, values: Arc<[V]>, - keys: Option>, + keys: Either>, Index>, _marker: PhantomData<&'graph ()>, } @@ -220,6 +221,9 @@ impl<'graph, V, G: GraphViewOps<'graph>> NodeState<'graph, V, G> { .map(|vid| values[vid.index()].clone()) .collect(), }; + let index = index + .map(Either::Right) + .unwrap_or_else(|| Either::Left(graph.core_graph().node_state_index().into())); Self::new(graph.clone(), graph, values.into(), index) } @@ -238,18 +242,27 @@ impl<'graph, V, G: GraphViewOps<'graph>> NodeState<'graph, V, G> { .map(|vid| map(values[vid.index()].clone())) .collect(), }; + let index = index + .map(Either::Right) + .unwrap_or_else(|| Either::Left(graph.core_graph().node_state_index().into())); Self::new(graph.clone(), graph, values, index) } /// create a new empty NodeState pub fn new_empty(graph: G) -> Self { - Self::new(graph.clone(), graph, [].into(), Some(Index::default())) + let index = 
Either::Left(Arc::new(StateIndex::from( + graph.core_graph().node_segment_counts(), + ))); + Self::new(graph.clone(), graph, [].into(), index) } /// create a new NodeState from a list of values for the node (takes care of creating an index for /// node filtering when needed) pub fn new_from_values(graph: G, values: impl Into>) -> Self { let index = Index::for_graph(&graph); + let index = index + .map(Either::Right) + .unwrap_or_else(|| Either::Left(graph.core_graph().node_state_index().into())); Self::new(graph.clone(), graph, values.into(), index) } @@ -272,13 +285,23 @@ impl<'graph, V, G: GraphViewOps<'graph>> NodeState<'graph, V, G> { .iter() .flat_map(|node| Some((node.node, map(values.remove(&node.node)?)))) .unzip(); - Self::new(graph.clone(), graph, values.into(), Some(Index::new(index))) + Self::new( + graph.clone(), + graph, + values.into(), + Either::Right(Index::new(index)), + ) } } } impl<'graph, V, G: GraphViewOps<'graph>, GH: GraphViewOps<'graph>> NodeState<'graph, V, G, GH> { - pub fn new(base_graph: G, graph: GH, values: Arc<[V]>, keys: Option>) -> Self { + pub fn new( + base_graph: G, + graph: GH, + values: Arc<[V]>, + keys: Either>, Index>, + ) -> Self { Self { base_graph, graph, @@ -288,10 +311,6 @@ impl<'graph, V, G: GraphViewOps<'graph>, GH: GraphViewOps<'graph>> NodeState<'gr } } - pub fn into_inner(self) -> (Arc<[V]>, Option>) { - (self.values, self.keys) - } - pub fn values(&self) -> &Arc<[V]> { &self.values } @@ -375,27 +394,22 @@ impl< 'graph: 'a, { match &self.keys { - Some(index) => index - .iter() - .zip(self.values.iter()) - .map(|(n, v)| { + Either::Right(index) => { + Either::Right(index.iter().zip(self.values.iter()).map(|(n, v)| { ( NodeView::new_one_hop_filtered(&self.base_graph, &self.graph, n), v, ) - }) - .into_dyn_boxed(), - None => self - .values - .iter() - .enumerate() - .map(|(i, v)| { + })) + } + Either::Left(index) => { + Either::Left(index.iter().zip(self.values.iter()).map(|(n, v)| { ( - 
NodeView::new_one_hop_filtered(&self.base_graph, &self.graph, VID(i)), + NodeView::new_one_hop_filtered(&self.base_graph, &self.graph, n), v, ) - }) - .into_dyn_boxed(), + })) + } } } @@ -424,7 +438,7 @@ impl< 'graph: 'a, { match &self.keys { - Some(index) => { + Either::Right(index) => { Either::Left(index.par_iter().zip(self.values.par_iter()).map(|(n, v)| { ( NodeView::new_one_hop_filtered(&self.base_graph, &self.graph, n), @@ -432,43 +446,20 @@ impl< ) })) } - None => Either::Right(self.values.par_iter().enumerate().map(|(i, v)| { + Either::Left(index) => Either::Right(index.par_iter().map(|(i, v)| { ( - NodeView::new_one_hop_filtered(&self.base_graph, &self.graph, VID(i)), - v, + NodeView::new_one_hop_filtered(&self.base_graph, &self.graph, v), + &self.values[i], ) })), } } - fn get_by_index( - &self, - index: usize, - ) -> Option<( - NodeView<'_, &Self::BaseGraph, &Self::Graph>, - Self::Value<'_>, - )> { - match &self.keys { - Some(node_index) => node_index.key(index).map(|n| { - ( - NodeView::new_one_hop_filtered(&self.base_graph, &self.graph, n), - &self.values[index], - ) - }), - None => self.values.get(index).map(|v| { - ( - NodeView::new_one_hop_filtered(&self.base_graph, &self.graph, VID(index)), - v, - ) - }), - } - } - fn get_by_node(&self, node: N) -> Option> { let id = self.graph.internalise_node(node.as_node_ref())?; match &self.keys { - Some(index) => index.index(&id).map(|i| &self.values[i]), - None => Some(&self.values[id.0]), + Either::Right(index) => index.index(&id).map(|i| &self.values[i]), + Either::Left(index) => Some(&self.values[id.0]), } } @@ -479,6 +470,8 @@ impl< #[cfg(test)] mod test { + use raphtory_storage::core_ops::CoreGraphOps; + use crate::{ db::api::state::{node_state::NodeState, AsOrderedNodeStateOps, OrderedNodeStateOps}, prelude::*, @@ -488,21 +481,8 @@ mod test { fn float_state() { let g = Graph::new(); g.add_node(0, 0, NO_PROPS, None).unwrap(); - let float_state = NodeState { - base_graph: g.clone(), - graph: g.clone(), - 
values: [0.0f64].into(), - keys: None, - _marker: Default::default(), - }; - - let int_state = NodeState { - base_graph: g.clone(), - graph: g.clone(), - values: [1i64].into(), - keys: None, - _marker: Default::default(), - }; + let float_state = NodeState::new_from_values(g.clone(), [0.0f64]); + let int_state = NodeState::new_from_values(g.clone(), [1i64]); let min_float = float_state.min_item().unwrap().1; let min_int = int_state.min_item().unwrap().1; assert_eq!(min_float, &0.0); diff --git a/raphtory/src/db/api/state/node_state_ops.rs b/raphtory/src/db/api/state/node_state_ops.rs index efe2c7a865..8f4e9a812f 100644 --- a/raphtory/src/db/api/state/node_state_ops.rs +++ b/raphtory/src/db/api/state/node_state_ops.rs @@ -6,6 +6,7 @@ use crate::{ }, prelude::{GraphViewOps, NodeViewOps}, }; +use either::Either; use indexmap::IndexSet; use num_traits::AsPrimitive; use rayon::prelude::*; @@ -69,14 +70,6 @@ pub trait NodeStateOps<'graph>: where 'graph: 'a; - fn get_by_index( - &self, - index: usize, - ) -> Option<( - NodeView<'_, &Self::BaseGraph, &Self::Graph>, - Self::Value<'_>, - )>; - fn get_by_node(&self, node: N) -> Option>; fn len(&self) -> usize; @@ -111,7 +104,7 @@ pub trait NodeStateOps<'graph>: self.base_graph().clone(), self.graph().clone(), values.into(), - Some(Index::new(keys)), + Either::Right(Index::new(keys)), ) } @@ -171,7 +164,7 @@ pub trait NodeStateOps<'graph>: self.base_graph().clone(), self.graph().clone(), values.into(), - Some(Index::new(keys)), + Either::Right(Index::new(keys)), ) } diff --git a/raphtory/src/db/graph/nodes.rs b/raphtory/src/db/graph/nodes.rs index 9669a66f14..715fc66f6c 100644 --- a/raphtory/src/db/graph/nodes.rs +++ b/raphtory/src/db/graph/nodes.rs @@ -13,6 +13,7 @@ use crate::{ }, prelude::*, }; +use either::Either; use raphtory_storage::{ core_ops::is_view_compatible, graph::{graph::GraphStorage, nodes::node_storage_ops::NodeStorageOps}, @@ -25,12 +26,13 @@ use std::{ marker::PhantomData, sync::Arc, }; +use 
storage::state::StateIndex; #[derive(Clone)] pub struct Nodes<'graph, G, GH = G> { pub(crate) base_graph: G, pub(crate) graph: GH, - pub(crate) nodes: Option>, + pub(crate) nodes: Either>, Index>, pub(crate) node_types_filter: Option>, _marker: PhantomData<&'graph ()>, } @@ -113,10 +115,11 @@ where { pub fn new(graph: G) -> Self { let base_graph = graph.clone(); + let node_index = StateIndex::from(graph.core_graph().node_segment_counts()); Self { base_graph, graph, - nodes: None, + nodes: Either::Left(Arc::new(node_index)), node_types_filter: None, _marker: PhantomData, } @@ -148,7 +151,7 @@ where pub fn new_filtered( base_graph: G, graph: GH, - nodes: Option>, + nodes: Either>, Index>, node_types_filter: Option>, ) -> Self { Self { @@ -162,8 +165,8 @@ where pub fn node_list(&self) -> NodeList { match self.nodes.clone() { - None => self.graph.node_list(), - Some(elems) => NodeList::List { elems }, + Either::Right(elems) => NodeList::List { elems }, + _ => self.graph.node_list(), } } @@ -185,7 +188,7 @@ where Nodes::new_filtered( self.base_graph.clone(), self.graph.clone(), - Some(index), + Either::Right(index), self.node_types_filter.clone(), ) } @@ -260,14 +263,14 @@ where #[inline] pub fn len(&self) -> usize { match self.nodes.as_ref() { - None => { + Either::Left(_) => { if self.is_list_filtered() { self.par_iter_refs().count() } else { self.graph.node_list().len() } } - Some(nodes) => { + Either::Right(nodes) => { if self.is_filtered() { self.par_iter_refs().count() } else { @@ -349,6 +352,7 @@ where && self .nodes .as_ref() + .right() .map(|nodes| nodes.contains(&node.node)) .unwrap_or(true) }) diff --git a/raphtory/src/python/graph/io/pandas_loaders.rs b/raphtory/src/python/graph/io/pandas_loaders.rs index 68792567f7..14811d41e2 100644 --- a/raphtory/src/python/graph/io/pandas_loaders.rs +++ b/raphtory/src/python/graph/io/pandas_loaders.rs @@ -245,19 +245,28 @@ pub(crate) fn process_pandas_py_df<'a>( .collect(); let names_len = names.len(); - let chunks = 
rb.into_iter().map(move |rb| { - let chunk = (0..names_len) - .map(|i| { - let array = rb.call_method1("column", (i,)).map_err(GraphError::from)?; - let arr = array_to_rust(&array).map_err(GraphError::from)?; - Ok::<_, GraphError>(arr) - }) - .collect::, GraphError>>()?; - Ok(DFChunk { chunk }) - }); + // Convert all Python batches to Rust Arrow arrays while we have the GIL + // This makes the iterator Send-safe + let rust_batches: Vec> = rb + .into_iter() + .map(|rb| { + let chunk = (0..names_len) + .map(|i| { + let array = rb.call_method1("column", (i,)).map_err(GraphError::from)?; + let arr = array_to_rust(&array).map_err(GraphError::from)?; + Ok::<_, GraphError>(arr) + }) + .collect::, GraphError>>()?; + + Ok(DFChunk { chunk }) + }) + .collect(); + let num_rows: usize = dropped_df.call_method0("__len__")?.extract()?; + let chunks = rust_batches.into_iter(); + Ok(DFView { names, chunks, diff --git a/raphtory/src/python/packages/algorithms.rs b/raphtory/src/python/packages/algorithms.rs index 13bbb90019..ae715c5ef5 100644 --- a/raphtory/src/python/packages/algorithms.rs +++ b/raphtory/src/python/packages/algorithms.rs @@ -69,6 +69,7 @@ use crate::{ utils::{PyNodeRef, PyTime}, }, }; +use either::Either; use pyo3::{prelude::*, types::PyList}; use rand::{prelude::StdRng, SeedableRng}; use raphtory_api::core::Direction; @@ -772,9 +773,9 @@ pub fn k_core( ) -> Nodes<'static, DynamicGraph> { let v_set = k_core_set(&graph.graph, k, iter_count, threads); let index = if v_set.len() == graph.graph.unfiltered_num_nodes() { - None + Either::Left(graph.graph.core_graph().node_state_index().into()) } else { - Some(Index::from_iter(v_set)) + Either::Right(Index::from_iter(v_set)) }; Nodes::new_filtered(graph.graph.clone(), graph.graph.clone(), index, None) } From 49852ae586041df8c10a65663450a8d3f7b6268b Mon Sep 17 00:00:00 2001 From: Fabian Murariu Date: Thu, 4 Dec 2025 14:42:14 +0000 Subject: [PATCH 06/24] Index is now an enum --- db4-storage/src/state.rs | 71 ++++++- 
.../algorithms/components/in_components.rs | 4 +- .../algorithms/components/out_components.rs | 4 +- .../algorithms/dynamics/temporal/epidemics.rs | 2 +- .../local_clustering_coefficient_batch.rs | 2 +- raphtory/src/algorithms/pathing/dijkstra.rs | 4 +- .../pathing/single_source_shortest_path.rs | 7 +- raphtory/src/core/state/shuffle_state.rs | 12 +- raphtory/src/db/api/state/group_by.rs | 14 +- raphtory/src/db/api/state/lazy_node_state.rs | 4 +- raphtory/src/db/api/state/node_state.rs | 188 +++++++++--------- raphtory/src/db/api/state/node_state_ops.rs | 4 +- raphtory/src/db/graph/nodes.rs | 25 +-- raphtory/src/python/packages/algorithms.rs | 4 +- 14 files changed, 192 insertions(+), 153 deletions(-) diff --git a/db4-storage/src/state.rs b/db4-storage/src/state.rs index 1596b95867..b599e85949 100644 --- a/db4-storage/src/state.rs +++ b/db4-storage/src/state.rs @@ -1,5 +1,8 @@ use rayon::prelude::*; -use std::ops::{Index, IndexMut}; +use std::{ + ops::{Index, IndexMut}, + sync::Arc, +}; use crate::pages::SegmentCounts; @@ -121,10 +124,16 @@ impl + Into> StateIndex { /// Get the total number of items across all chunks #[inline] - pub fn total_len(&self) -> usize { + pub fn len(&self) -> usize { self.offsets[self.num_chunks()] } + /// Check if there are no items + #[inline] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + /// Get the maximum page length #[inline] pub fn max_page_len(&self) -> u32 { @@ -173,6 +182,52 @@ impl + Into> StateIndex { }) }) } + + pub fn arc_into_iter(self: Arc) -> impl Iterator { + let max_page_len = self.max_page_len as usize; + let num_chunks = self.num_chunks(); + (0..num_chunks).flat_map(move |chunk_idx| { + let chunk_start = self.offsets[chunk_idx]; + let chunk_end = self.offsets[chunk_idx + 1]; + let chunk_size = chunk_end - chunk_start; + let global_base = chunk_idx * max_page_len; + (0..chunk_size).map(move |local_offset| { + let flat_idx = chunk_start + local_offset; + let global_idx = I::from(global_base + local_offset); 
+ (flat_idx, global_idx) + }) + }) + } +} + +impl + Into> StateIndex { + /// Create a parallel iterator over all valid global indices with their flat indices + /// + /// This iterates through all chunks in parallel and yields tuples of (flat_index, global_index). + /// The flat_index starts at 0 and increments for each item in iteration order. + /// + /// For example, with chunk_sizes [10, 1, 5] and max_page_len 10: + /// - Chunk 0: yields (0, 0)..(9, 9) + /// - Chunk 1: yields (10, 10) + /// - Chunk 2: yields (11, 20)..(15, 24) + pub fn into_par_iter(self: Arc) -> impl ParallelIterator + where + I: Send + Sync, + { + let max_page_len = self.max_page_len as usize; + let num_chunks = self.num_chunks(); + (0..num_chunks).into_par_iter().flat_map(move |chunk_idx| { + let chunk_start = self.offsets[chunk_idx]; + let chunk_end = self.offsets[chunk_idx + 1]; + let chunk_size = chunk_end - chunk_start; + let global_base = chunk_idx * max_page_len; + (0..chunk_size).into_par_iter().map(move |local_offset| { + let flat_idx = chunk_start + local_offset; + let global_idx = I::from(global_base + local_offset); + (flat_idx, global_idx) + }) + }) + } } /// Iterator over global indices in a StateIndex @@ -210,7 +265,7 @@ impl<'a, I: From + Into> Iterator for StateIndexIter<'a, I> { } fn size_hint(&self) -> (usize, Option) { - let total = self.index.total_len(); + let total = self.index.len(); let consumed = if self.current_chunk < self.index.num_chunks() { self.index.offsets[self.current_chunk] + self.current_local } else { @@ -223,7 +278,7 @@ impl<'a, I: From + Into> Iterator for StateIndexIter<'a, I> { impl<'a, I: From + Into> ExactSizeIterator for StateIndexIter<'a, I> { fn len(&self) -> usize { - let total = self.index.total_len(); + let total = self.index.len(); let consumed = if self.current_chunk < self.index.num_chunks() { self.index.offsets[self.current_chunk] + self.current_local } else { @@ -262,7 +317,7 @@ impl + Into> State { /// ``` pub fn new(chunk_sizes: Vec, 
max_page_len: u32) -> Self { let index = StateIndex::::new(chunk_sizes, max_page_len); - let total_size = index.total_len(); + let total_size = index.len(); // Initialize state array with default values let state: Box<[A]> = (0..total_size) @@ -428,7 +483,7 @@ mod tests { let index: StateIndex = StateIndex::new(vec![1000, 500, 1000], 1000); assert_eq!(index.num_chunks(), 3); - assert_eq!(index.total_len(), 2500); + assert_eq!(index.len(), 2500); assert_eq!(index.max_page_len(), 1000); // Test chunk 0 @@ -639,7 +694,7 @@ mod tests { assert_eq!(state.len(), 2500); // Verify via StateIndex API - assert_eq!(state.index().total_len(), state.len()); + assert_eq!(state.index().len(), state.len()); } #[test] @@ -648,7 +703,7 @@ mod tests { let index: StateIndex = StateIndex::new(vec![1000, 500, 1000], 1000); // Create your own array - let mut data = vec![0usize; index.total_len()]; + let mut data = vec![0usize; index.len()]; // Use the index to access elements if let Some(flat_idx) = index.resolve(1200) { diff --git a/raphtory/src/algorithms/components/in_components.rs b/raphtory/src/algorithms/components/in_components.rs index 4856d51da0..2a2e3ceddd 100644 --- a/raphtory/src/algorithms/components/in_components.rs +++ b/raphtory/src/algorithms/components/in_components.rs @@ -77,7 +77,7 @@ where Nodes::new_filtered( g.clone(), g.clone(), - Either::Right(Index::from_iter(v.in_components)), + Index::from_iter(v.in_components), None, ) }) @@ -128,6 +128,6 @@ pub fn in_component<'graph, G: GraphViewOps<'graph>, GH: GraphViewOps<'graph>>( node.base_graph.clone(), node.base_graph.clone(), distances.into(), - Either::Right(Index::new(nodes)), + Index::Partial(nodes.into()), ) } diff --git a/raphtory/src/algorithms/components/out_components.rs b/raphtory/src/algorithms/components/out_components.rs index d5a4b633ef..b3deb06988 100644 --- a/raphtory/src/algorithms/components/out_components.rs +++ b/raphtory/src/algorithms/components/out_components.rs @@ -77,7 +77,7 @@ where 
Nodes::new_filtered( g.clone(), g.clone(), - Either::Right(Index::from_iter(v.out_components)), + Index::from_iter(v.out_components), None, ) }) @@ -128,6 +128,6 @@ pub fn out_component<'graph, G: GraphViewOps<'graph>, GH: GraphViewOps<'graph>>( node.base_graph.clone(), node.base_graph.clone(), distances.into(), - Either::Right(Index::new(nodes)), + Index::Partial(nodes.into()), ) } diff --git a/raphtory/src/algorithms/dynamics/temporal/epidemics.rs b/raphtory/src/algorithms/dynamics/temporal/epidemics.rs index 39154bcf69..46e8bb729e 100644 --- a/raphtory/src/algorithms/dynamics/temporal/epidemics.rs +++ b/raphtory/src/algorithms/dynamics/temporal/epidemics.rs @@ -253,7 +253,7 @@ where g.clone(), g.clone(), values.into(), - Either::Right(Index::new(index)), + Index::Partial(index.into()), )) } diff --git a/raphtory/src/algorithms/metrics/clustering_coefficient/local_clustering_coefficient_batch.rs b/raphtory/src/algorithms/metrics/clustering_coefficient/local_clustering_coefficient_batch.rs index 0cc37f34cb..fa8f66a7e1 100644 --- a/raphtory/src/algorithms/metrics/clustering_coefficient/local_clustering_coefficient_batch.rs +++ b/raphtory/src/algorithms/metrics/clustering_coefficient/local_clustering_coefficient_batch.rs @@ -52,6 +52,6 @@ pub fn local_clustering_coefficient_batch( graph.clone(), graph.clone(), values.into(), - Either::Right(Index::new(index)), + Index::Partial(index.into()), ) } diff --git a/raphtory/src/algorithms/pathing/dijkstra.rs b/raphtory/src/algorithms/pathing/dijkstra.rs index 0315d344d4..0f6b88d7ff 100644 --- a/raphtory/src/algorithms/pathing/dijkstra.rs +++ b/raphtory/src/algorithms/pathing/dijkstra.rs @@ -191,7 +191,7 @@ pub fn dijkstra_single_source_shortest_paths, T: AsNodeRef } } NodeState::new_from_map(g.clone(), paths, |v| { - Nodes::new_filtered( - g.clone(), - g.clone(), - Either::Right(Index::from_iter(v)), - None, - ) + Nodes::new_filtered(g.clone(), g.clone(), Index::from_iter(v), None) }) } diff --git 
a/raphtory/src/core/state/shuffle_state.rs b/raphtory/src/core/state/shuffle_state.rs index 715001395a..98e99700d4 100644 --- a/raphtory/src/core/state/shuffle_state.rs +++ b/raphtory/src/core/state/shuffle_state.rs @@ -1,3 +1,6 @@ +use either::Either; +use storage::state::StateIndex; + use super::{ accumulator_id::AccId, compute_state::ComputeState, @@ -7,7 +10,7 @@ use super::{ use crate::{ core::state::agg::Accumulator, db::{ - api::view::StaticGraphViewOps, + api::{state::Index, view::StaticGraphViewOps}, task::task_state::{Global, Shard}, }, }; @@ -94,7 +97,12 @@ impl ShuffleComputeState { self.global.reset_states(ss, states); } - pub fn new(total_len: usize, n_parts: usize, morcel_size: usize) -> Self { + pub fn new( + total_len: usize, + n_parts: usize, + morcel_size: usize, + // keys: Either>, Index>, + ) -> Self { let last_one_size = if morcel_size == 0 { 1 } else { diff --git a/raphtory/src/db/api/state/group_by.rs b/raphtory/src/db/api/state/group_by.rs index 2ac9b10887..d4b7cb53d6 100644 --- a/raphtory/src/db/api/state/group_by.rs +++ b/raphtory/src/db/api/state/group_by.rs @@ -38,12 +38,7 @@ impl<'graph, V: Hash + Eq + Send + Sync + Clone, G: GraphViewOps<'graph>> NodeGr self.groups.iter().map(|(v, nodes)| { ( v, - Nodes::new_filtered( - self.graph.clone(), - self.graph.clone(), - Either::Right(nodes.clone()), - None, - ), + Nodes::new_filtered(self.graph.clone(), self.graph.clone(), nodes.clone(), None), ) }) } @@ -84,12 +79,7 @@ impl<'graph, V: Hash + Eq + Send + Sync + Clone, G: GraphViewOps<'graph>> NodeGr self.groups.get(index).map(|(v, nodes)| { ( v, - Nodes::new_filtered( - self.graph.clone(), - self.graph.clone(), - Either::Right(nodes.clone()), - None, - ), + Nodes::new_filtered(self.graph.clone(), self.graph.clone(), nodes.clone(), None), ) }) } diff --git a/raphtory/src/db/api/state/lazy_node_state.rs b/raphtory/src/db/api/state/lazy_node_state.rs index 7a28473227..bdf061b75a 100644 --- a/raphtory/src/db/api/state/lazy_node_state.rs +++ 
b/raphtory/src/db/api/state/lazy_node_state.rs @@ -164,7 +164,7 @@ impl<'graph, Op: NodeOp + 'graph, G: GraphViewOps<'graph>, GH: GraphViewOps<'gra self.nodes.base_graph.clone(), self.nodes.graph.clone(), values.into(), - Either::Right(Index::new(keys)), + Index::Partial(keys.into()), ) } else { let values = self.collect_vec(); @@ -172,7 +172,7 @@ impl<'graph, Op: NodeOp + 'graph, G: GraphViewOps<'graph>, GH: GraphViewOps<'gra self.nodes.base_graph.clone(), self.nodes.graph.clone(), values.into(), - Either::Left(self.nodes.graph.core_graph().node_state_index().into()), + Index::for_graph(self.nodes.graph.clone()), ) } } diff --git a/raphtory/src/db/api/state/node_state.rs b/raphtory/src/db/api/state/node_state.rs index 19cf75d623..f448ad6ac0 100644 --- a/raphtory/src/db/api/state/node_state.rs +++ b/raphtory/src/db/api/state/node_state.rs @@ -24,95 +24,132 @@ use std::{ }; use storage::state::StateIndex; -#[derive(Debug, Default)] -pub struct Index { - index: Arc>, +#[derive(Debug)] +pub enum Index { + Full(Arc>), + Partial(Arc>), +} + +impl Default for Index { + fn default() -> Self { + Self::Partial(Arc::new(Default::default())) + } } impl Clone for Index { fn clone(&self) -> Self { - let index = self.index.clone(); - Self { index } + match self { + Index::Full(index) => Index::Full(index.clone()), + Index::Partial(index) => Index::Partial(index.clone()), + } } } impl + From + Send + Sync> FromIterator for Index { fn from_iter>(iter: T) -> Self { - Self { - index: Arc::new(IndexSet::from_iter(iter)), - } + Self::Partial(Arc::new(IndexSet::from_iter(iter))) } } impl Index { - pub fn for_graph<'graph>(graph: impl GraphViewOps<'graph>) -> Option { + pub fn for_graph<'graph>(graph: impl GraphViewOps<'graph>) -> Self { if graph.filtered() { if graph.node_list_trusted() { match graph.node_list() { - NodeList::All { .. } => None, - NodeList::List { elems } => Some(elems), + NodeList::All { .. 
} => { + Self::Full(graph.core_graph().node_state_index().into()) + } + NodeList::List { elems } => elems.into(), } } else { - Some(Self::from_iter(graph.nodes().iter().map(|node| node.node))) + Self::from_iter(graph.nodes().iter().map(|node| node.node)) } } else { - None + Self::Full(graph.core_graph().node_state_index().into()) } } } impl + From + Send + Sync> Index { pub fn new(keys: impl Into>>) -> Self { - Self { index: keys.into() } + Self::Partial(keys.into()) } #[inline] pub fn iter(&self) -> impl Iterator + '_ { - self.index.iter().copied() + match self { + Index::Full(index) => Either::Left(index.iter()), + Index::Partial(index) => Either::Right(index.iter().copied()), + } } - pub fn into_par_iter(self) -> impl IndexedParallelIterator { - (0..self.len()) - .into_par_iter() - .map(move |i| *self.index.get_index(i).unwrap()) + pub fn into_par_iter(self) -> impl ParallelIterator { + match self { + Index::Full(index) => Either::Left(index.into_par_iter().map(|(_, k)| k)), + Index::Partial(index) => Either::Right( + (0..index.len()) + .into_par_iter() + .map(move |i| *index.get_index(i).unwrap()), + ), + } } pub fn into_iter(self) -> impl Iterator { - (0..self.len()).map(move |i| *self.index.get_index(i).unwrap()) + match self { + Index::Full(index) => Either::Left(index.arc_into_iter().map(|(_, k)| k)), + Index::Partial(index) => { + Either::Right((0..index.len()).map(move |i| *index.get_index(i).unwrap())) + } + } } #[inline] pub fn index(&self, key: &K) -> Option { - self.index.get_index_of(key) - } - - #[inline] - pub fn key(&self, index: usize) -> Option { - self.index.get_index(index).copied() + // self.index.get_index_of(key) + match self { + Index::Full(index) => index.resolve(*key), + Index::Partial(index) => index.get_index_of(key), + } } #[inline] pub fn len(&self) -> usize { - self.index.len() + match self { + Index::Full(index) => index.len(), + Index::Partial(index) => index.len(), + } } pub fn is_empty(&self) -> bool { - self.index.is_empty() + 
self.len() == 0 } #[inline] pub fn contains(&self, key: &K) -> bool { - self.index.contains(key) + match self { + Index::Full(index) => index.resolve(*key).is_some(), + Index::Partial(index) => index.contains(key), + } } - pub fn par_iter(&self) -> impl IndexedParallelIterator + '_ { - (0..self.len()) - .into_par_iter() - .map(move |i| *self.index.get_index(i).unwrap()) + pub fn par_iter(&self) -> impl ParallelIterator + '_ { + match self { + Index::Full(index) => Either::Left(index.par_iter()), + Index::Partial(index) => Either::Right( + (0..index.len()) + .into_par_iter() + .map(move |i| (i, *index.get_index(i).unwrap())), + ), + } } pub fn intersection(&self, other: &Self) -> Self { - self.index.intersection(&other.index).copied().collect() + match (self, other) { + (Self::Full(_), Self::Partial(a)) => Self::Partial(a.clone()), + (Self::Partial(a), Self::Full(_)) => Self::Partial(a.clone()), + (Self::Partial(a), Self::Partial(b)) => a.intersection(b).copied().collect(), + _ => self.clone(), + } } } @@ -121,7 +158,7 @@ pub struct NodeState<'graph, V, G, GH = G> { base_graph: G, graph: GH, values: Arc<[V]>, - keys: Either>, Index>, + keys: Index, _marker: PhantomData<&'graph ()>, } @@ -215,15 +252,12 @@ impl<'graph, V, G: GraphViewOps<'graph>> NodeState<'graph, V, G> { { let index = Index::for_graph(graph.clone()); let values = match &index { - None => values, - Some(index) => index + Index::Full(_) => values, + Index::Partial(index) => index .iter() .map(|vid| values[vid.index()].clone()) .collect(), }; - let index = index - .map(Either::Right) - .unwrap_or_else(|| Either::Left(graph.core_graph().node_state_index().into())); Self::new(graph.clone(), graph, values.into(), index) } @@ -236,23 +270,18 @@ impl<'graph, V, G: GraphViewOps<'graph>> NodeState<'graph, V, G> { pub fn new_from_eval_mapped(graph: G, values: Vec, map: impl Fn(R) -> V) -> Self { let index = Index::for_graph(graph.clone()); let values = match &index { - None => 
values.into_iter().map(map).collect(), - Some(index) => index + Index::Full(_) => values.into_iter().map(map).collect(), + Index::Partial(index) => index .iter() .map(|vid| map(values[vid.index()].clone())) .collect(), }; - let index = index - .map(Either::Right) - .unwrap_or_else(|| Either::Left(graph.core_graph().node_state_index().into())); Self::new(graph.clone(), graph, values, index) } /// create a new empty NodeState pub fn new_empty(graph: G) -> Self { - let index = Either::Left(Arc::new(StateIndex::from( - graph.core_graph().node_segment_counts(), - ))); + let index = Index::for_graph(&graph); Self::new(graph.clone(), graph, [].into(), index) } @@ -260,9 +289,6 @@ impl<'graph, V, G: GraphViewOps<'graph>> NodeState<'graph, V, G> { /// node filtering when needed) pub fn new_from_values(graph: G, values: impl Into>) -> Self { let index = Index::for_graph(&graph); - let index = index - .map(Either::Right) - .unwrap_or_else(|| Either::Left(graph.core_graph().node_state_index().into())); Self::new(graph.clone(), graph, values.into(), index) } @@ -289,19 +315,14 @@ impl<'graph, V, G: GraphViewOps<'graph>> NodeState<'graph, V, G> { graph.clone(), graph, values.into(), - Either::Right(Index::new(index)), + Index::Partial(index.into()), ) } } } impl<'graph, V, G: GraphViewOps<'graph>, GH: GraphViewOps<'graph>> NodeState<'graph, V, G, GH> { - pub fn new( - base_graph: G, - graph: GH, - values: Arc<[V]>, - keys: Either>, Index>, - ) -> Self { + pub fn new(base_graph: G, graph: GH, values: Arc<[V]>, keys: Index) -> Self { Self { base_graph, graph, @@ -393,24 +414,12 @@ impl< where 'graph: 'a, { - match &self.keys { - Either::Right(index) => { - Either::Right(index.iter().zip(self.values.iter()).map(|(n, v)| { - ( - NodeView::new_one_hop_filtered(&self.base_graph, &self.graph, n), - v, - ) - })) - } - Either::Left(index) => { - Either::Left(index.iter().zip(self.values.iter()).map(|(n, v)| { - ( - NodeView::new_one_hop_filtered(&self.base_graph, &self.graph, n), - v, - 
) - })) - } - } + self.keys.iter().zip(self.values.iter()).map(move |(n, v)| { + ( + NodeView::new_one_hop_filtered(&self.base_graph, &self.graph, n), + v, + ) + }) } fn nodes(&self) -> Nodes<'graph, Self::BaseGraph, Self::Graph> { @@ -437,30 +446,17 @@ impl< where 'graph: 'a, { - match &self.keys { - Either::Right(index) => { - Either::Left(index.par_iter().zip(self.values.par_iter()).map(|(n, v)| { - ( - NodeView::new_one_hop_filtered(&self.base_graph, &self.graph, n), - v, - ) - })) - } - Either::Left(index) => Either::Right(index.par_iter().map(|(i, v)| { - ( - NodeView::new_one_hop_filtered(&self.base_graph, &self.graph, v), - &self.values[i], - ) - })), - } + self.keys.par_iter().map(move |(val_id, n)| { + ( + NodeView::new_one_hop_filtered(&self.base_graph, &self.graph, n), + &self.values[val_id], + ) + }) } fn get_by_node(&self, node: N) -> Option> { let id = self.graph.internalise_node(node.as_node_ref())?; - match &self.keys { - Either::Right(index) => index.index(&id).map(|i| &self.values[i]), - Either::Left(index) => Some(&self.values[id.0]), - } + self.keys.index(&id).map(|i| &self.values[i]) } fn len(&self) -> usize { diff --git a/raphtory/src/db/api/state/node_state_ops.rs b/raphtory/src/db/api/state/node_state_ops.rs index 8f4e9a812f..0004284937 100644 --- a/raphtory/src/db/api/state/node_state_ops.rs +++ b/raphtory/src/db/api/state/node_state_ops.rs @@ -104,7 +104,7 @@ pub trait NodeStateOps<'graph>: self.base_graph().clone(), self.graph().clone(), values.into(), - Either::Right(Index::new(keys)), + Index::Partial(keys.into()), ) } @@ -164,7 +164,7 @@ pub trait NodeStateOps<'graph>: self.base_graph().clone(), self.graph().clone(), values.into(), - Either::Right(Index::new(keys)), + Index::Partial(keys.into()), ) } diff --git a/raphtory/src/db/graph/nodes.rs b/raphtory/src/db/graph/nodes.rs index 715fc66f6c..490cb81394 100644 --- a/raphtory/src/db/graph/nodes.rs +++ b/raphtory/src/db/graph/nodes.rs @@ -32,7 +32,7 @@ use storage::state::StateIndex; 
pub struct Nodes<'graph, G, GH = G> { pub(crate) base_graph: G, pub(crate) graph: GH, - pub(crate) nodes: Either>, Index>, + pub(crate) nodes: Index, pub(crate) node_types_filter: Option>, _marker: PhantomData<&'graph ()>, } @@ -115,11 +115,11 @@ where { pub fn new(graph: G) -> Self { let base_graph = graph.clone(); - let node_index = StateIndex::from(graph.core_graph().node_segment_counts()); + let node_index = Index::for_graph(base_graph.clone()); Self { base_graph, graph, - nodes: Either::Left(Arc::new(node_index)), + nodes: node_index, node_types_filter: None, _marker: PhantomData, } @@ -151,7 +151,7 @@ where pub fn new_filtered( base_graph: G, graph: GH, - nodes: Either>, Index>, + nodes: Index, node_types_filter: Option>, ) -> Self { Self { @@ -165,7 +165,7 @@ where pub fn node_list(&self) -> NodeList { match self.nodes.clone() { - Either::Right(elems) => NodeList::List { elems }, + elems @ Index::Partial(_) => NodeList::List { elems }, _ => self.graph.node_list(), } } @@ -188,7 +188,7 @@ where Nodes::new_filtered( self.base_graph.clone(), self.graph.clone(), - Either::Right(index), + index, self.node_types_filter.clone(), ) } @@ -262,15 +262,15 @@ where /// Returns the number of nodes in the graph. 
#[inline] pub fn len(&self) -> usize { - match self.nodes.as_ref() { - Either::Left(_) => { + match &self.nodes { + Index::Full(_) => { if self.is_list_filtered() { self.par_iter_refs().count() } else { self.graph.node_list().len() } } - Either::Right(nodes) => { + Index::Partial(nodes) => { if self.is_filtered() { self.par_iter_refs().count() } else { @@ -349,12 +349,7 @@ where .as_ref() .map(|filter| filter[node.node_type_id()]) .unwrap_or(true) - && self - .nodes - .as_ref() - .right() - .map(|nodes| nodes.contains(&node.node)) - .unwrap_or(true) + && self.nodes.contains(&node.node) }) .is_some() } diff --git a/raphtory/src/python/packages/algorithms.rs b/raphtory/src/python/packages/algorithms.rs index ae715c5ef5..e48b0e51fc 100644 --- a/raphtory/src/python/packages/algorithms.rs +++ b/raphtory/src/python/packages/algorithms.rs @@ -773,9 +773,9 @@ pub fn k_core( ) -> Nodes<'static, DynamicGraph> { let v_set = k_core_set(&graph.graph, k, iter_count, threads); let index = if v_set.len() == graph.graph.unfiltered_num_nodes() { - Either::Left(graph.graph.core_graph().node_state_index().into()) + Index::for_graph(graph.graph.clone()) } else { - Either::Right(Index::from_iter(v_set)) + Index::from_iter(v_set) }; Nodes::new_filtered(graph.graph.clone(), graph.graph.clone(), index, None) } From f430064472807923c289f6bec1644adf22288943 Mon Sep 17 00:00:00 2001 From: Fabian Murariu Date: Fri, 5 Dec 2025 09:21:02 +0000 Subject: [PATCH 07/24] changes to task runners to support non contiguous VIDs --- db4-storage/src/state.rs | 15 +----- raphtory/src/core/state/shuffle_state.rs | 31 +++++++------ raphtory/src/db/task/node/eval_node.rs | 5 +- raphtory/src/db/task/task_runner.rs | 58 ++++++++++++++++-------- raphtory/src/db/task/task_state.rs | 19 ++++++-- 5 files changed, 75 insertions(+), 53 deletions(-) diff --git a/db4-storage/src/state.rs b/db4-storage/src/state.rs index b599e85949..3b0563ab5e 100644 --- a/db4-storage/src/state.rs +++ b/db4-storage/src/state.rs @@ -1,3 
+1,4 @@ +use rayon::iter::plumbing::{Producer, ProducerCallback, UnindexedConsumer, bridge}; use rayon::prelude::*; use std::{ ops::{Index, IndexMut}, @@ -799,20 +800,6 @@ mod tests { assert_eq!(state.iter().len(), 16); } - #[test] - fn test_state_iter_mut() { - let mut state: State = State::new(vec![5, 2], 10); - - // Modify all elements via iter_mut - for (i, value) in state.iter_mut().enumerate() { - *value = i * 2; - } - - // Verify via iter - let values: Vec = state.iter().copied().collect(); - assert_eq!(values, vec![0, 2, 4, 6, 8, 10, 12]); - } - #[test] fn test_state_iter_with_atomics() { let state: State = State::new(vec![10, 5], 10); diff --git a/raphtory/src/core/state/shuffle_state.rs b/raphtory/src/core/state/shuffle_state.rs index 98e99700d4..8755d34b72 100644 --- a/raphtory/src/core/state/shuffle_state.rs +++ b/raphtory/src/core/state/shuffle_state.rs @@ -1,5 +1,4 @@ -use either::Either; -use storage::state::StateIndex; +use raphtory_core::entities::VID; use super::{ accumulator_id::AccId, @@ -21,6 +20,7 @@ pub struct ShuffleComputeState { morcel_size: usize, pub global: MorcelComputeState, pub parts: Vec>, + index: Index, } // every partition has a struct as such @@ -97,12 +97,7 @@ impl ShuffleComputeState { self.global.reset_states(ss, states); } - pub fn new( - total_len: usize, - n_parts: usize, - morcel_size: usize, - // keys: Either>, Index>, - ) -> Self { + pub fn new(total_len: usize, n_parts: usize, morcel_size: usize, index: Index) -> Self { let last_one_size = if morcel_size == 0 { 1 } else { @@ -122,26 +117,29 @@ impl ShuffleComputeState { morcel_size, parts, global: MorcelComputeState::new(1), + index, } } - pub fn global() -> Self { + pub fn global(index: Index) -> Self { Self { morcel_size: 1, parts: vec![], global: MorcelComputeState::new(1), + index, } } pub fn accumulate_into>( &mut self, ss: usize, - p_id: usize, + vid: VID, a: IN, agg_ref: &AccId, ) where A: StateType, { + let p_id = self.index.index(&vid).expect("VID not found in 
index"); let (morcel_id, offset) = self.resolve_pid(p_id); self.parts[morcel_id].accumulate_into(ss, offset, a, agg_ref) } @@ -149,13 +147,18 @@ impl ShuffleComputeState { pub fn read_with_pid>( &self, ss: usize, - p_id: usize, + vid: VID, agg_ref: &AccId, ) -> Option where A: StateType, OUT: std::fmt::Debug, { + dbg!(&vid, &self.index); + let p_id = self + .index + .index(&vid) + .unwrap_or_else(|| panic!("VID {:?} not found in index", vid)); let (morcel_id, offset) = self.resolve_pid(p_id); self.parts[morcel_id].read::(offset, agg_ref.id(), ss) } @@ -175,13 +178,14 @@ impl ShuffleComputeState { pub fn read>( &self, ss: usize, - p_id: usize, + vid: VID, agg_ref: &AccId, ) -> Option where A: StateType, OUT: std::fmt::Debug, { + let p_id = self.index.index(&vid).expect("VID not found in index"); let (morcel_id, offset) = self.resolve_pid(p_id); self.parts[morcel_id].read::(offset, agg_ref.id(), ss) } @@ -189,12 +193,13 @@ impl ShuffleComputeState { pub fn read_ref>( &self, ss: usize, - p_id: usize, + vid: VID, agg_ref: &AccId, ) -> Option<&A> where A: StateType, { + let p_id = self.index.index(&vid).expect("VID not found in index"); let (morcel_id, offset) = self.resolve_pid(p_id); self.parts[morcel_id].read_ref::(offset, agg_ref.id(), ss) } diff --git a/raphtory/src/db/task/node/eval_node.rs b/raphtory/src/db/task/node/eval_node.rs index f5830ab1c2..c20f6c513b 100644 --- a/raphtory/src/db/task/node/eval_node.rs +++ b/raphtory/src/db/task/node/eval_node.rs @@ -115,9 +115,8 @@ impl< } } - fn pid(&self) -> usize { - let VID(i) = self.node; - i + fn pid(&self) -> VID { + self.node } fn node_state(&self) -> Ref<'_, EVState<'a, CS>> { diff --git a/raphtory/src/db/task/task_runner.rs b/raphtory/src/db/task/task_runner.rs index 90cec1a44a..7ec01a9f00 100644 --- a/raphtory/src/db/task/task_runner.rs +++ b/raphtory/src/db/task/task_runner.rs @@ -14,7 +14,7 @@ use crate::{ }, }, db::{ - api::view::StaticGraphViewOps, + api::{state::Index, view::StaticGraphViewOps}, task::{ 
eval_graph::EvalGraph, node::{eval_node::EvalNodeView, eval_node_state::EVState}, @@ -22,6 +22,7 @@ use crate::{ }, prelude::GraphViewOps, }; +use raphtory_api::atomic_extra::atomic_vid_from_mut_slice; use raphtory_storage::graph::graph::GraphStorage; use rayon::{prelude::*, ThreadPool}; use std::{ @@ -55,6 +56,7 @@ impl TaskRunner { global_state: &Global, morcel: &mut [S], prev_local_state: &Vec, + reverse_vids: &Vec, storage: &GraphStorage, atomic_done: &AtomicBool, morcel_size: usize, @@ -72,23 +74,24 @@ impl TaskRunner { let mut v_ref = morcel_id * morcel_size; for local_state in morcel { - if g.has_node(VID(v_ref)) { - let eval_graph = EvalGraph { - ss: self.ctx.ss(), - base_graph: &g, - storage, - local_state_prev: &local, - node_state: node_state.clone(), - }; - let mut vv = EvalNodeView::new_local(v_ref.into(), eval_graph, Some(local_state)); + let node = reverse_vids[v_ref]; + // if g.has_node(VID(v_ref)) { + let eval_graph = EvalGraph { + ss: self.ctx.ss(), + base_graph: &g, + storage, + local_state_prev: &local, + node_state: node_state.clone(), + }; + let mut vv = EvalNodeView::new_local(node, eval_graph, Some(local_state)); - match task.run(&mut vv) { - Step::Continue => { - done = false; - } - Step::Done => {} + match task.run(&mut vv) { + Step::Continue => { + done = false; } + Step::Done => {} } + // } v_ref += 1; } @@ -128,6 +131,7 @@ impl TaskRunner { global_state: Global, mut local_state: Vec, prev_local_state: &Vec, + reverse_vids: &Vec, storage: &GraphStorage, ) -> (bool, Shard, Global, Vec) { pool.install(move || { @@ -149,6 +153,7 @@ impl TaskRunner { &new_global_state, morcel, prev_local_state, + reverse_vids, storage, &atomic_done, morcel_size, @@ -167,6 +172,7 @@ impl TaskRunner { &new_global_state, morcel, prev_local_state, + reverse_vids, storage, &atomic_done, morcel_size, @@ -226,8 +232,9 @@ impl TaskRunner { ) -> B { let pool = num_threads.map(custom_pool).unwrap_or_else(|| POOL.clone()); - let num_nodes = 
self.ctx.graph().unfiltered_num_nodes(); let graph = self.ctx.graph(); + let node_index = Index::for_graph(graph.clone()); + let num_nodes = node_index.len(); let storage = graph.core_graph(); let morcel_size = num_nodes.min(16_000); let num_chunks = if morcel_size == 0 { @@ -236,16 +243,27 @@ impl TaskRunner { (num_nodes + morcel_size - 1) / morcel_size }; - let mut shard_state = - shard_initial_state.unwrap_or_else(|| Shard::new(num_nodes, num_chunks, morcel_size)); + let index = Index::for_graph(graph.clone()); + + let mut shard_state = shard_initial_state + .unwrap_or_else(|| Shard::new(num_nodes, num_chunks, morcel_size, index.clone())); - let mut global_state = global_initial_state.unwrap_or_else(|| Global::new()); + let mut global_state = global_initial_state.unwrap_or_else(|| Global::new(index.clone())); let (mut cur_local_state, mut prev_local_state) = self.make_cur_and_prev_states::(init.unwrap_or_default()); let mut _done = false; + let mut reverse_vids = vec![VID(0); node_index.len()]; + { + let atom_vids = atomic_vid_from_mut_slice(&mut reverse_vids); + + node_index.par_iter().for_each(|(i, vid)| { + atom_vids[i].store(vid.0, Ordering::Relaxed); + }); + } + (_done, shard_state, global_state, cur_local_state) = self.run_task_list( &init_tasks, &pool, @@ -254,6 +272,7 @@ impl TaskRunner { global_state, cur_local_state, &prev_local_state, + &reverse_vids, storage, ); @@ -269,6 +288,7 @@ impl TaskRunner { global_state, cur_local_state, &prev_local_state, + &reverse_vids, storage, ); diff --git a/raphtory/src/db/task/task_state.rs b/raphtory/src/db/task/task_state.rs index 8c9d7654ca..1d0787503f 100644 --- a/raphtory/src/db/task/task_state.rs +++ b/raphtory/src/db/task/task_state.rs @@ -1,4 +1,9 @@ -use crate::core::state::{compute_state::ComputeState, shuffle_state::ShuffleComputeState}; +use raphtory_core::entities::VID; + +use crate::{ + core::state::{compute_state::ComputeState, shuffle_state::ShuffleComputeState}, + db::api::state::Index, +}; use 
std::{borrow::Cow, sync::Arc}; // this only contains the global state and it is synchronized after each task run @@ -23,11 +28,17 @@ impl<'a, S: 'static> PrevLocalState<'a, S> { } impl Shard { - pub(crate) fn new(total_len: usize, num_morcels: usize, morcel_size: usize) -> Self { + pub(crate) fn new( + total_len: usize, + num_morcels: usize, + morcel_size: usize, + index: Index, + ) -> Self { Self(Arc::new(ShuffleComputeState::new( total_len, num_morcels, morcel_size, + index, ))) } @@ -66,8 +77,8 @@ impl From>> for Shard { } impl Global { - pub(crate) fn new() -> Self { - Self(Arc::new(ShuffleComputeState::global())) + pub(crate) fn new(index: Index) -> Self { + Self(Arc::new(ShuffleComputeState::global(index))) } pub(crate) fn as_cow(&self) -> Cow<'_, ShuffleComputeState> { From 9b26888b2af4ac5f10194eb46381a2110bafb40a Mon Sep 17 00:00:00 2001 From: Fabian Murariu Date: Mon, 8 Dec 2025 09:56:21 +0000 Subject: [PATCH 08/24] fix test_hits --- raphtory/src/algorithms/centrality/hits.rs | 22 +++++++++++ .../algorithms/components/in_components.rs | 2 +- .../algorithms/components/out_components.rs | 2 +- raphtory/src/core/state/shuffle_state.rs | 35 ++++++----------- raphtory/src/db/api/state/node_state.rs | 26 +++++-------- raphtory/src/db/task/edge/eval_edge.rs | 14 +++++++ raphtory/src/db/task/edge/eval_edges.rs | 13 +++++++ raphtory/src/db/task/eval_graph.rs | 25 +++++++++--- raphtory/src/db/task/node/eval_node.rs | 39 ++++++++++++------- raphtory/src/db/task/task_runner.rs | 37 ++++++++++++++---- raphtory/src/db/task/task_state.rs | 12 ++---- 11 files changed, 148 insertions(+), 79 deletions(-) diff --git a/raphtory/src/algorithms/centrality/hits.rs b/raphtory/src/algorithms/centrality/hits.rs index f93c2fe0e3..207b6b20f2 100644 --- a/raphtory/src/algorithms/centrality/hits.rs +++ b/raphtory/src/algorithms/centrality/hits.rs @@ -81,6 +81,12 @@ pub fn hits( let step2 = ATask::new(move |evv: &mut EvalNodeView| { let hub_score = evv.get().hub_score; let auth_score = 
evv.get().auth_score; + if evv.graph().base_graph.unfiltered_num_nodes() <= 10 { + println!( + "DEBUG step2: node={:?}, state_pos={}, hub_score={}, auth_score={}", + evv.node, evv.state_pos, hub_score, auth_score + ); + } for t in evv.out_neighbours() { t.update(&recv_hub_score, hub_score) } @@ -108,6 +114,16 @@ pub fn hits( evv.get_mut().hub_score = recv_auth_score / evv.read_global_state(&total_auth_score).unwrap(); + if evv.graph().base_graph.unfiltered_num_nodes() <= 10 { + println!( + "DEBUG step4: node={:?}, state_pos={}, new_hub={}, new_auth={}", + evv.node, + evv.state_pos, + evv.get().hub_score, + evv.get().auth_score + ); + } + let prev_hub_score = evv.prev().hub_score; let curr_hub_score = evv.get().hub_score; @@ -142,6 +158,12 @@ pub fn hits( vec![Job::new(step2), Job::new(step3), Job::new(step4), step5], None, |_, _, _, local| { + if g.unfiltered_num_nodes() <= 10 { + println!("\nDEBUG Final local state (index -> (hub, auth)):"); + for (i, h) in local.iter().enumerate() { + println!(" local[{}] = ({}, {})", i, h.hub_score, h.auth_score); + } + } NodeState::new_from_eval_mapped(g.clone(), local, |h| (h.hub_score, h.auth_score)) }, threads, diff --git a/raphtory/src/algorithms/components/in_components.rs b/raphtory/src/algorithms/components/in_components.rs index 2a2e3ceddd..3afea2a78d 100644 --- a/raphtory/src/algorithms/components/in_components.rs +++ b/raphtory/src/algorithms/components/in_components.rs @@ -38,7 +38,7 @@ struct InState { /// pub fn in_components(g: &G, threads: Option) -> NodeState<'static, Nodes<'static, G>, G> where - G: StaticGraphViewOps, + G: StaticGraphViewOps + std::fmt::Debug, { let ctx: Context = g.into(); let step1 = ATask::new(move |vv: &mut EvalNodeView| { diff --git a/raphtory/src/algorithms/components/out_components.rs b/raphtory/src/algorithms/components/out_components.rs index b3deb06988..4b7c999d34 100644 --- a/raphtory/src/algorithms/components/out_components.rs +++ 
b/raphtory/src/algorithms/components/out_components.rs @@ -38,7 +38,7 @@ struct OutState { /// pub fn out_components(g: &G, threads: Option) -> NodeState<'static, Nodes<'static, G>, G> where - G: StaticGraphViewOps, + G: StaticGraphViewOps + std::fmt::Debug, { let ctx: Context = g.into(); let step1 = ATask::new(move |vv: &mut EvalNodeView| { diff --git a/raphtory/src/core/state/shuffle_state.rs b/raphtory/src/core/state/shuffle_state.rs index 8755d34b72..5f290d1ab3 100644 --- a/raphtory/src/core/state/shuffle_state.rs +++ b/raphtory/src/core/state/shuffle_state.rs @@ -1,5 +1,3 @@ -use raphtory_core::entities::VID; - use super::{ accumulator_id::AccId, compute_state::ComputeState, @@ -9,7 +7,7 @@ use super::{ use crate::{ core::state::agg::Accumulator, db::{ - api::{state::Index, view::StaticGraphViewOps}, + api::view::StaticGraphViewOps, task::task_state::{Global, Shard}, }, }; @@ -20,7 +18,6 @@ pub struct ShuffleComputeState { morcel_size: usize, pub global: MorcelComputeState, pub parts: Vec>, - index: Index, } // every partition has a struct as such @@ -97,7 +94,7 @@ impl ShuffleComputeState { self.global.reset_states(ss, states); } - pub fn new(total_len: usize, n_parts: usize, morcel_size: usize, index: Index) -> Self { + pub fn new(total_len: usize, n_parts: usize, morcel_size: usize) -> Self { let last_one_size = if morcel_size == 0 { 1 } else { @@ -117,49 +114,41 @@ impl ShuffleComputeState { morcel_size, parts, global: MorcelComputeState::new(1), - index, } } - pub fn global(index: Index) -> Self { + pub fn global() -> Self { Self { morcel_size: 1, parts: vec![], global: MorcelComputeState::new(1), - index, } } pub fn accumulate_into>( &mut self, ss: usize, - vid: VID, + state_pos: usize, a: IN, agg_ref: &AccId, ) where A: StateType, { - let p_id = self.index.index(&vid).expect("VID not found in index"); - let (morcel_id, offset) = self.resolve_pid(p_id); + let (morcel_id, offset) = self.resolve_pid(state_pos); self.parts[morcel_id].accumulate_into(ss, 
offset, a, agg_ref) } pub fn read_with_pid>( &self, ss: usize, - vid: VID, + state_pos: usize, agg_ref: &AccId, ) -> Option where A: StateType, OUT: std::fmt::Debug, { - dbg!(&vid, &self.index); - let p_id = self - .index - .index(&vid) - .unwrap_or_else(|| panic!("VID {:?} not found in index", vid)); - let (morcel_id, offset) = self.resolve_pid(p_id); + let (morcel_id, offset) = self.resolve_pid(state_pos); self.parts[morcel_id].read::(offset, agg_ref.id(), ss) } @@ -178,29 +167,27 @@ impl ShuffleComputeState { pub fn read>( &self, ss: usize, - vid: VID, + state_pos: usize, agg_ref: &AccId, ) -> Option where A: StateType, OUT: std::fmt::Debug, { - let p_id = self.index.index(&vid).expect("VID not found in index"); - let (morcel_id, offset) = self.resolve_pid(p_id); + let (morcel_id, offset) = self.resolve_pid(state_pos); self.parts[morcel_id].read::(offset, agg_ref.id(), ss) } pub fn read_ref>( &self, ss: usize, - vid: VID, + state_pos: usize, agg_ref: &AccId, ) -> Option<&A> where A: StateType, { - let p_id = self.index.index(&vid).expect("VID not found in index"); - let (morcel_id, offset) = self.resolve_pid(p_id); + let (morcel_id, offset) = self.resolve_pid(state_pos); self.parts[morcel_id].read_ref::(offset, agg_ref.id(), ss) } diff --git a/raphtory/src/db/api/state/node_state.rs b/raphtory/src/db/api/state/node_state.rs index f448ad6ac0..f405c329c8 100644 --- a/raphtory/src/db/api/state/node_state.rs +++ b/raphtory/src/db/api/state/node_state.rs @@ -245,19 +245,13 @@ impl<'graph, V, G: GraphViewOps<'graph>> NodeState<'graph, V, G> { /// /// # Arguments /// - `graph`: the graph view - /// - `values`: the unfiltered values (i.e., `values.len() == graph.unfiltered_num_nodes()`). This method handles the filtering. + /// - `values`: the values indexed by flat position (i.e., `values.len() == index.len()`). 
pub fn new_from_eval(graph: G, values: Vec) -> Self where V: Clone, { let index = Index::for_graph(graph.clone()); - let values = match &index { - Index::Full(_) => values, - Index::Partial(index) => index - .iter() - .map(|vid| values[vid.index()].clone()) - .collect(), - }; + // Values are already in flat index order from TaskRunner Self::new(graph.clone(), graph, values.into(), index) } @@ -265,17 +259,15 @@ impl<'graph, V, G: GraphViewOps<'graph>> NodeState<'graph, V, G> { /// /// # Arguments /// - `graph`: the graph view - /// - `values`: the unfiltered values (i.e., `values.len() == graph.unfiltered_num_nodes()`). This method handles the filtering. + /// - `values`: the values indexed by flat position (i.e., `values.len() == index.len()`). /// - `map`: Closure mapping input to output values - pub fn new_from_eval_mapped(graph: G, values: Vec, map: impl Fn(R) -> V) -> Self { + pub fn new_from_eval_mapped(graph: G, values: Vec, map: impl Fn(R) -> V) -> Self + where + V: std::fmt::Debug, + { let index = Index::for_graph(graph.clone()); - let values = match &index { - Index::Full(_) => values.into_iter().map(map).collect(), - Index::Partial(index) => index - .iter() - .map(|vid| map(values[vid.index()].clone())) - .collect(), - }; + // Values are already in flat index order from TaskRunner, just map them + let values = values.into_iter().map(map).collect(); Self::new(graph.clone(), graph, values, index) } diff --git a/raphtory/src/db/task/edge/eval_edge.rs b/raphtory/src/db/task/edge/eval_edge.rs index bf2629af81..1e64bfb09d 100644 --- a/raphtory/src/db/task/edge/eval_edge.rs +++ b/raphtory/src/db/task/edge/eval_edge.rs @@ -6,6 +6,7 @@ use crate::{ db::{ api::{ properties::Properties, + state::Index, view::{internal::OneHopFilter, *}, }, graph::edge::EdgeView, @@ -26,6 +27,7 @@ pub struct EvalEdgeView<'graph, 'a, G, GH, CS: Clone, S> { pub(crate) ss: usize, pub(crate) edge: EdgeView<&'graph G, GH>, pub(crate) storage: &'graph GraphStorage, + pub(crate) index: 
&'graph Index, pub(crate) node_state: Rc>>, pub(crate) local_state_prev: &'graph PrevLocalState<'a, S>, } @@ -43,6 +45,7 @@ impl< ss: usize, edge: EdgeView<&'graph G, GH>, storage: &'graph GraphStorage, + index: &'graph Index, node_state: Rc>>, local_state_prev: &'graph PrevLocalState<'a, S>, ) -> Self { @@ -50,6 +53,7 @@ impl< ss, edge, storage, + index, node_state, local_state_prev, } @@ -117,9 +121,15 @@ impl< storage, local_state_prev, node_state, + index: self.index, }; + let state_pos = self + .index + .index(&node.node) + .unwrap_or_else(|| panic!("Internal Error, node {:?} needs to be in index", node.node)); EvalNodeView { node: node.node, + state_pos, graph: node.base_graph, eval_graph, local_state: None, @@ -138,10 +148,12 @@ impl< let node_state = self.node_state.clone(); let local_state_prev = self.local_state_prev; let storage = self.storage; + let index = self.index; EvalEdges { ss, edges, storage, + index, node_state, local_state_prev, } @@ -162,6 +174,7 @@ impl< ss: self.ss, edge: self.edge.clone(), storage: self.storage, + index: self.index, node_state: self.node_state.clone(), local_state_prev: self.local_state_prev, } @@ -198,6 +211,7 @@ impl< self.ss, edge, self.storage, + self.index, self.node_state.clone(), self.local_state_prev, ) diff --git a/raphtory/src/db/task/edge/eval_edges.rs b/raphtory/src/db/task/edge/eval_edges.rs index 1addb8798a..5f22847b1f 100644 --- a/raphtory/src/db/task/edge/eval_edges.rs +++ b/raphtory/src/db/task/edge/eval_edges.rs @@ -6,6 +6,7 @@ use crate::{ db::{ api::{ properties::{Metadata, Properties}, + state::Index, view::{internal::OneHopFilter, BaseEdgeViewOps, BoxedLIter}, }, graph::edges::Edges, @@ -25,6 +26,7 @@ pub struct EvalEdges<'graph, 'a, G, GH, CS: Clone, S> { pub(crate) ss: usize, pub(crate) edges: Edges<'graph, &'graph G, GH>, pub(crate) storage: &'graph GraphStorage, + pub(crate) index: &'graph Index, pub(crate) node_state: Rc>>, pub(crate) local_state_prev: &'graph PrevLocalState<'a, S>, } @@ -37,6 
+39,7 @@ impl<'graph, 'a: 'graph, G: GraphViewOps<'graph>, GH: GraphViewOps<'graph>, CS: ss: self.ss, edges: self.edges.clone(), storage: self.storage, + index: self.index, node_state: self.node_state.clone(), local_state_prev: self.local_state_prev, } @@ -67,10 +70,12 @@ impl<'graph, 'a: 'graph, G: GraphViewOps<'graph>, GH: GraphViewOps<'graph>, CS: let node_state = self.node_state.clone(); let local_state_prev = self.local_state_prev; let storage = self.storage; + let index = self.index; EvalEdges { ss, edges, storage, + index, node_state, local_state_prev, } @@ -91,6 +96,7 @@ impl< let ss = self.ss; let local_state_prev = self.local_state_prev; let storage = self.storage; + let index = self.index; self.edges .clone() .into_iter() @@ -98,6 +104,7 @@ impl< ss, edge, storage, + index, node_state: node_state.clone(), local_state_prev, }) @@ -121,10 +128,12 @@ impl< let ss = self.ss; let local_state_prev = self.local_state_prev; let storage = self.storage; + let index = self.index; Box::new(self.edges.into_iter().map(move |edge| EvalEdgeView { ss, edge, storage, + index, node_state: node_state.clone(), local_state_prev, })) @@ -186,10 +195,12 @@ impl< let path = self.edges.map_nodes(op); let base_graph = self.edges.base_graph; let storage = self.storage; + let index = self.index; let eval_graph = EvalGraph { ss, base_graph, storage, + index, local_state_prev, node_state, }; @@ -212,9 +223,11 @@ impl< let local_state_prev = self.local_state_prev; let edges = self.edges.map_exploded(op); let storage = self.storage; + let index = self.index; Self { ss, storage, + index, node_state, local_state_prev, edges, diff --git a/raphtory/src/db/task/eval_graph.rs b/raphtory/src/db/task/eval_graph.rs index fa4742910d..8b6d0ac071 100644 --- a/raphtory/src/db/task/eval_graph.rs +++ b/raphtory/src/db/task/eval_graph.rs @@ -3,13 +3,17 @@ use crate::{ entities::nodes::node_ref::AsNodeRef, state::compute_state::{ComputeState, ComputeStateVec}, }, - db::task::{ - 
edge::eval_edge::EvalEdgeView, - node::{eval_node::EvalNodeView, eval_node_state::EVState}, - task_state::PrevLocalState, + db::{ + api::state::Index, + task::{ + edge::eval_edge::EvalEdgeView, + node::{eval_node::EvalNodeView, eval_node_state::EVState}, + task_state::PrevLocalState, + }, }, prelude::GraphViewOps, }; +use raphtory_core::entities::VID; use raphtory_storage::graph::graph::GraphStorage; use std::{cell::RefCell, rc::Rc}; @@ -20,6 +24,7 @@ pub struct EvalGraph<'graph, 'a, G, S, CS: Clone = ComputeStateVec> { pub(crate) storage: &'graph GraphStorage, pub(crate) local_state_prev: &'graph PrevLocalState<'a, S>, pub(crate) node_state: Rc>>, + pub(crate) index: &'graph Index, } impl<'graph, 'a, G, S, CS: Clone> Clone for EvalGraph<'graph, 'a, G, S, CS> { @@ -30,6 +35,7 @@ impl<'graph, 'a, G, S, CS: Clone> Clone for EvalGraph<'graph, 'a, G, S, CS> { storage: self.storage, local_state_prev: self.local_state_prev, node_state: self.node_state.clone(), + index: self.index, } } } @@ -39,7 +45,15 @@ impl<'graph, 'a: 'graph, G: GraphViewOps<'graph>, S: 'static, CS: ComputeState + { pub fn node(&self, n: impl AsNodeRef) -> Option> { let node = (&self.base_graph).node(n)?; - Some(EvalNodeView::new_local(node.node, self.clone(), None)) + let state_pos = self.index.index(&node.node).unwrap_or_else(|| { + panic!("Internal Error, node {:?} needs to be in index", node.node); + }); + Some(EvalNodeView::new_local( + node.node, + state_pos, + self.clone(), + None, + )) } pub fn edge( @@ -52,6 +66,7 @@ impl<'graph, 'a: 'graph, G: GraphViewOps<'graph>, S: 'static, CS: ComputeState + self.ss, edge, self.storage, + self.index, self.node_state.clone(), self.local_state_prev, )) diff --git a/raphtory/src/db/task/node/eval_node.rs b/raphtory/src/db/task/node/eval_node.rs index c20f6c513b..d3ce4a16d7 100644 --- a/raphtory/src/db/task/node/eval_node.rs +++ b/raphtory/src/db/task/node/eval_node.rs @@ -28,6 +28,7 @@ use std::{ pub struct EvalNodeView<'graph, 'a: 'graph, G, S, GH = 
&'graph G, CS: Clone = ComputeStateVec> { pub node: VID, + pub(crate) state_pos: usize, pub(crate) eval_graph: EvalGraph<'graph, 'a, G, S, CS>, pub(crate) graph: GH, pub(crate) local_state: Option<&'graph mut S>, @@ -38,12 +39,14 @@ impl<'graph, 'a: 'graph, G: GraphViewOps<'graph>, CS: ComputeState + 'a, S> { pub(crate) fn new_local( node: VID, + state_pos: usize, eval_graph: EvalGraph<'graph, 'a, G, S, CS>, local_state: Option<&'graph mut S>, ) -> Self { let graph = eval_graph.base_graph; Self { node, + state_pos, eval_graph, graph, local_state, @@ -63,6 +66,7 @@ impl< fn clone(&self) -> Self { Self { node: self.node, + state_pos: self.state_pos, eval_graph: self.eval_graph.clone(), graph: self.graph.clone(), local_state: None, @@ -83,8 +87,7 @@ impl< self.eval_graph.clone() } pub fn prev(&self) -> &S { - let VID(i) = self.node; - &self.eval_graph.local_state_prev.state[i] + &self.eval_graph.local_state_prev.state[self.state_pos] } pub fn get_mut(&mut self) -> &mut S { @@ -103,22 +106,20 @@ impl< pub(crate) fn new_filtered( node: VID, + state_pos: usize, eval_graph: EvalGraph<'graph, 'a, G, S, CS>, graph: GH, local_state: Option<&'graph mut S>, ) -> Self { Self { node, + state_pos, eval_graph, graph, local_state, } } - fn pid(&self) -> VID { - self.node - } - fn node_state(&self) -> Ref<'_, EVState<'a, CS>> { RefCell::borrow(&self.eval_graph.node_state) } @@ -132,9 +133,12 @@ impl< id: &AccId, a: IN, ) { - self.node_state_mut() - .shard_mut() - .accumulate_into(self.eval_graph.ss, self.pid(), a, id); + self.node_state_mut().shard_mut().accumulate_into( + self.eval_graph.ss, + self.state_pos, + a, + id, + ); } pub fn global_update>( @@ -189,7 +193,7 @@ impl< { self.node_state() .shard() - .read_with_pid(self.eval_graph.ss, self.pid(), agg_r) + .read_with_pid(self.eval_graph.ss, self.state_pos, agg_r) .unwrap_or(ACC::finish(&ACC::zero())) } @@ -218,7 +222,7 @@ impl< { self.node_state() .shard() - .read_with_pid(self.eval_graph.ss + 1, self.pid(), agg_r) + 
.read_with_pid(self.eval_graph.ss + 1, self.state_pos, agg_r) .unwrap_or(ACC::finish(&ACC::zero())) } @@ -266,8 +270,11 @@ impl< pub fn iter(&self) -> impl Iterator> + 'graph { let base_graph = self.base_graph.clone(); let graph = self.graph.clone(); - self.iter_refs() - .map(move |v| EvalNodeView::new_filtered(v, base_graph.clone(), graph.clone(), None)) + let index = self.base_graph.index; + self.iter_refs().map(move |v| { + let state_pos = index.index(&v).expect("VID not found in index"); + EvalNodeView::new_filtered(v, state_pos, base_graph.clone(), graph.clone(), None) + }) } pub fn type_filter, V: AsRef>(&self, node_types: I) -> Self { @@ -373,6 +380,7 @@ impl< self.graph.clone(), self.op.clone(), ); + let index = self.base_graph.index; let edges = path.map_edges(op); EvalEdges { ss, @@ -380,6 +388,7 @@ impl< node_state, local_state_prev, storage, + index, } } @@ -469,7 +478,7 @@ impl< filtered_graph: GHH, ) -> Self::Filtered { let eval_graph = self.eval_graph.clone(); - EvalNodeView::new_filtered(self.node, eval_graph, filtered_graph, None) + EvalNodeView::new_filtered(self.node, self.state_pos, eval_graph, filtered_graph, None) } } @@ -522,12 +531,14 @@ impl< graph: self.graph.clone(), edges, }; + let index = self.eval_graph.index; EvalEdges { ss, edges, node_state, local_state_prev, storage, + index, } } diff --git a/raphtory/src/db/task/task_runner.rs b/raphtory/src/db/task/task_runner.rs index 7ec01a9f00..df75ca7296 100644 --- a/raphtory/src/db/task/task_runner.rs +++ b/raphtory/src/db/task/task_runner.rs @@ -58,6 +58,7 @@ impl TaskRunner { prev_local_state: &Vec, reverse_vids: &Vec, storage: &GraphStorage, + index: &Index, atomic_done: &AtomicBool, morcel_size: usize, morcel_id: usize, @@ -80,10 +81,11 @@ impl TaskRunner { ss: self.ctx.ss(), base_graph: &g, storage, + index, local_state_prev: &local, node_state: node_state.clone(), }; - let mut vv = EvalNodeView::new_local(node, eval_graph, Some(local_state)); + let mut vv = 
EvalNodeView::new_local(node, v_ref, eval_graph, Some(local_state)); match task.run(&mut vv) { Step::Continue => { @@ -133,6 +135,7 @@ impl TaskRunner { prev_local_state: &Vec, reverse_vids: &Vec, storage: &GraphStorage, + index: &Index, ) -> (bool, Shard, Global, Vec) { pool.install(move || { let mut new_shard_state = shard_state; @@ -155,6 +158,7 @@ impl TaskRunner { prev_local_state, reverse_vids, storage, + index, &atomic_done, morcel_size, morcel_id, @@ -174,6 +178,7 @@ impl TaskRunner { prev_local_state, reverse_vids, storage, + index, &atomic_done, morcel_size, morcel_id, @@ -208,9 +213,8 @@ impl TaskRunner { }) } - fn make_cur_and_prev_states(&self, mut init: Vec) -> (Vec, Vec) { - let g = self.ctx.graph(); - init.resize(g.unfiltered_num_nodes(), S::default()); + fn make_cur_and_prev_states(&self, mut init: Vec, num_nodes: usize) -> (Vec, Vec) { + init.resize(num_nodes, S::default()); (init.clone(), init) } @@ -245,13 +249,23 @@ impl TaskRunner { let index = Index::for_graph(graph.clone()); - let mut shard_state = shard_initial_state - .unwrap_or_else(|| Shard::new(num_nodes, num_chunks, morcel_size, index.clone())); + println!("DEBUG TaskRunner::run:"); + println!(" graph.unfiltered_num_nodes() = {}", graph.unfiltered_num_nodes()); + println!(" node_index.len() = {}", num_nodes); + println!(" morcel_size = {}", morcel_size); + println!(" num_chunks = {}", num_chunks); + println!(" index variant = {:?}", match &index { + Index::Full(_) => "Full", + Index::Partial(_) => "Partial", + }); + + let mut shard_state = + shard_initial_state.unwrap_or_else(|| Shard::new(num_nodes, num_chunks, morcel_size)); - let mut global_state = global_initial_state.unwrap_or_else(|| Global::new(index.clone())); + let mut global_state = global_initial_state.unwrap_or_else(|| Global::new()); let (mut cur_local_state, mut prev_local_state) = - self.make_cur_and_prev_states::(init.unwrap_or_default()); + self.make_cur_and_prev_states::(init.unwrap_or_default(), num_nodes); let mut 
_done = false; @@ -264,6 +278,11 @@ impl TaskRunner { }); } + println!(" reverse_vids mapping (flat_idx -> VID):"); + for (flat_idx, vid) in reverse_vids.iter().enumerate() { + println!(" {} -> {}", flat_idx, vid.0); + } + (_done, shard_state, global_state, cur_local_state) = self.run_task_list( &init_tasks, &pool, @@ -274,6 +293,7 @@ impl TaskRunner { &prev_local_state, &reverse_vids, storage, + &index, ); // To allow the init step to cache stuff we will copy everything from cur_local_state to prev_local_state @@ -290,6 +310,7 @@ impl TaskRunner { &prev_local_state, &reverse_vids, storage, + &index, ); // copy and reset the state from the step that just ended diff --git a/raphtory/src/db/task/task_state.rs b/raphtory/src/db/task/task_state.rs index 1d0787503f..c62051d2f8 100644 --- a/raphtory/src/db/task/task_state.rs +++ b/raphtory/src/db/task/task_state.rs @@ -28,17 +28,11 @@ impl<'a, S: 'static> PrevLocalState<'a, S> { } impl Shard { - pub(crate) fn new( - total_len: usize, - num_morcels: usize, - morcel_size: usize, - index: Index, - ) -> Self { + pub(crate) fn new(total_len: usize, num_morcels: usize, morcel_size: usize) -> Self { Self(Arc::new(ShuffleComputeState::new( total_len, num_morcels, morcel_size, - index, ))) } @@ -77,8 +71,8 @@ impl From>> for Shard { } impl Global { - pub(crate) fn new(index: Index) -> Self { - Self(Arc::new(ShuffleComputeState::global(index))) + pub(crate) fn new() -> Self { + Self(Arc::new(ShuffleComputeState::global())) } pub(crate) fn as_cow(&self) -> Cow<'_, ShuffleComputeState> { From fab89ed8564b3d06d8c772981554c500845f0081 Mon Sep 17 00:00:00 2001 From: Fabian Murariu Date: Mon, 8 Dec 2025 14:02:57 +0000 Subject: [PATCH 09/24] fixes to algorithms and materialize to integrate Index --- db4-storage/src/pages/edge_store.rs | 5 ++ db4-storage/src/pages/node_store.rs | 5 ++ .../src/algorithms/centrality/betweenness.rs | 56 ++++++++++--------- .../algorithms/components/in_components.rs | 1 - 
raphtory/src/algorithms/cores/k_core.rs | 8 ++- raphtory/src/db/api/state/node_state.rs | 8 ++- raphtory/src/db/api/view/graph.rs | 39 +++++++++---- raphtory/src/db/graph/nodes.rs | 5 +- 8 files changed, 84 insertions(+), 43 deletions(-) diff --git a/db4-storage/src/pages/edge_store.rs b/db4-storage/src/pages/edge_store.rs index 65fa64ce86..e321a0d068 100644 --- a/db4-storage/src/pages/edge_store.rs +++ b/db4-storage/src/pages/edge_store.rs @@ -485,6 +485,11 @@ impl, EXT: Config> EdgeStorageInner } } + pub fn reserve_new_eid(&self, row: usize) -> EID { + let (segment_id, local_pos) = self.reserve_free_pos(row); + local_pos.as_eid(segment_id, self.max_page_len()) + } + pub fn reserve_free_pos(&self, row: usize) -> (usize, LocalPOS) { let slot_idx = row % N; let maybe_free_page = { diff --git a/db4-storage/src/pages/node_store.rs b/db4-storage/src/pages/node_store.rs index 401b71991a..b22a313e38 100644 --- a/db4-storage/src/pages/node_store.rs +++ b/db4-storage/src/pages/node_store.rs @@ -197,6 +197,11 @@ impl, EXT: Config> NodeStorageInner ) } + pub fn reserve_vid(&self, row: usize) -> VID { + let (seg, pos) = self.reserve_free_pos(row); + pos.as_vid(seg, self.max_segment_len()) + } + pub fn reserve_free_pos(&self, row: usize) -> (usize, LocalPOS) { let slot_idx = row % N; let maybe_free_page = { diff --git a/raphtory/src/algorithms/centrality/betweenness.rs b/raphtory/src/algorithms/centrality/betweenness.rs index 4e8865fe3a..fbf79dccd9 100644 --- a/raphtory/src/algorithms/centrality/betweenness.rs +++ b/raphtory/src/algorithms/centrality/betweenness.rs @@ -1,6 +1,9 @@ use crate::{ core::entities::VID, - db::{api::state::NodeState, graph::node::NodeView}, + db::{ + api::state::{Index, NodeState}, + graph::node::NodeView, + }, prelude::{GraphViewOps, NodeViewOps}, }; use std::collections::{HashMap, VecDeque}; @@ -21,8 +24,9 @@ pub fn betweenness_centrality<'graph, G: GraphViewOps<'graph>>( k: Option, normalized: bool, ) -> NodeState<'graph, f64, G> { + let index = 
Index::for_graph(g); // Initialize a hashmap to store betweenness centrality values. - let mut betweenness: Vec = vec![0.0; g.unfiltered_num_nodes()]; + let mut betweenness: Vec = vec![0.0; g.count_nodes()]; // Get the nodes and the total number of nodes in the graph. let nodes = g.nodes(); @@ -31,49 +35,47 @@ pub fn betweenness_centrality<'graph, G: GraphViewOps<'graph>>( // Main loop over each node to compute betweenness centrality. for node in nodes.iter().take(k_sample) { - let mut stack = Vec::new(); - let mut predecessors: HashMap> = HashMap::new(); - let mut sigma: HashMap = HashMap::new(); - let mut dist: HashMap = HashMap::new(); + let mut stack: Vec = Vec::new(); + let mut predecessors: HashMap> = HashMap::new(); + let mut sigma: HashMap = HashMap::new(); + let mut dist: HashMap = HashMap::new(); let mut queue = VecDeque::new(); // Initialize distance and sigma values for each node. for node in nodes.iter() { - dist.insert(node.node.0, -1); - sigma.insert(node.node.0, 0.0); + dist.insert(node.node, -1); + sigma.insert(node.node, 0.0); } - dist.insert(node.node.0, 0); - sigma.insert(node.node.0, 1.0); - queue.push_back(node.node.0); + dist.insert(node.node, 0); + sigma.insert(node.node, 1.0); + queue.push_back(node.node); // BFS loop to find shortest paths. 
while let Some(current_node_id) = queue.pop_front() { stack.push(current_node_id); - for neighbor in - NodeView::new_internal(g.clone(), VID::from(current_node_id)).out_neighbours() - { + for neighbor in NodeView::new_internal(g.clone(), current_node_id).out_neighbours() { // Path discovery - if dist[&neighbor.node.0] < 0 { - queue.push_back(neighbor.node.0); - dist.insert(neighbor.node.0, dist[&current_node_id] + 1); + if dist[&neighbor.node] < 0 { + queue.push_back(neighbor.node); + dist.insert(neighbor.node, dist[&current_node_id] + 1); } // Path counting - if dist[&neighbor.node.0] == dist[&current_node_id] + 1 { + if dist[&neighbor.node] == dist[&current_node_id] + 1 { sigma.insert( - neighbor.node.0, - sigma[&neighbor.node.0] + sigma[&current_node_id], + neighbor.node, + sigma[&neighbor.node] + sigma[&current_node_id], ); predecessors - .entry(neighbor.node.0) + .entry(neighbor.node) .or_default() .push(current_node_id); } } } - let mut delta: HashMap = HashMap::new(); + let mut delta: HashMap = HashMap::new(); for node in nodes.iter() { - delta.insert(node.node.0, 0.0); + delta.insert(node.node, 0.0); } // Accumulation @@ -83,8 +85,9 @@ pub fn betweenness_centrality<'graph, G: GraphViewOps<'graph>>( let new_delta_v = delta[v] + coeff; delta.insert(*v, new_delta_v); } - if w != node.node.0 { - betweenness[w] += delta[&w]; + if w != node.node { + let pos = index.index(&w).unwrap(); + betweenness[pos] += delta[&w]; } } } @@ -93,7 +96,8 @@ pub fn betweenness_centrality<'graph, G: GraphViewOps<'graph>>( if normalized { let factor = 1.0 / ((n as f64 - 1.0) * (n as f64 - 2.0)); for node in nodes.iter() { - betweenness[node.node.index()] *= factor; + let pos = index.index(&node.node).unwrap(); + betweenness[pos] *= factor; } } diff --git a/raphtory/src/algorithms/components/in_components.rs b/raphtory/src/algorithms/components/in_components.rs index 3afea2a78d..a5afa40629 100644 --- a/raphtory/src/algorithms/components/in_components.rs +++ b/raphtory/src/algorithms/components/in_components.rs @@ -15,7
+15,6 @@ use crate::{ }, prelude::GraphViewOps, }; -use either::Either; use indexmap::IndexSet; use itertools::Itertools; use std::collections::{hash_map::Entry, HashMap, HashSet, VecDeque}; diff --git a/raphtory/src/algorithms/cores/k_core.rs b/raphtory/src/algorithms/cores/k_core.rs index 4640dbc0e1..84947f2ddb 100644 --- a/raphtory/src/algorithms/cores/k_core.rs +++ b/raphtory/src/algorithms/cores/k_core.rs @@ -1,7 +1,10 @@ use crate::{ core::{entities::VID, state::compute_state::ComputeStateVec}, db::{ - api::view::{NodeViewOps, StaticGraphViewOps}, + api::{ + state::Index, + view::{NodeViewOps, StaticGraphViewOps}, + }, graph::views::node_subgraph::NodeSubgraph, task::{ context::Context, @@ -73,6 +76,7 @@ where }); let mut runner: TaskRunner = TaskRunner::new(ctx); + let index = Index::for_graph(g); runner.run( vec![Job::new(step1)], @@ -81,7 +85,7 @@ where |_, _, _, local| { g.nodes() .iter() - .filter(|node| local[node.node.0].alive) + .filter(|node| local[index.index(&node.node).unwrap()].alive) .map(|node| node.node) .collect() }, diff --git a/raphtory/src/db/api/state/node_state.rs b/raphtory/src/db/api/state/node_state.rs index f405c329c8..3fc6e17409 100644 --- a/raphtory/src/db/api/state/node_state.rs +++ b/raphtory/src/db/api/state/node_state.rs @@ -30,6 +30,12 @@ pub enum Index { Partial(Arc>), } +impl From> for Index { + fn from(index: StateIndex) -> Self { + Self::Full(index.into()) + } +} + impl Default for Index { fn default() -> Self { Self::Partial(Arc::new(Default::default())) @@ -59,7 +65,7 @@ impl Index { NodeList::All { .. 
} => { Self::Full(graph.core_graph().node_state_index().into()) } - NodeList::List { elems } => elems.into(), + NodeList::List { elems } => elems, } } else { Self::from_iter(graph.nodes().iter().map(|node| node.node)) diff --git a/raphtory/src/db/api/view/graph.rs b/raphtory/src/db/api/view/graph.rs index f52adefbdc..06f5c51988 100644 --- a/raphtory/src/db/api/view/graph.rs +++ b/raphtory/src/db/api/view/graph.rs @@ -8,6 +8,7 @@ use crate::{ db::{ api::{ properties::{internal::InternalMetadataOps, Metadata, Properties}, + state::Index, view::{internal::*, *}, }, graph::{ @@ -339,11 +340,27 @@ impl<'graph, G: GraphView + 'graph> GraphViewOps<'graph> for G { let node_map_shared = atomic_usize_from_mut_slice(bytemuck::cast_slice_mut(&mut node_map)); + // reverse index pos -> new_vid + let index = Index::for_graph(self); + self.nodes().par_iter().for_each(|node| { + let vid = node.node; + if let Some(pos) = index.index(&vid) { + let new_vid = new_storage.graph().storage().nodes().reserve_vid(pos); + node_map_shared[pos].store(new_vid.index(), Ordering::Relaxed); + } + }); + + let get_new_vid = |old_vid: VID, index: &Index, node_map: &[VID]| -> VID { + let pos = index + .index(&old_vid) + .expect("old_vid should exist in index"); + node_map[pos] + }; + new_storage.nodes.par_iter_mut().try_for_each(|shard| { - for (index, node) in self.nodes().iter().enumerate() { - let new_id = VID(index); + for node in self.nodes().iter() { + let new_id = get_new_vid(node.node, &index, &node_map); let gid = node.id(); - node_map_shared[node.node.index()].store(new_id.index(), Ordering::Relaxed); if let Some(node_pos) = shard.resolve_pos(new_id) { let mut writer = shard.writer(); if let Some(node_type) = node.node_type() { @@ -388,11 +405,13 @@ impl<'graph, G: GraphView + 'graph> GraphViewOps<'graph> for G { new_storage.edges.ensure_layer(*layer_id); } + let edge_storage = new_storage.graph().storage().edges().clone(); + new_storage.edges.par_iter_mut().try_for_each(|shard| { - for 
(eid, edge) in self.edges().iter().enumerate() { - let src = node_map[edge.edge.src().index()]; - let dst = node_map[edge.edge.dst().index()]; - let eid = EID(eid); + for (row, edge) in self.edges().iter().enumerate() { + let src = get_new_vid(edge.edge.src(), &index, &node_map); + let dst = get_new_vid(edge.edge.dst(), &index, &node_map); + let eid = edge_storage.reserve_new_eid(row); if let Some(edge_pos) = shard.resolve_pos(eid) { let mut writer = shard.writer(); // make the edge for the first time @@ -453,8 +472,8 @@ impl<'graph, G: GraphView + 'graph> GraphViewOps<'graph> for G { new_storage.nodes.par_iter_mut().try_for_each(|shard| { for (eid, edge) in self.edges().iter().enumerate() { let eid = EID(eid); - let src_id = node_map[edge.edge.src().index()]; - let dst_id = node_map[edge.edge.dst().index()]; + let src_id = get_new_vid(edge.edge.src(), &index, &node_map); + let dst_id = get_new_vid(edge.edge.dst(), &index, &node_map); let maybe_src_pos = shard.resolve_pos(src_id); let maybe_dst_pos = shard.resolve_pos(dst_id); @@ -614,7 +633,7 @@ impl<'graph, G: GraphView + 'graph> GraphViewOps<'graph> for G { self.get_layer_names_from_ids(self.layer_ids()) } - #[inline] + // #[inline] fn earliest_time(&self) -> Option { match self.filter_state() { FilterState::Neither => self.earliest_time_global(), diff --git a/raphtory/src/db/graph/nodes.rs b/raphtory/src/db/graph/nodes.rs index 490cb81394..4585821e53 100644 --- a/raphtory/src/db/graph/nodes.rs +++ b/raphtory/src/db/graph/nodes.rs @@ -26,7 +26,6 @@ use std::{ marker::PhantomData, sync::Arc, }; -use storage::state::StateIndex; #[derive(Clone)] pub struct Nodes<'graph, G, GH = G> { @@ -115,11 +114,11 @@ where { pub fn new(graph: G) -> Self { let base_graph = graph.clone(); - let node_index = Index::for_graph(base_graph.clone()); + let node_index = base_graph.core_graph().node_state_index(); Self { base_graph, graph, - nodes: node_index, + nodes: node_index.into(), node_types_filter: None, _marker: PhantomData, } 
From abb695a701cbc1462b74d2ab829277585c82e956 Mon Sep 17 00:00:00 2001 From: Fabian Murariu Date: Mon, 8 Dec 2025 14:26:40 +0000 Subject: [PATCH 10/24] fixes for motifs --- .../local_temporal_three_node_motifs.rs | 31 ++++++++----------- raphtory/src/db/task/node/eval_node.rs | 15 ++++++--- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/raphtory/src/algorithms/motifs/local_temporal_three_node_motifs.rs b/raphtory/src/algorithms/motifs/local_temporal_three_node_motifs.rs index b91f1d9c75..a06d28690d 100644 --- a/raphtory/src/algorithms/motifs/local_temporal_three_node_motifs.rs +++ b/raphtory/src/algorithms/motifs/local_temporal_three_node_motifs.rs @@ -4,7 +4,7 @@ use crate::{ core::state::{accumulator_id::accumulators, compute_state::ComputeStateVec}, db::{ api::{ - state::NodeState, + state::{Index, NodeState}, view::{NodeViewOps, *}, }, graph::views::node_subgraph::NodeSubgraph, @@ -18,6 +18,7 @@ use crate::{ }; use itertools::Itertools; use num_traits::Zero; +use rand::seq::index; use raphtory_api::core::entities::VID; use rayon::prelude::*; use rustc_hash::FxHashSet; @@ -212,20 +213,12 @@ where for v in u.neighbours() { // Find triangles on the UV edge let intersection_nbs = { - match ( - u.entry(&neighbours_set) - .read_ref() - .unwrap_or(&FxHashSet::default()), - v.entry(&neighbours_set) - .read_ref() - .unwrap_or(&FxHashSet::default()), - ) { - (u_set, v_set) => { - let intersection = - u_set.intersection(v_set).cloned().collect::>(); - intersection - } - } + let default = FxHashSet::default(); + let u_entry = u.entry(&neighbours_set); + let u_set = u_entry.read_ref().unwrap_or(&default); + let v_entry = v.entry(&neighbours_set); + let v_set = v_entry.read_ref().unwrap_or(&default); + u_set.intersection(v_set).cloned().collect::>() }; if intersection_nbs.is_empty() { @@ -298,6 +291,7 @@ where }); let mut runner: TaskRunner, _> = TaskRunner::new(ctx_subgraph); + let index = Index::for_graph(&kcore_subgraph); runner.run( 
vec![Job::new(neighbourhood_update_step)], @@ -305,9 +299,9 @@ where None, |_, _, _els, mut local| { let mut tri_motifs = HashMap::new(); - for node in graph.nodes() { + for node in kcore_subgraph.nodes() { let v_gid = node.name(); - let triangle = mem::take(&mut local[node.node.0].triangle); + let triangle = mem::take(&mut local[index.index(&node.node).unwrap()].triangle); if triangle.is_empty() { tri_motifs.insert(v_gid.clone(), vec![[0; 8]; delta_len]); } else { @@ -360,6 +354,7 @@ where }); let mut runner: TaskRunner = TaskRunner::new(ctx); + let index = Index::for_graph(g); runner.run( vec![Job::new(star_motif_step)], @@ -370,7 +365,7 @@ where .nodes() .par_iter() .map(|n| { - let mc = &local[n.node.index()]; + let mc = &local[index.index(&n.node).unwrap()]; let v_gid = n.name(); let triangles = triadic_motifs .get(&v_gid) diff --git a/raphtory/src/db/task/node/eval_node.rs b/raphtory/src/db/task/node/eval_node.rs index d3ce4a16d7..e516cdc597 100644 --- a/raphtory/src/db/task/node/eval_node.rs +++ b/raphtory/src/db/task/node/eval_node.rs @@ -207,7 +207,12 @@ impl< A: StateType, OUT: std::fmt::Debug, { - Entry::new(self.node_state(), *agg_r, &self.node, self.eval_graph.ss) + Entry::new( + self.node_state(), + *agg_r, + self.state_pos, + self.eval_graph.ss, + ) } /// Read the prev value of the node state using the given accumulator. 
@@ -570,7 +575,7 @@ impl< pub struct Entry<'a, 'b, A: StateType, IN, OUT, ACC: Accumulator, CS: ComputeState> { state: Ref<'a, EVState<'b, CS>>, acc_id: AccId, - v_ref: &'a VID, + state_pos: usize, ss: usize, } @@ -589,13 +594,13 @@ impl<'a, 'b, A: StateType, IN, OUT, ACC: Accumulator, CS: ComputeSta pub(crate) fn new( state: Ref<'a, EVState<'b, CS>>, acc_id: AccId, - v_ref: &'a VID, + state_pos: usize, ss: usize, ) -> Entry<'a, 'b, A, IN, OUT, ACC, CS> { Entry { state, acc_id, - v_ref, + state_pos, ss, } } @@ -604,6 +609,6 @@ impl<'a, 'b, A: StateType, IN, OUT, ACC: Accumulator, CS: ComputeSta pub fn read_ref(&self) -> Option<&A> { self.state .shard() - .read_ref(self.ss, (*self.v_ref).into(), &self.acc_id) + .read_ref(self.ss, self.state_pos, &self.acc_id) } } From e142476f61112609c80f0f814658611a3317887b Mon Sep 17 00:00:00 2001 From: Fabian Murariu Date: Mon, 8 Dec 2025 14:45:43 +0000 Subject: [PATCH 11/24] fixes for dijkstra --- raphtory/src/algorithms/pathing/dijkstra.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/raphtory/src/algorithms/pathing/dijkstra.rs b/raphtory/src/algorithms/pathing/dijkstra.rs index 0f6b88d7ff..92efacbba4 100644 --- a/raphtory/src/algorithms/pathing/dijkstra.rs +++ b/raphtory/src/algorithms/pathing/dijkstra.rs @@ -66,6 +66,7 @@ pub fn dijkstra_single_source_shortest_paths, direction: Direction, ) -> Result), G>, GraphError> { + let index = Index::for_graph(g); let source_ref = source.as_node_ref(); let source_node = match g.node(source_ref) { Some(src) => src, @@ -86,10 +87,11 @@ pub fn dijkstra_single_source_shortest_paths Date: Mon, 8 Dec 2025 17:06:18 +0000 Subject: [PATCH 12/24] fixes for algo tests, CCs still remaining --- raphtory-core/src/storage/lazy_vec.rs | 4 +- raphtory/src/algorithms/centrality/hits.rs | 6 +- .../src/algorithms/centrality/pagerank.rs | 4 +- .../algorithms/components/in_components.rs | 4 +- .../algorithms/components/out_components.rs | 5 +- 
raphtory/src/algorithms/components/scc.rs | 13 +++-- raphtory/src/algorithms/cores/k_core.rs | 3 +- raphtory/src/algorithms/embeddings/fast_rp.rs | 6 +- .../global_temporal_three_node_motifs.rs | 4 +- .../local_temporal_three_node_motifs.rs | 6 +- .../src/algorithms/motifs/triangle_count.rs | 2 +- .../src/algorithms/motifs/triplet_count.rs | 2 +- .../pathing/single_source_shortest_path.rs | 1 - .../pathing/temporal_reachability.rs | 12 ++-- raphtory/src/core/state/compute_state.rs | 12 +++- raphtory/src/core/state/mod.rs | 18 ++++-- raphtory/src/core/state/morcel_state.rs | 2 +- raphtory/src/core/state/shuffle_state.rs | 58 ++++++++++++++++++- raphtory/src/db/api/state/node_state.rs | 34 +++++++++++ raphtory/src/db/task/mod.rs | 2 +- raphtory/src/db/task/task_runner.rs | 31 +++++++--- 21 files changed, 176 insertions(+), 53 deletions(-) diff --git a/raphtory-core/src/storage/lazy_vec.rs b/raphtory-core/src/storage/lazy_vec.rs index d8da2041e4..0c75965c63 100644 --- a/raphtory-core/src/storage/lazy_vec.rs +++ b/raphtory-core/src/storage/lazy_vec.rs @@ -277,7 +277,7 @@ where #[cfg(test)] fn iter(&self) -> Box + Send + '_> { match self { - LazyVec::Empty => Box::new(iter::empty()), + LazyVec::Empty => Box::new(std::iter::empty()), LazyVec::LazyVec1(default, tuples) => { Box::new(tuples.iter().map(|value| value.unwrap_or(default))) } @@ -290,7 +290,7 @@ where #[cfg(test)] fn iter_opt(&self) -> Box> + Send + '_> { match self { - LazyVec::Empty => Box::new(iter::empty()), + LazyVec::Empty => Box::new(std::iter::empty()), LazyVec::LazyVec1(_, tuples) => Box::new(tuples.iter()), LazyVec::LazyVecN(_, vector) => Box::new(vector.iter()), } diff --git a/raphtory/src/algorithms/centrality/hits.rs b/raphtory/src/algorithms/centrality/hits.rs index 207b6b20f2..6f4d3e251f 100644 --- a/raphtory/src/algorithms/centrality/hits.rs +++ b/raphtory/src/algorithms/centrality/hits.rs @@ -157,14 +157,16 @@ pub fn hits( vec![], vec![Job::new(step2), Job::new(step3), Job::new(step4), step5], 
None, - |_, _, _, local| { + |_, _, _, local, index| { if g.unfiltered_num_nodes() <= 10 { println!("\nDEBUG Final local state (index -> (hub, auth)):"); for (i, h) in local.iter().enumerate() { println!(" local[{}] = ({}, {})", i, h.hub_score, h.auth_score); } } - NodeState::new_from_eval_mapped(g.clone(), local, |h| (h.hub_score, h.auth_score)) + NodeState::new_from_eval_mapped_with_index(g.clone(), local, index, |h| { + (h.hub_score, h.auth_score) + }) }, threads, iter_count, diff --git a/raphtory/src/algorithms/centrality/pagerank.rs b/raphtory/src/algorithms/centrality/pagerank.rs index f9d10842c6..cf762530d5 100644 --- a/raphtory/src/algorithms/centrality/pagerank.rs +++ b/raphtory/src/algorithms/centrality/pagerank.rs @@ -161,7 +161,9 @@ pub fn unweighted_page_rank( vec![Job::new(step1)], vec![Job::new(step2), Job::new(step3), Job::new(step4), step5], Some(vec![PageRankState::new(num_nodes); num_nodes]), - |_, _, _, local| NodeState::new_from_eval_mapped(g.clone(), local, |v| v.score), + |_, _, _, local, index| { + NodeState::new_from_eval_mapped_with_index(g.clone(), local, index, |v| v.score) + }, threads, iter_count, None, diff --git a/raphtory/src/algorithms/components/in_components.rs b/raphtory/src/algorithms/components/in_components.rs index a5afa40629..71b1effe8a 100644 --- a/raphtory/src/algorithms/components/in_components.rs +++ b/raphtory/src/algorithms/components/in_components.rs @@ -71,8 +71,8 @@ where vec![Job::new(step1)], vec![], None, - |_, _, _, local: Vec| { - NodeState::new_from_eval_mapped(g.clone(), local, |v| { + |_, _, _, local: Vec, index| { + NodeState::new_from_eval_mapped_with_index(g.clone(), local, index, |v| { Nodes::new_filtered( g.clone(), g.clone(), diff --git a/raphtory/src/algorithms/components/out_components.rs b/raphtory/src/algorithms/components/out_components.rs index 4b7c999d34..cb2edf0d5f 100644 --- a/raphtory/src/algorithms/components/out_components.rs +++ b/raphtory/src/algorithms/components/out_components.rs @@ 
-15,7 +15,6 @@ use crate::{ }, prelude::GraphViewOps, }; -use either::Either; use indexmap::IndexSet; use itertools::Itertools; use std::collections::{hash_map::Entry, HashMap, HashSet, VecDeque}; @@ -72,8 +71,8 @@ where vec![Job::new(step1)], vec![], None, - |_, _, _, local: Vec| { - NodeState::new_from_eval_mapped(g.clone(), local, |v| { + |_, _, _, local: Vec, index| { + NodeState::new_from_eval_mapped_with_index(g.clone(), local, index, |v| { Nodes::new_filtered( g.clone(), g.clone(), diff --git a/raphtory/src/algorithms/components/scc.rs b/raphtory/src/algorithms/components/scc.rs index 5557d3f73c..49425fec03 100644 --- a/raphtory/src/algorithms/components/scc.rs +++ b/raphtory/src/algorithms/components/scc.rs @@ -1,7 +1,10 @@ use crate::{ core::entities::VID, db::{ - api::{state::NodeState, view::StaticGraphViewOps}, + api::{ + state::{Index, NodeState}, + view::StaticGraphViewOps, + }, graph::node::NodeView, }, prelude::*, @@ -148,12 +151,14 @@ where ); */ let groups = tarjan_scc(graph); + let index = Index::for_graph(graph); - let mut values = vec![usize::MAX; graph.unfiltered_num_nodes()]; + let mut values = vec![usize::MAX; graph.count_nodes()]; for (id, group) in groups.into_iter().enumerate() { - for VID(node) in group { - values[node] = id; + for vid in &group { + let pos = index.index(vid).unwrap(); + values[pos] = id; } } diff --git a/raphtory/src/algorithms/cores/k_core.rs b/raphtory/src/algorithms/cores/k_core.rs index 84947f2ddb..a2709b6cfa 100644 --- a/raphtory/src/algorithms/cores/k_core.rs +++ b/raphtory/src/algorithms/cores/k_core.rs @@ -76,13 +76,12 @@ where }); let mut runner: TaskRunner = TaskRunner::new(ctx); - let index = Index::for_graph(g); runner.run( vec![Job::new(step1)], vec![Job::read_only(step2)], None, - |_, _, _, local| { + |_, _, _, local, index| { g.nodes() .iter() .filter(|node| local[index.index(&node.node).unwrap()].alive) diff --git a/raphtory/src/algorithms/embeddings/fast_rp.rs 
b/raphtory/src/algorithms/embeddings/fast_rp.rs index 6ff61b94f8..1608157857 100644 --- a/raphtory/src/algorithms/embeddings/fast_rp.rs +++ b/raphtory/src/algorithms/embeddings/fast_rp.rs @@ -97,8 +97,10 @@ where vec![Job::new(step1)], vec![Job::read_only(step2)], None, - |_, _, _, local: Vec| { - NodeState::new_from_eval_mapped(g.clone(), local, |v| v.embedding_state) + |_, _, _, local: Vec, index| { + NodeState::new_from_eval_mapped_with_index(g.clone(), local, index, |v| { + v.embedding_state + }) }, threads, num_iters, diff --git a/raphtory/src/algorithms/motifs/global_temporal_three_node_motifs.rs b/raphtory/src/algorithms/motifs/global_temporal_three_node_motifs.rs index 0be0541854..7e1bb351bf 100644 --- a/raphtory/src/algorithms/motifs/global_temporal_three_node_motifs.rs +++ b/raphtory/src/algorithms/motifs/global_temporal_three_node_motifs.rs @@ -242,7 +242,7 @@ where vec![Job::new(neighbourhood_update_step)], vec![Job::new(intersection_compute_step)], None, - |egs, _, _, _| { + |egs, _, _, _, _| { tri_mc.iter().map(|mc| egs.finalize::<[usize; 8], [usize;8], [usize; 8], ArrConst,8>>(mc)).collect_vec() }, threads, @@ -294,7 +294,7 @@ where vec![], vec![Job::new(star_count_step)], None, - |egs, _ , _ , _ | { + |egs, _ , _ , _ ,_| { triadic_motifs.iter().enumerate().map(|(i,tri)| { let mut tmp = egs.finalize::<[usize; 32], [usize;32], [usize; 32], ArrConst,32>>(&star_clone[i]) .iter().copied() diff --git a/raphtory/src/algorithms/motifs/local_temporal_three_node_motifs.rs b/raphtory/src/algorithms/motifs/local_temporal_three_node_motifs.rs index a06d28690d..5cbd025b4a 100644 --- a/raphtory/src/algorithms/motifs/local_temporal_three_node_motifs.rs +++ b/raphtory/src/algorithms/motifs/local_temporal_three_node_motifs.rs @@ -291,13 +291,12 @@ where }); let mut runner: TaskRunner, _> = TaskRunner::new(ctx_subgraph); - let index = Index::for_graph(&kcore_subgraph); runner.run( vec![Job::new(neighbourhood_update_step)], vec![Job::new(intersection_compute_step)], 
None, - |_, _, _els, mut local| { + |_, _, _els, mut local, index| { let mut tri_motifs = HashMap::new(); for node in kcore_subgraph.nodes() { let v_gid = node.name(); @@ -354,13 +353,12 @@ where }); let mut runner: TaskRunner = TaskRunner::new(ctx); - let index = Index::for_graph(g); runner.run( vec![Job::new(star_motif_step)], vec![], None, - |_, _, _, local| { + |_, _, _, local, index| { let values: Vec<_> = g .nodes() .par_iter() diff --git a/raphtory/src/algorithms/motifs/triangle_count.rs b/raphtory/src/algorithms/motifs/triangle_count.rs index 2e4381bbd6..de8ad3d6cd 100644 --- a/raphtory/src/algorithms/motifs/triangle_count.rs +++ b/raphtory/src/algorithms/motifs/triangle_count.rs @@ -107,7 +107,7 @@ pub fn triangle_count(graph: &G, threads: Option) init_tasks, tasks, None, - |egs, _, _, _| egs.finalize(&count), + |egs, _, _, _, _| egs.finalize(&count), threads, 1, None, diff --git a/raphtory/src/algorithms/motifs/triplet_count.rs b/raphtory/src/algorithms/motifs/triplet_count.rs index c85a2dc462..590f899224 100644 --- a/raphtory/src/algorithms/motifs/triplet_count.rs +++ b/raphtory/src/algorithms/motifs/triplet_count.rs @@ -104,7 +104,7 @@ pub fn triplet_count(g: &G, threads: Option) -> us vec![], vec![Job::new(step1)], None, - |egs, _, _, _| egs.finalize(&count), + |egs, _, _, _, _| egs.finalize(&count), threads, 1, None, diff --git a/raphtory/src/algorithms/pathing/single_source_shortest_path.rs b/raphtory/src/algorithms/pathing/single_source_shortest_path.rs index 490beae799..1b8df528fd 100644 --- a/raphtory/src/algorithms/pathing/single_source_shortest_path.rs +++ b/raphtory/src/algorithms/pathing/single_source_shortest_path.rs @@ -2,7 +2,6 @@ //! //! This module provides an implementation of the Single Source Shortest Path algorithm. //! It finds the shortest paths from a given source node to all other nodes in a graph. 
-use either::Either; use crate::{ core::entities::{nodes::node_ref::AsNodeRef, VID}, diff --git a/raphtory/src/algorithms/pathing/temporal_reachability.rs b/raphtory/src/algorithms/pathing/temporal_reachability.rs index 7368f171e2..560f3c1f99 100644 --- a/raphtory/src/algorithms/pathing/temporal_reachability.rs +++ b/raphtory/src/algorithms/pathing/temporal_reachability.rs @@ -181,25 +181,25 @@ pub fn temporally_reachable_nodes( })); let mut runner: TaskRunner = TaskRunner::new(ctx); - let result: HashMap> = runner.run( + let (index, values) = runner.run( vec![Job::new(step1)], vec![Job::new(step2), step3], None, - |_, ess, _, _| { - ess.finalize(&taint_history, |taint_history| { + |_, ess, _, _, index| { + let data = ess.finalize_vec(&taint_history, |taint_history| { let mut hist = taint_history .into_iter() .map(|tmsg| (tmsg.event_time, tmsg.src_node)) .collect_vec(); hist.sort(); hist - }) + }); + (index, data) }, threads, max_hops, None, None, ); - let result: FxHashMap<_, _> = result.into_iter().map(|(k, v)| (VID(k), v)).collect(); - NodeState::new_from_map(g.clone(), result, |v| v) + NodeState::new_from_eval_with_index(g.clone(), values, index) } diff --git a/raphtory/src/core/state/compute_state.rs b/raphtory/src/core/state/compute_state.rs index f604dd81d3..b4ad7e94d5 100644 --- a/raphtory/src/core/state/compute_state.rs +++ b/raphtory/src/core/state/compute_state.rs @@ -26,7 +26,11 @@ pub trait ComputeState: std::fmt::Debug + Clone + Send + Sync { i: usize, ) -> Option<&A>; - fn iter(&self, ss: usize, extend_to: usize) -> Box + '_>; + fn iter( + &self, + ss: usize, + extend_to: usize, + ) -> Box + Send + '_>; fn agg>(&mut self, ss: usize, a: IN, ki: usize) where @@ -118,7 +122,11 @@ impl ComputeState for ComputeStateVec { vec.current(ss).get(i) } - fn iter(&self, ss: usize, extend_to: usize) -> Box + '_> { + fn iter( + &self, + ss: usize, + extend_to: usize, + ) -> Box + Send + '_> { let vec = self .current() .as_any() diff --git 
a/raphtory/src/core/state/mod.rs b/raphtory/src/core/state/mod.rs index 834060f451..89adcc16ba 100644 --- a/raphtory/src/core/state/mod.rs +++ b/raphtory/src/core/state/mod.rs @@ -295,8 +295,9 @@ mod state_test { let mut actual = part1_state .clone() - .finalize(&sum, 0, &g, |c| c) + .finalize_vec(&sum, 0, &g, |c| c) .into_iter() + .enumerate() .collect_vec(); actual.sort(); @@ -305,8 +306,9 @@ mod state_test { let mut actual = part1_state .clone() - .finalize(&min, 0, &g, |c| c) + .finalize_vec(&min, 0, &g, |c| c) .into_iter() + .enumerate() .collect_vec(); actual.sort(); @@ -318,8 +320,9 @@ mod state_test { let mut actual = part2_state .clone() - .finalize(&sum, 0, &g, |c| c) + .finalize_vec(&sum, 0, &g, |c| c) .into_iter() + .enumerate() .collect_vec(); actual.sort(); @@ -328,8 +331,9 @@ mod state_test { let mut actual = part2_state .clone() - .finalize(&min, 0, &g, |c| c) + .finalize_vec(&min, 0, &g, |c| c) .into_iter() + .enumerate() .collect_vec(); actual.sort(); @@ -342,8 +346,9 @@ mod state_test { ShuffleComputeState::merge_mut(&mut part1_state, &part2_state, sum, 0); let mut actual = part1_state .clone() - .finalize(&sum, 0, &g, |c| c) + .finalize_vec(&sum, 0, &g, |c| c) .into_iter() + .enumerate() .collect_vec(); actual.sort(); @@ -360,8 +365,9 @@ mod state_test { ShuffleComputeState::merge_mut(&mut part1_state, &part2_state, min, 0); let mut actual = part1_state .clone() - .finalize(&min, 0, &g, |c| c) + .finalize_vec(&min, 0, &g, |c| c) .into_iter() + .enumerate() .collect_vec(); actual.sort(); diff --git a/raphtory/src/core/state/morcel_state.rs b/raphtory/src/core/state/morcel_state.rs index f3a8c59481..6a54db9eed 100644 --- a/raphtory/src/core/state/morcel_state.rs +++ b/raphtory/src/core/state/morcel_state.rs @@ -138,7 +138,7 @@ impl MorcelComputeState { &self, ss: usize, agg_ref: &AccId, - ) -> Box> + '_> + ) -> Box> + Send + '_> where A: StateType, { diff --git a/raphtory/src/core/state/shuffle_state.rs b/raphtory/src/core/state/shuffle_state.rs 
index 5f290d1ab3..cb4bed57e2 100644 --- a/raphtory/src/core/state/shuffle_state.rs +++ b/raphtory/src/core/state/shuffle_state.rs @@ -1,3 +1,7 @@ +use either::Either; +use raphtory_api::iter::IntoDynBoxed; +use raphtory_core::utils::iter::GenLockedIter; + use super::{ accumulator_id::AccId, compute_state::ComputeState, @@ -204,13 +208,22 @@ impl ShuffleComputeState { .read::(GLOBAL_STATE_KEY, agg_ref.id(), ss) } - pub fn finalize, G: StaticGraphViewOps>( + pub fn finalize< + A, + B, + F, + IN, + OUT, + ACC: Accumulator, + G: StaticGraphViewOps, + C: FromIterator<(usize, B)>, + >( &self, agg_def: &AccId, ss: usize, _g: &G, f: F, - ) -> HashMap + ) -> C where OUT: StateType, A: StateType, @@ -225,12 +238,33 @@ impl ShuffleComputeState { }) .collect() } + pub fn finalize_vec, G: StaticGraphViewOps>( + &self, + agg_def: &AccId, + ss: usize, + _g: &G, + f: F, + ) -> Vec + where + OUT: StateType, + A: StateType, + F: Fn(OUT) -> B + Copy, + { + self.iter(ss, *agg_def) + .map(|(_, a)| { + let out = a + .map(|a| ACC::finish(a)) + .unwrap_or_else(|| ACC::finish(&ACC::zero())); + f(out) + }) + .collect() + } pub fn iter<'a, A: StateType, IN: 'a, OUT: 'a, ACC: Accumulator>( &'a self, ss: usize, acc_id: AccId, - ) -> impl Iterator)> + 'a { + ) -> impl Iterator)> + Send + 'a { self.parts .iter() .flat_map(move |part| part.iter(ss, &acc_id)) @@ -312,6 +346,24 @@ impl EvalShardState { } } + pub fn finalize_vec>( + self, + agg_def: &AccId, + f: F, + ) -> Vec + where + OUT: StateType, + A: StateType, + F: Fn(OUT) -> B + Copy, + { + let inner = self.shard_states.consume(); + if let Ok(inner) = inner { + inner.finalize_vec(agg_def, self.ss, &self.g, f) + } else { + vec![] + } + } + pub fn values(&self) -> &Shard { &self.shard_states } diff --git a/raphtory/src/db/api/state/node_state.rs b/raphtory/src/db/api/state/node_state.rs index 3fc6e17409..f1c465a9be 100644 --- a/raphtory/src/db/api/state/node_state.rs +++ b/raphtory/src/db/api/state/node_state.rs @@ -261,6 +261,40 @@ 
impl<'graph, V, G: GraphViewOps<'graph>> NodeState<'graph, V, G> { Self::new(graph.clone(), graph, values.into(), index) } + /// Construct a node state from an eval result + /// + /// # Arguments + /// - `graph`: the graph view + /// - `values`: the values indexed by flat position (i.e., `values.len() == index.len()`). + /// - `index`: the index mapping VID to flat position in values + pub fn new_from_eval_with_index(graph: G, values: Vec, index: Index) -> Self + where + V: Clone, + { + // Values are already in flat index order from TaskRunner + Self::new(graph.clone(), graph, values.into(), index) + } + + /// Construct a node state from an eval result, mapping values + /// + /// # Arguments + /// - `graph`: the graph view + /// - `values`: the values indexed by flat position (i.e., `values.len() == index.len()`). + /// - `map`: Closure mapping input to output values + pub fn new_from_eval_mapped_with_index( + graph: G, + values: Vec, + index: Index, + map: impl Fn(R) -> V, + ) -> Self + where + V: std::fmt::Debug, + { + // Values are already in flat index order from TaskRunner, just map them + let values = values.into_iter().map(map).collect(); + Self::new(graph.clone(), graph, values, index) + } + /// Construct a node state from an eval result, mapping values /// /// # Arguments diff --git a/raphtory/src/db/task/mod.rs b/raphtory/src/db/task/mod.rs index 141ef726e9..276feb8d57 100644 --- a/raphtory/src/db/task/mod.rs +++ b/raphtory/src/db/task/mod.rs @@ -89,7 +89,7 @@ mod task_tests { vec![], vec![Job::new(step1)], None, - |egs, _, _, _| egs.finalize(&count), + |egs, _, _, _,_| egs.finalize(&count), Some(2), 1, None, diff --git a/raphtory/src/db/task/task_runner.rs b/raphtory/src/db/task/task_runner.rs index df75ca7296..e6c905937a 100644 --- a/raphtory/src/db/task/task_runner.rs +++ b/raphtory/src/db/task/task_runner.rs @@ -213,7 +213,11 @@ impl TaskRunner { }) } - fn make_cur_and_prev_states(&self, mut init: Vec, num_nodes: usize) -> (Vec, Vec) { + fn 
make_cur_and_prev_states( + &self, + mut init: Vec, + num_nodes: usize, + ) -> (Vec, Vec) { init.resize(num_nodes, S::default()); (init.clone(), init) @@ -221,7 +225,13 @@ impl TaskRunner { pub fn run< B, - F: FnOnce(GlobalState, EvalShardState, EvalLocalState, Vec) -> B, + F: FnOnce( + GlobalState, + EvalShardState, + EvalLocalState, + Vec, + Index, + ) -> B, S: Send + Sync + Clone + 'static + std::fmt::Debug + Default, >( &mut self, @@ -250,14 +260,20 @@ impl TaskRunner { let index = Index::for_graph(graph.clone()); println!("DEBUG TaskRunner::run:"); - println!(" graph.unfiltered_num_nodes() = {}", graph.unfiltered_num_nodes()); + println!( + " graph.unfiltered_num_nodes() = {}", + graph.unfiltered_num_nodes() + ); println!(" node_index.len() = {}", num_nodes); println!(" morcel_size = {}", morcel_size); println!(" num_chunks = {}", num_chunks); - println!(" index variant = {:?}", match &index { - Index::Full(_) => "Full", - Index::Partial(_) => "Partial", - }); + println!( + " index variant = {:?}", + match &index { + Index::Full(_) => "Full", + Index::Partial(_) => "Partial", + } + ); let mut shard_state = shard_initial_state.unwrap_or_else(|| Shard::new(num_nodes, num_chunks, morcel_size)); @@ -336,6 +352,7 @@ impl TaskRunner { EvalShardState::new(ss, self.ctx.graph(), shard_state), EvalLocalState::new(ss, self.ctx.graph(), vec![]), last_local_state, + index, ); self.ctx.reset_ss(); to_return From 74ba68d534b665b7cfe6f9ed95f693352066359d Mon Sep 17 00:00:00 2001 From: Fabian Murariu Date: Thu, 11 Dec 2025 12:07:55 +0000 Subject: [PATCH 13/24] refactor edge_loading into a separate module --- raphtory/Cargo.toml | 1 + raphtory/src/io/arrow/df_loaders/edges.rs | 428 ++++++++++++++++++ .../{df_loaders.rs => df_loaders/mod.rs} | 377 +-------------- raphtory/src/io/parquet_loaders.rs | 5 +- .../src/python/graph/io/pandas_loaders.rs | 5 +- 5 files changed, 443 insertions(+), 373 deletions(-) create mode 100644 raphtory/src/io/arrow/df_loaders/edges.rs rename 
raphtory/src/io/arrow/{df_loaders.rs => df_loaders/mod.rs} (61%) diff --git a/raphtory/Cargo.toml b/raphtory/Cargo.toml index b710a80068..3a1cdec585 100644 --- a/raphtory/Cargo.toml +++ b/raphtory/Cargo.toml @@ -107,6 +107,7 @@ dotenv = { workspace = true } # for vector testing streaming-stats = { workspace = true } indoc = { workspace = true } raphtory = { workspace = true, features = ["test-utils"] } # enable test-utils for integration tests +tikv-jemallocator = "0.6.1" [build-dependencies] prost-build = { workspace = true, optional = true } diff --git a/raphtory/src/io/arrow/df_loaders/edges.rs b/raphtory/src/io/arrow/df_loaders/edges.rs new file mode 100644 index 0000000000..33ce92d710 --- /dev/null +++ b/raphtory/src/io/arrow/df_loaders/edges.rs @@ -0,0 +1,428 @@ +use crate::{ + core::entities::nodes::node_ref::AsNodeRef, + db::api::view::StaticGraphViewOps, + errors::{into_graph_err, GraphError, LoadError}, + io::arrow::{ + dataframe::{DFChunk, DFView, SecondaryIndexCol}, + df_loaders::{build_progress_bar, process_shared_properties}, + layer_col::lift_layer_col, + node_col::NodeCol, + prop_handler::*, + }, + prelude::*, +}; +use bytemuck::checked::cast_slice_mut; +use db4_graph::WriteLockedGraph; +use itertools::izip; +use kdam::BarExt; +use raphtory_api::atomic_extra::atomic_vid_from_mut_slice; +use raphtory_api::{ + atomic_extra::atomic_usize_from_mut_slice, + core::{ + entities::EID, + storage::{dict_mapper::MaybeNew, timeindex::TimeIndexEntry, FxDashMap}, + }, +}; +use raphtory_core::entities::{GidRef, VID}; +use raphtory_storage::mutation::addition_ops::{InternalAdditionOps, SessionAdditionOps}; +use rayon::prelude::*; +use std::{ + collections::HashMap, + sync::{ + atomic::{AtomicBool, AtomicU32, AtomicUsize, Ordering}, + mpsc, + }, +}; + +pub fn load_edges_from_df( + df_view: DFView> + Send>, + time: &str, + secondary_index: Option<&str>, + src: &str, + dst: &str, + properties: &[&str], + metadata: &[&str], + shared_metadata: Option<&HashMap>, + layer: 
Option<&str>, + layer_col: Option<&str>, + graph: &G, +) -> Result<(), GraphError> { + if df_view.is_empty() { + return Ok(()); + } + + let properties_indices = properties + .iter() + .map(|name| df_view.get_index(name)) + .collect::, GraphError>>()?; + let metadata_indices = metadata + .iter() + .map(|name| df_view.get_index(name)) + .collect::, GraphError>>()?; + + let src_index = df_view.get_index(src)?; + let dst_index = df_view.get_index(dst)?; + let time_index = df_view.get_index(time)?; + let secondary_index_index = secondary_index + .map(|col| df_view.get_index(col)) + .transpose()?; + let layer_index = if let Some(layer_col) = layer_col { + Some(df_view.get_index(layer_col.as_ref())?) + } else { + None + }; + let session = graph.write_session().map_err(into_graph_err)?; + let shared_metadata = process_shared_properties(shared_metadata, |key, dtype| { + session + .resolve_edge_property(key, dtype, true) + .map_err(into_graph_err) + })?; + + // #[cfg(feature = "python")] + let mut pb = build_progress_bar("Loading edges".to_string(), df_view.num_rows)?; + + let mut src_col_resolved: Vec = vec![]; + let mut dst_col_resolved: Vec = vec![]; + let mut eid_col_resolved: Vec = vec![]; + let mut eids_exist: Vec = vec![]; // exists or needs to be created + let mut layer_eids_exist: Vec = vec![]; // exists or needs to be created + + rayon::scope(|s| { + let (tx, rx) = mpsc::sync_channel(2); + + s.spawn(move |_| { + let sender = tx; + for chunk in df_view.chunks { + sender.send(chunk).unwrap() + } + }); + + for chunk in rx.iter() { + let df = chunk?; + let prop_cols = + combine_properties_arrow(properties, &properties_indices, &df, |key, dtype| { + session + .resolve_edge_property(key, dtype, false) + .map_err(into_graph_err) + })?; + let metadata_cols = + combine_properties_arrow(metadata, &metadata_indices, &df, |key, dtype| { + session + .resolve_edge_property(key, dtype, true) + .map_err(into_graph_err) + })?; + + src_col_resolved.resize_with(df.len(), 
Default::default); + dst_col_resolved.resize_with(df.len(), Default::default); + + let atomic_src_col = atomic_vid_from_mut_slice(&mut src_col_resolved); + let atomic_dst_col = atomic_vid_from_mut_slice(&mut dst_col_resolved); + + let layer = lift_layer_col(layer, layer_index, &df)?; + let layer_col_resolved = layer.resolve(graph)?; + + let src_col = df.node_col(src_index)?; + src_col.validate(graph, LoadError::MissingSrcError)?; + + let dst_col = df.node_col(dst_index)?; + dst_col.validate(graph, LoadError::MissingDstError)?; + let gid_str_cache = resolve_nodes_with_cache::( + graph, + [(&src_col, atomic_src_col), (&dst_col, atomic_dst_col)].as_ref(), + )?; + + let time_col = df.time_col(time_index)?; + + // Load the secondary index column if it exists, otherwise generate from start_id. + let secondary_index_col = + extract_secondary_index_col::(secondary_index_index, &session, &df)?; + + let mut write_locked_graph = graph.write_lock().map_err(into_graph_err)?; + + eid_col_resolved.resize_with(df.len(), Default::default); + eids_exist.resize_with(df.len(), Default::default); + layer_eids_exist.resize_with(df.len(), Default::default); + let eid_col_shared = atomic_usize_from_mut_slice(cast_slice_mut(&mut eid_col_resolved)); + + let edges = write_locked_graph.graph().storage().edges().clone(); + let next_edge_id = |row: usize| { + let (page, pos) = edges.reserve_free_pos(row); + pos.as_eid(page, edges.max_page_len()) + }; + + let mut per_segment_edge_count = Vec::with_capacity(write_locked_graph.nodes.len()); + per_segment_edge_count + .resize_with(write_locked_graph.nodes.len(), || AtomicUsize::new(0)); + + let WriteLockedGraph { + nodes, ref edges, .. 
+ } = &mut write_locked_graph; + + let counts_per_segment = (0..nodes.len()) + .map(|_| AtomicU32::new(0)) + .collect::>(); + + // Generate all edge_ids + add outbound edges + nodes + .par_iter_mut() + .enumerate() // TODO: change to par_iter_mut but preserve edge_id order + .for_each(|(p_id, locked_page)| { + // Zip all columns for iteration. + let zip = izip!( + src_col_resolved.iter(), + dst_col_resolved.iter(), + time_col.iter(), + secondary_index_col.iter(), + layer_col_resolved.iter() + ); + + for entry in gid_str_cache.iter() { + let (src_gid, vid) = entry.value(); + + if let Some(src_pos) = locked_page.resolve_pos(vid.inner()) { + let mut writer = locked_page.writer(); + writer.store_node_id(src_pos, 0, src_gid.clone(), 0); + } + } + + for (row, (src, dst, time, secondary_index, layer)) in zip.enumerate() { + if let Some(src_pos) = locked_page.resolve_pos(*src) { + counts_per_segment[p_id].fetch_add(1, Ordering::Relaxed); + let mut writer = locked_page.writer(); + let t = TimeIndexEntry(time, secondary_index); + // find the original EID in the static graph if it exists + // otherwise create a new one + + let edge_id = + if let Some(edge_id) = writer.get_out_edge(src_pos, *dst, 0) { + eid_col_shared[row].store(edge_id.0, Ordering::Relaxed); + eids_exist[row].store(true, Ordering::Relaxed); + edge_id.with_layer(*layer) + } else { + let edge_id = next_edge_id(row); + writer.add_static_outbound_edge(src_pos, *dst, edge_id, 0); + eid_col_shared[row].store(edge_id.0, Ordering::Relaxed); + eids_exist[row].store(false, Ordering::Relaxed); + edge_id.with_layer(*layer) + }; + + if edges.exists(edge_id) + // || writer.get_out_edge(src_pos, *dst, *layer).is_some() + { + layer_eids_exist[row].store(true, Ordering::Relaxed); + // node additions + writer.update_timestamp(t, src_pos, edge_id, 0); + } else { + layer_eids_exist[row].store(false, Ordering::Relaxed); + // actually adds the edge + writer.add_outbound_edge(Some(t), src_pos, *dst, edge_id, 0); + } + } + } + }); + 
+ // s.spawn(|_| { + write_locked_graph.nodes.par_iter_mut().for_each(|shard| { + let zip = izip!( + src_col_resolved.iter(), + dst_col_resolved.iter(), + eid_col_resolved.iter(), + time_col.iter(), + secondary_index_col.iter(), + layer_col_resolved.iter(), + layer_eids_exist.iter().map(|a| a.load(Ordering::Relaxed)), + eids_exist.iter().map(|b| b.load(Ordering::Relaxed)) + ); + + for ( + src, + dst, + eid, + time, + secondary_index, + layer, + edge_exists_in_layer, + edge_exists_in_static_graph, + ) in zip + { + if let Some(dst_pos) = shard.resolve_pos(*dst) { + let t = TimeIndexEntry(time, secondary_index); + let mut writer = shard.writer(); + + if !edge_exists_in_static_graph { + writer.add_static_inbound_edge(dst_pos, *src, *eid, 0); + } + + if !edge_exists_in_layer { + writer.add_inbound_edge( + Some(t), + dst_pos, + *src, + eid.with_layer(*layer), + 0, + ); + } else { + writer.update_timestamp(t, dst_pos, eid.with_layer(*layer), 0); + } + } + } + }); + // }); + + drop(write_locked_graph); + + let mut write_locked_graph = graph.write_lock().map_err(into_graph_err)?; + + let per_edge_segment_event_count = (0..write_locked_graph.edges.len()) + .map(|_| AtomicUsize::new(0)) + .collect::>(); + + // // Add temporal & constant properties to edges + // sc.spawn(|_| { + let now = std::time::Instant::now(); + write_locked_graph + .edges + .par_iter_mut() + .enumerate() + .for_each(|(seg_id, shard)| { + let zip = izip!( + src_col_resolved.iter(), + dst_col_resolved.iter(), + time_col.iter(), + secondary_index_col.iter(), + eid_col_resolved.iter(), + layer_col_resolved.iter(), + eids_exist + .iter() + .map(|exists| exists.load(Ordering::Relaxed)) + ); + let mut t_props: Vec<(usize, Prop)> = vec![]; + let mut c_props: Vec<(usize, Prop)> = vec![]; + + for (row, (src, dst, time, secondary_index, eid, layer, exists)) in + zip.enumerate() + { + if let Some(eid_pos) = shard.resolve_pos(*eid) { + per_edge_segment_event_count[seg_id].fetch_add(1, Ordering::Relaxed); + let t = 
TimeIndexEntry(time, secondary_index); + let mut writer = shard.writer(); + + t_props.clear(); + t_props.extend(prop_cols.iter_row(row)); + + c_props.clear(); + c_props.extend(metadata_cols.iter_row(row)); + c_props.extend_from_slice(&shared_metadata); + + writer.bulk_add_edge( + t, + eid_pos, + *src, + *dst, + exists, + *layer, + c_props.drain(..), + t_props.drain(..), + 0, + ); + } + } + }); + + println!("Loading edge events took {:?}", now.elapsed()); + // }); + // }); + // + // println!( + // "Per edge segment event count: {:?}", + // per_edge_segment_event_count + // ); + + // #[cfg(feature = "python")] + let _ = pb.update(df.len()); + } + Ok::<_, GraphError>(()) + })?; + // set the type of the resolver; + + Ok(()) +} + +fn resolve_nodes_with_cache<'a, G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps>( + graph: &G, + cols_to_resolve: &[(&'a NodeCol, &mut [AtomicUsize])], +) -> Result, (Prop, MaybeNew)>, GraphError> { + let gid_str_cache: dashmap::DashMap, (Prop, MaybeNew), _> = + FxDashMap::default(); + let hasher_factory = gid_str_cache.hasher().clone(); + gid_str_cache + .shards() + .par_iter() + .enumerate() + .try_for_each(|(shard_idx, shard)| { + let mut shard_guard = shard.write(); + use dashmap::SharedValue; + use std::hash::BuildHasher; + + // Create hasher function for this shard + let hash_key = |key: &GidRef<'_>| -> u64 { hasher_factory.hash_one(key) }; + + let hasher_fn = |tuple: &(GidRef<'_>, SharedValue<(Prop, MaybeNew)>)| -> u64 { + hasher_factory.hash_one(tuple.0) + }; + + for (col, atomic_col) in cols_to_resolve { + // Process src_col sequentially for this shard + for (idx, gid) in col.iter().enumerate() { + // Check if this key belongs to this shard + if gid_str_cache.determine_map(&gid) != shard_idx { + continue; // Skip, not our shard + } + + let hash = hash_key(&gid); + + // Check if exists in this shard + if let Some((_, value)) = shard_guard.get(hash, |(g, _)| g == &gid) { + let (_, vid) = value.get(); + 
atomic_col[idx].store(vid.inner().index(), Ordering::Relaxed); + } else { + let vid = graph + .resolve_node(gid.as_node_ref()) + .map_err(|_| LoadError::FatalError)?; + + let data = (gid, SharedValue::new((Prop::from(gid), vid))); + shard_guard.insert(hash, data, hasher_fn); + + atomic_col[idx].store(vid.inner().index(), Ordering::Relaxed); + } + } + } + + Ok::<(), LoadError>(()) + })?; + Ok(gid_str_cache) +} + +fn extract_secondary_index_col( + secondary_index_index: Option, + session: &::WS<'_>, + df: &DFChunk, +) -> Result { + let secondary_index_col = match secondary_index_index { + Some(col_index) => { + // Update the event_id to reflect ingesting new secondary indices. + let col = df.secondary_index_col(col_index)?; + session + .set_max_event_id(col.max()) + .map_err(into_graph_err)?; + col + } + None => { + let start_id = session + .reserve_event_ids(df.len()) + .map_err(into_graph_err)?; + SecondaryIndexCol::new_from_range(start_id, start_id + df.len()) + } + }; + Ok(secondary_index_col) +} diff --git a/raphtory/src/io/arrow/df_loaders.rs b/raphtory/src/io/arrow/df_loaders/mod.rs similarity index 61% rename from raphtory/src/io/arrow/df_loaders.rs rename to raphtory/src/io/arrow/df_loaders/mod.rs index 87ab30e9dc..40cd61e913 100644 --- a/raphtory/src/io/arrow/df_loaders.rs +++ b/raphtory/src/io/arrow/df_loaders/mod.rs @@ -9,13 +9,12 @@ use crate::{ }, prelude::*, }; -use arrow::array::BooleanArray; use bytemuck::checked::cast_slice_mut; use db4_graph::WriteLockedGraph; use either::Either; use itertools::izip; use kdam::{Bar, BarBuilder, BarExt}; -use raphtory_api::core::storage::FxHashMap; +use raphtory_api::atomic_extra::atomic_vid_from_mut_slice; use raphtory_api::{ atomic_extra::atomic_usize_from_mut_slice, core::{ @@ -30,22 +29,19 @@ use raphtory_core::{ entities::{graph::logical_to_physical::ResolverShardT, GidRef, VID}, storage::timeindex::AsTime, }; -use raphtory_storage::{ - core_ops::CoreGraphOps, - layer_ops::InternalLayerOps, - 
mutation::addition_ops::{InternalAdditionOps, SessionAdditionOps}, -}; +use raphtory_storage::mutation::addition_ops::{InternalAdditionOps, SessionAdditionOps}; use rayon::prelude::*; use std::{ borrow::{Borrow, Cow}, collections::HashMap, sync::{ - atomic::{AtomicBool, AtomicUsize, Ordering}, - mpsc, Arc, + atomic::{AtomicBool, AtomicU32, AtomicUsize, Ordering}, + mpsc, }, - usize, }; +pub mod edges; + fn build_progress_bar(des: String, num_rows: usize) -> Result { BarBuilder::default() .desc(des) @@ -231,367 +227,6 @@ pub fn load_nodes_from_df< Ok(()) } -pub fn load_edges_from_df( - df_view: DFView> + Send>, - time: &str, - secondary_index: Option<&str>, - src: &str, - dst: &str, - properties: &[&str], - metadata: &[&str], - shared_metadata: Option<&HashMap>, - layer: Option<&str>, - layer_col: Option<&str>, - graph: &G, -) -> Result<(), GraphError> { - if df_view.is_empty() { - return Ok(()); - } - - let properties_indices = properties - .iter() - .map(|name| df_view.get_index(name)) - .collect::, GraphError>>()?; - let metadata_indices = metadata - .iter() - .map(|name| df_view.get_index(name)) - .collect::, GraphError>>()?; - - let src_index = df_view.get_index(src)?; - let dst_index = df_view.get_index(dst)?; - let time_index = df_view.get_index(time)?; - let secondary_index_index = secondary_index - .map(|col| df_view.get_index(col)) - .transpose()?; - let layer_index = if let Some(layer_col) = layer_col { - Some(df_view.get_index(layer_col.as_ref())?) 
- } else { - None - }; - let session = graph.write_session().map_err(into_graph_err)?; - let shared_metadata = process_shared_properties(shared_metadata, |key, dtype| { - session - .resolve_edge_property(key, dtype, true) - .map_err(into_graph_err) - })?; - - // #[cfg(feature = "python")] - let mut pb = build_progress_bar("Loading edges".to_string(), df_view.num_rows)?; - - let mut src_col_resolved: Vec = vec![]; - let mut dst_col_resolved: Vec = vec![]; - let mut eid_col_resolved: Vec = vec![]; - let mut eids_exist: Vec = vec![]; // exists or needs to be created - let mut layer_eids_exist: Vec = vec![]; // exists or needs to be created - - rayon::scope(|s| { - let (tx, rx) = mpsc::sync_channel(2); - - s.spawn(move |_| { - let sender = tx; - for chunk in df_view.chunks { - sender.send(chunk).unwrap() - } - }); - - for chunk in rx.iter() { - let df = chunk?; - let prop_cols = - combine_properties_arrow(properties, &properties_indices, &df, |key, dtype| { - session - .resolve_edge_property(key, dtype, false) - .map_err(into_graph_err) - })?; - let metadata_cols = - combine_properties_arrow(metadata, &metadata_indices, &df, |key, dtype| { - session - .resolve_edge_property(key, dtype, true) - .map_err(into_graph_err) - })?; - - src_col_resolved.resize_with(df.len(), Default::default); - dst_col_resolved.resize_with(df.len(), Default::default); - - // let src_col_shared = atomic_usize_from_mut_slice(cast_slice_mut(&mut src_col_resolved)); - // let dst_col_shared = atomic_usize_from_mut_slice(cast_slice_mut(&mut dst_col_resolved)); - - let layer = lift_layer_col(layer, layer_index, &df)?; - let layer_col_resolved = layer.resolve(graph)?; - - let src_col = df.node_col(src_index)?; - src_col.validate(graph, LoadError::MissingSrcError)?; - - let dst_col = df.node_col(dst_index)?; - dst_col.validate(graph, LoadError::MissingDstError)?; - - // avoid allocation of ArcStr - let gid_str_cache = FxDashMap::default(); - // It's our graph, no one else can change it - src_col - 
.par_iter() - .zip(src_col_resolved.par_iter_mut()) - .try_for_each(|(gid, resolved)| { - let gid = gid.ok_or(LoadError::FatalError)?; - let vid = graph - .resolve_node(gid.as_node_ref()) - .map_err(|_| LoadError::FatalError)?; - - if vid.is_new() && gid_str_cache.get(&gid).is_none() { - gid_str_cache - .entry(gid) - .or_insert_with(|| (Prop::from(gid), vid)); - }; - - *resolved = vid.inner(); - Ok::<(), LoadError>(()) - })?; - - dst_col - .par_iter() - .zip(dst_col_resolved.par_iter_mut()) - .try_for_each(|(gid, resolved)| { - let gid = gid.ok_or(LoadError::FatalError)?; - - let vid = graph - .resolve_node(gid.as_node_ref()) - .map_err(|_| LoadError::FatalError)?; - - if vid.is_new() && gid_str_cache.get(&gid).is_none() { - gid_str_cache - .entry(gid) - .or_insert_with(|| (Prop::from(gid), vid)); - }; - - *resolved = vid.inner(); - Ok::<(), LoadError>(()) - })?; - - let time_col = df.time_col(time_index)?; - - // Load the secondary index column if it exists, otherwise generate from start_id. - let secondary_index_col = match secondary_index_index { - Some(col_index) => { - // Update the event_id to reflect ingesting new secondary indices. 
- let col = df.secondary_index_col(col_index)?; - session - .set_max_event_id(col.max()) - .map_err(into_graph_err)?; - col - } - None => { - let start_id = session - .reserve_event_ids(df.len()) - .map_err(into_graph_err)?; - SecondaryIndexCol::new_from_range(start_id, start_id + df.len()) - } - }; - - let mut write_locked_graph = graph.write_lock().map_err(into_graph_err)?; - - eid_col_resolved.resize_with(df.len(), Default::default); - eids_exist.resize_with(df.len(), Default::default); - layer_eids_exist.resize_with(df.len(), Default::default); - let eid_col_shared = atomic_usize_from_mut_slice(cast_slice_mut(&mut eid_col_resolved)); - - let edges = write_locked_graph.graph().storage().edges().clone(); - let next_edge_id = |row: usize| { - let (page, pos) = edges.reserve_free_pos(row); - pos.as_eid(page, edges.max_page_len()) - }; - - let mut per_segment_edge_count = Vec::with_capacity(write_locked_graph.nodes.len()); - per_segment_edge_count - .resize_with(write_locked_graph.nodes.len(), || AtomicUsize::new(0)); - - let WriteLockedGraph { - nodes, ref edges, .. - } = &mut write_locked_graph; - - // Generate all edge_ids + add outbound edges - nodes - .par_iter_mut() - .enumerate() // TODO: change to par_iter_mut but preserve edge_id order - .for_each(|(p_id, locked_page)| { - // Zip all columns for iteration. 
- let zip = izip!( - src_col_resolved.iter(), - dst_col_resolved.iter(), - time_col.iter(), - secondary_index_col.iter(), - layer_col_resolved.iter() - ); - - for entry in gid_str_cache.iter() { - let (src_gid, vid) = entry.value(); - - if let Some(src_pos) = locked_page.resolve_pos(vid.inner()) { - let mut writer = locked_page.writer(); - writer.store_node_id(src_pos, 0, src_gid.clone(), 0); - } - } - - for (row, (src, dst, time, secondary_index, layer)) in zip.enumerate() { - if let Some(src_pos) = locked_page.resolve_pos(*src) { - let mut writer = locked_page.writer(); - let t = TimeIndexEntry(time, secondary_index); - // find the original EID in the static graph if it exists - // otherwise create a new one - - let edge_id = - if let Some(edge_id) = writer.get_out_edge(src_pos, *dst, 0) { - eid_col_shared[row].store(edge_id.0, Ordering::Relaxed); - eids_exist[row].store(true, Ordering::Relaxed); - edge_id.with_layer(*layer) - } else { - let edge_id = next_edge_id(row); - writer.add_static_outbound_edge(src_pos, *dst, edge_id, 0); - eid_col_shared[row].store(edge_id.0, Ordering::Relaxed); - eids_exist[row].store(false, Ordering::Relaxed); - edge_id.with_layer(*layer) - }; - - if edges.exists(edge_id) - // || writer.get_out_edge(src_pos, *dst, *layer).is_some() - { - layer_eids_exist[row].store(true, Ordering::Relaxed); - // node additions - writer.update_timestamp(t, src_pos, edge_id, 0); - } else { - layer_eids_exist[row].store(false, Ordering::Relaxed); - // actually adds the edge - writer.add_outbound_edge(Some(t), src_pos, *dst, edge_id, 0); - } - } - } - }); - - // s.spawn(|_| { - write_locked_graph.nodes.par_iter_mut().for_each(|shard| { - let zip = izip!( - src_col_resolved.iter(), - dst_col_resolved.iter(), - eid_col_resolved.iter(), - time_col.iter(), - secondary_index_col.iter(), - layer_col_resolved.iter(), - layer_eids_exist.iter().map(|a| a.load(Ordering::Relaxed)), - eids_exist.iter().map(|b| b.load(Ordering::Relaxed)) - ); - - for ( - src, - dst, - 
eid, - time, - secondary_index, - layer, - edge_exists_in_layer, - edge_exists_in_static_graph, - ) in zip - { - if let Some(dst_pos) = shard.resolve_pos(*dst) { - let t = TimeIndexEntry(time, secondary_index); - let mut writer = shard.writer(); - - if !edge_exists_in_static_graph { - writer.add_static_inbound_edge(dst_pos, *src, *eid, 0); - } - - if !edge_exists_in_layer { - writer.add_inbound_edge( - Some(t), - dst_pos, - *src, - eid.with_layer(*layer), - 0, - ); - } else { - writer.update_timestamp(t, dst_pos, eid.with_layer(*layer), 0); - } - } - } - }); - // }); - - drop(write_locked_graph); - - let mut write_locked_graph = graph.write_lock().map_err(into_graph_err)?; - - let per_edge_segment_event_count = (0..write_locked_graph.edges.len()) - .map(|_| AtomicUsize::new(0)) - .collect::>(); - - // // Add temporal & constant properties to edges - // sc.spawn(|_| { - let now = std::time::Instant::now(); - write_locked_graph - .edges - .par_iter_mut() - .enumerate() - .for_each(|(seg_id, shard)| { - let zip = izip!( - src_col_resolved.iter(), - dst_col_resolved.iter(), - time_col.iter(), - secondary_index_col.iter(), - eid_col_resolved.iter(), - layer_col_resolved.iter(), - eids_exist - .iter() - .map(|exists| exists.load(Ordering::Relaxed)) - ); - let mut t_props: Vec<(usize, Prop)> = vec![]; - let mut c_props: Vec<(usize, Prop)> = vec![]; - - for (row, (src, dst, time, secondary_index, eid, layer, exists)) in - zip.enumerate() - { - if let Some(eid_pos) = shard.resolve_pos(*eid) { - per_edge_segment_event_count[seg_id].fetch_add(1, Ordering::Relaxed); - let t = TimeIndexEntry(time, secondary_index); - let mut writer = shard.writer(); - - t_props.clear(); - t_props.extend(prop_cols.iter_row(row)); - - c_props.clear(); - c_props.extend(metadata_cols.iter_row(row)); - c_props.extend_from_slice(&shared_metadata); - - writer.bulk_add_edge( - t, - eid_pos, - *src, - *dst, - exists, - *layer, - c_props.drain(..), - t_props.drain(..), - 0, - ); - } - } - }); - - 
println!("Loading edge events took {:?}", now.elapsed()); - // }); - // }); - // - // println!( - // "Per edge segment event count: {:?}", - // per_edge_segment_event_count - // ); - - // #[cfg(feature = "python")] - let _ = pb.update(df.len()); - } - Ok::<_, GraphError>(()) - })?; - // set the type of the resolver; - - Ok(()) -} - fn load_into_shard( src_col_shared: &[AtomicUsize], dst_col_shared: &[AtomicUsize], diff --git a/raphtory/src/io/parquet_loaders.rs b/raphtory/src/io/parquet_loaders.rs index e2251ba160..c5ebe7fe38 100644 --- a/raphtory/src/io/parquet_loaders.rs +++ b/raphtory/src/io/parquet_loaders.rs @@ -1,7 +1,10 @@ use crate::{ db::api::view::StaticGraphViewOps, errors::{GraphError, InvalidPathReason::PathDoesNotExist}, - io::arrow::{dataframe::*, df_loaders::*}, + io::arrow::{ + dataframe::*, + df_loaders::{edges::load_edges_from_df, *}, + }, prelude::{AdditionOps, DeletionOps, PropertyAdditionOps}, }; use parquet::arrow::{arrow_reader::ParquetRecordBatchReaderBuilder, ProjectionMask}; diff --git a/raphtory/src/python/graph/io/pandas_loaders.rs b/raphtory/src/python/graph/io/pandas_loaders.rs index 14811d41e2..8421322b46 100644 --- a/raphtory/src/python/graph/io/pandas_loaders.rs +++ b/raphtory/src/python/graph/io/pandas_loaders.rs @@ -1,7 +1,10 @@ use crate::{ db::api::view::StaticGraphViewOps, errors::GraphError, - io::arrow::{dataframe::*, df_loaders::*}, + io::arrow::{ + dataframe::*, + df_loaders::{edges::load_edges_from_df, *}, + }, prelude::{AdditionOps, PropertyAdditionOps}, python::graph::io::*, }; From 51ff4623cd1ac0d3cbd5cc84c1c478b88e343d34 Mon Sep 17 00:00:00 2001 From: Fabian Murariu Date: Thu, 11 Dec 2025 14:24:50 +0000 Subject: [PATCH 14/24] refactor load_edges_from_df --- .../src/entities/properties/tcell.rs | 2 +- raphtory/src/io/arrow/df_loaders/edges.rs | 337 ++++++++++-------- 2 files changed, 180 insertions(+), 159 deletions(-) diff --git a/raphtory-core/src/entities/properties/tcell.rs 
b/raphtory-core/src/entities/properties/tcell.rs index e51b8ae77c..3ef808b5d7 100644 --- a/raphtory-core/src/entities/properties/tcell.rs +++ b/raphtory-core/src/entities/properties/tcell.rs @@ -23,7 +23,7 @@ enum TCellVariants { TCellN(TCellN), } -const BTREE_CUTOFF: usize = 16; +const BTREE_CUTOFF: usize = 32; impl TCell { pub fn new(t: TimeIndexEntry, value: A) -> Self { diff --git a/raphtory/src/io/arrow/df_loaders/edges.rs b/raphtory/src/io/arrow/df_loaders/edges.rs index 33ce92d710..92bdd6cef3 100644 --- a/raphtory/src/io/arrow/df_loaders/edges.rs +++ b/raphtory/src/io/arrow/df_loaders/edges.rs @@ -33,7 +33,16 @@ use std::{ mpsc, }, }; +use storage::{ + api::{edges::EdgeSegmentOps, nodes::NodeSegmentOps}, + pages::locked::{ + edges::{LockedEdgePage, WriteLockedEdgePages}, + nodes::LockedNodePage, + }, + Extension, +}; +#[allow(clippy::too_many_arguments)] pub fn load_edges_from_df( df_view: DFView> + Send>, time: &str, @@ -158,70 +167,30 @@ pub fn load_edges_from_df>(); - // Generate all edge_ids + add outbound edges - nodes - .par_iter_mut() - .enumerate() // TODO: change to par_iter_mut but preserve edge_id order - .for_each(|(p_id, locked_page)| { - // Zip all columns for iteration. - let zip = izip!( - src_col_resolved.iter(), - dst_col_resolved.iter(), - time_col.iter(), - secondary_index_col.iter(), - layer_col_resolved.iter() - ); - - for entry in gid_str_cache.iter() { - let (src_gid, vid) = entry.value(); - - if let Some(src_pos) = locked_page.resolve_pos(vid.inner()) { - let mut writer = locked_page.writer(); - writer.store_node_id(src_pos, 0, src_gid.clone(), 0); - } - } + nodes.par_iter_mut().for_each(|locked_page| { + // Zip all columns for iteration. 
+ let zip = izip!( + src_col_resolved.iter(), + dst_col_resolved.iter(), + time_col.iter(), + secondary_index_col.iter(), + layer_col_resolved.iter() + ); - for (row, (src, dst, time, secondary_index, layer)) in zip.enumerate() { - if let Some(src_pos) = locked_page.resolve_pos(*src) { - counts_per_segment[p_id].fetch_add(1, Ordering::Relaxed); - let mut writer = locked_page.writer(); - let t = TimeIndexEntry(time, secondary_index); - // find the original EID in the static graph if it exists - // otherwise create a new one - - let edge_id = - if let Some(edge_id) = writer.get_out_edge(src_pos, *dst, 0) { - eid_col_shared[row].store(edge_id.0, Ordering::Relaxed); - eids_exist[row].store(true, Ordering::Relaxed); - edge_id.with_layer(*layer) - } else { - let edge_id = next_edge_id(row); - writer.add_static_outbound_edge(src_pos, *dst, edge_id, 0); - eid_col_shared[row].store(edge_id.0, Ordering::Relaxed); - eids_exist[row].store(false, Ordering::Relaxed); - edge_id.with_layer(*layer) - }; - - if edges.exists(edge_id) - // || writer.get_out_edge(src_pos, *dst, *layer).is_some() - { - layer_eids_exist[row].store(true, Ordering::Relaxed); - // node additions - writer.update_timestamp(t, src_pos, edge_id, 0); - } else { - layer_eids_exist[row].store(false, Ordering::Relaxed); - // actually adds the edge - writer.add_outbound_edge(Some(t), src_pos, *dst, edge_id, 0); - } - } - } - }); + store_node_ids(&gid_str_cache, locked_page); + + add_and_resolve_outbound_edges( + &eids_exist, + &layer_eids_exist, + &eid_col_shared, + next_edge_id, + edges, + locked_page, + zip, + ); + }); - // s.spawn(|_| { write_locked_graph.nodes.par_iter_mut().for_each(|shard| { let zip = izip!( src_col_resolved.iter(), @@ -234,109 +203,27 @@ pub fn load_edges_from_df>(); - - // // Add temporal & constant properties to edges - // sc.spawn(|_| { - let now = std::time::Instant::now(); - write_locked_graph - .edges - .par_iter_mut() - .enumerate() - .for_each(|(seg_id, shard)| { - let zip = izip!( - 
src_col_resolved.iter(), - dst_col_resolved.iter(), - time_col.iter(), - secondary_index_col.iter(), - eid_col_resolved.iter(), - layer_col_resolved.iter(), - eids_exist - .iter() - .map(|exists| exists.load(Ordering::Relaxed)) - ); - let mut t_props: Vec<(usize, Prop)> = vec![]; - let mut c_props: Vec<(usize, Prop)> = vec![]; - - for (row, (src, dst, time, secondary_index, eid, layer, exists)) in - zip.enumerate() - { - if let Some(eid_pos) = shard.resolve_pos(*eid) { - per_edge_segment_event_count[seg_id].fetch_add(1, Ordering::Relaxed); - let t = TimeIndexEntry(time, secondary_index); - let mut writer = shard.writer(); - - t_props.clear(); - t_props.extend(prop_cols.iter_row(row)); - - c_props.clear(); - c_props.extend(metadata_cols.iter_row(row)); - c_props.extend_from_slice(&shared_metadata); - - writer.bulk_add_edge( - t, - eid_pos, - *src, - *dst, - exists, - *layer, - c_props.drain(..), - t_props.drain(..), - 0, - ); - } - } - }); - - println!("Loading edge events took {:?}", now.elapsed()); - // }); - // }); - // - // println!( - // "Per edge segment event count: {:?}", - // per_edge_segment_event_count - // ); + write_locked_graph.edges.par_iter_mut().for_each(|shard| { + let zip = izip!( + src_col_resolved.iter(), + dst_col_resolved.iter(), + time_col.iter(), + secondary_index_col.iter(), + eid_col_resolved.iter(), + layer_col_resolved.iter(), + eids_exist + .iter() + .map(|exists| exists.load(Ordering::Relaxed)) + ); + update_edge_properties(&shared_metadata, &prop_cols, &metadata_cols, shard, zip); + }); // #[cfg(feature = "python")] let _ = pb.update(df.len()); @@ -348,6 +235,139 @@ pub fn load_edges_from_df>( + shared_metadata: &[(usize, Prop)], + prop_cols: &PropCols, + metadata_cols: &PropCols, + shard: &mut LockedEdgePage<'_, ES>, + zip: impl Iterator, +) { + let mut t_props: Vec<(usize, Prop)> = vec![]; + let mut c_props: Vec<(usize, Prop)> = vec![]; + + for (row, (src, dst, time, secondary_index, eid, layer, exists)) in zip.enumerate() { + if 
let Some(eid_pos) = shard.resolve_pos(*eid) { + let t = TimeIndexEntry(time, secondary_index); + let mut writer = shard.writer(); + + t_props.clear(); + t_props.extend(prop_cols.iter_row(row)); + + c_props.clear(); + c_props.extend(metadata_cols.iter_row(row)); + c_props.extend_from_slice(shared_metadata); + + writer.bulk_add_edge( + t, + eid_pos, + *src, + *dst, + exists, + *layer, + c_props.drain(..), + t_props.drain(..), + 0, + ); + } + } +} + +#[inline(never)] +fn add_outbound_edges<'a, NS: NodeSegmentOps>( + shard: &mut LockedNodePage<'_, NS>, + zip: impl Iterator, +) { + for ( + src, + dst, + eid, + time, + secondary_index, + layer, + edge_exists_in_layer, + edge_exists_in_static_graph, + ) in zip + { + if let Some(dst_pos) = shard.resolve_pos(*dst) { + let t = TimeIndexEntry(time, secondary_index); + let mut writer = shard.writer(); + + if !edge_exists_in_static_graph { + writer.add_static_inbound_edge(dst_pos, *src, *eid, 0); + } + + if !edge_exists_in_layer { + writer.add_inbound_edge(Some(t), dst_pos, *src, eid.with_layer(*layer), 0); + } else { + writer.update_timestamp(t, dst_pos, eid.with_layer(*layer), 0); + } + } + } +} + +#[inline(never)] +fn add_and_resolve_outbound_edges< + 'a, + NS: NodeSegmentOps, + ES: EdgeSegmentOps, +>( + eids_exist: &[AtomicBool], + layer_eids_exist: &[AtomicBool], + eid_col_shared: &&mut [AtomicUsize], + next_edge_id: impl Fn(usize) -> EID, + edges: &WriteLockedEdgePages<'_, ES>, + locked_page: &mut LockedNodePage<'_, NS>, + zip: impl Iterator, +) { + for (row, (src, dst, time, secondary_index, layer)) in zip.enumerate() { + if let Some(src_pos) = locked_page.resolve_pos(*src) { + let mut writer = locked_page.writer(); + let t = TimeIndexEntry(time, secondary_index); + // find the original EID in the static graph if it exists + // otherwise create a new one + + let edge_id = if let Some(edge_id) = writer.get_out_edge(src_pos, *dst, 0) { + eid_col_shared[row].store(edge_id.0, Ordering::Relaxed); + eids_exist[row].store(true, 
Ordering::Relaxed); + edge_id.with_layer(*layer) + } else { + let edge_id = next_edge_id(row); + writer.add_static_outbound_edge(src_pos, *dst, edge_id, 0); + eid_col_shared[row].store(edge_id.0, Ordering::Relaxed); + eids_exist[row].store(false, Ordering::Relaxed); + edge_id.with_layer(*layer) + }; + + if edges.exists(edge_id) { + layer_eids_exist[row].store(true, Ordering::Relaxed); + // node additions + writer.update_timestamp(t, src_pos, edge_id, 0); + } else { + layer_eids_exist[row].store(false, Ordering::Relaxed); + // actually adds the edge + writer.add_outbound_edge(Some(t), src_pos, *dst, edge_id, 0); + } + } + } +} + +#[inline(never)] +fn store_node_ids>( + gid_str_cache: &FxDashMap, (Prop, MaybeNew)>, + locked_page: &mut LockedNodePage<'_, NS>, +) { + for entry in gid_str_cache.iter() { + let (src_gid, vid) = entry.value(); + + if let Some(src_pos) = locked_page.resolve_pos(vid.inner()) { + let mut writer = locked_page.writer(); + writer.store_node_id(src_pos, 0, src_gid.clone(), 0); + } + } +} + +#[inline(never)] fn resolve_nodes_with_cache<'a, G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps>( graph: &G, cols_to_resolve: &[(&'a NodeCol, &mut [AtomicUsize])], @@ -403,6 +423,7 @@ fn resolve_nodes_with_cache<'a, G: StaticGraphViewOps + PropertyAdditionOps + Ad Ok(gid_str_cache) } +#[inline(never)] fn extract_secondary_index_col( secondary_index_index: Option, session: &::WS<'_>, From 90b997c5d5be6b62e5fdf542064dc84da8496d2b Mon Sep 17 00:00:00 2001 From: Fabian Murariu Date: Thu, 11 Dec 2025 16:56:39 +0000 Subject: [PATCH 15/24] introduce stable ids in parquet loading --- db4-storage/src/pages/edge_page/writer.rs | 29 +-- db4-storage/src/properties/mod.rs | 2 +- db4-storage/src/segments/edge/segment.rs | 224 ------------------ raphtory-core/src/storage/mod.rs | 62 +---- raphtory/src/io/arrow/dataframe.rs | 8 +- raphtory/src/io/arrow/df_loaders/edges.rs | 176 +++++++++++--- raphtory/src/io/arrow/mod.rs | 2 +- raphtory/src/io/parquet_loaders.rs | 
18 +- .../src/python/graph/io/pandas_loaders.rs | 18 +- raphtory/src/serialise/parquet/edges.rs | 7 +- raphtory/src/serialise/parquet/mod.rs | 15 +- raphtory/src/serialise/parquet/model.rs | 6 +- raphtory/tests/df_loaders.rs | 2 +- 13 files changed, 194 insertions(+), 375 deletions(-) diff --git a/db4-storage/src/pages/edge_page/writer.rs b/db4-storage/src/pages/edge_page/writer.rs index ab9a2a4670..0e619bebfd 100644 --- a/db4-storage/src/pages/edge_page/writer.rs +++ b/db4-storage/src/pages/edge_page/writer.rs @@ -3,9 +3,7 @@ use crate::{ segments::edge::segment::MemEdgeSegment, }; use raphtory_api::core::entities::{VID, properties::prop::Prop}; -use raphtory_core::{ - storage::timeindex::{AsTime, TimeIndexEntry}, -}; +use raphtory_core::storage::timeindex::{AsTime, TimeIndexEntry}; use std::ops::DerefMut; pub struct EdgeWriter< @@ -57,31 +55,6 @@ impl<'a, MP: DerefMut + std::fmt::Debug, ES: EdgeSegmen edge_pos } - // pub fn bulk_add_edges( - // &mut self, - // mask: &BooleanArray, - // time: &[i64], - // start_idx: usize, - // eids: &[EID], - // srcs: &[VID], - // dsts: &[VID], - // layer_id: usize, - // cols: &[ArrayRef], - // cols_prop_ids: &[usize], - // ) { - // self.writer.bulk_insert_edges_internal( - // mask, - // time, - // start_idx, - // eids, - // srcs, - // dsts, - // layer_id, - // cols, - // cols_prop_ids, - // ); - // } - pub fn delete_edge( &mut self, t: T, diff --git a/db4-storage/src/properties/mod.rs b/db4-storage/src/properties/mod.rs index 14ae383845..6d1a9356fa 100644 --- a/db4-storage/src/properties/mod.rs +++ b/db4-storage/src/properties/mod.rs @@ -23,7 +23,7 @@ use std::sync::Arc; pub mod props_meta_writer; -#[derive(Debug, Default, serde::Serialize)] +#[derive(Debug, Default)] pub struct Properties { c_properties: Vec, diff --git a/db4-storage/src/segments/edge/segment.rs b/db4-storage/src/segments/edge/segment.rs index 66c072ef30..a84106ee59 100644 --- a/db4-storage/src/segments/edge/segment.rs +++ 
b/db4-storage/src/segments/edge/segment.rs @@ -144,61 +144,6 @@ impl MemEdgeSegment { .map(|entry| (entry.src, entry.dst)) } - pub fn bulk_insert_edges_internal( - &mut self, - mask: &BooleanArray, - time: &[i64], - time_sec_index: usize, - eids: &[EID], - srcs: &[VID], - dsts: &[VID], - layer_id: usize, - cols: &[ArrayRef], - col_mapping: &[usize], // mapping from cols to the property id - ) { - self.ensure_layer(layer_id); - let est_size = self.layers[layer_id].est_size(); - let t_col_offset = self.layers[layer_id].properties().t_len(); - - let max_page_len = self.layers.get(layer_id).unwrap().max_page_len; - eids.iter() - .zip(srcs.iter().zip(dsts.iter())) - .zip(time) - .enumerate() - .fold( - (t_col_offset, time_sec_index), - |(t_col_offset, time_sec_index), (i, ((eid, (src, dst)), time))| { - if mask.value(i) { - let (_, local_pos) = resolve_pos(*eid, max_page_len); - let row = self.reserve_local_row(local_pos, *src, *dst, layer_id); - let mut prop = self.layers[layer_id].properties_mut().get_mut_entry(row); - prop.ensure_times_from_props(); - prop.set_time(TimeIndexEntry(*time, time_sec_index), t_col_offset); - (t_col_offset + 1, time_sec_index + 1) - } else { - (t_col_offset, time_sec_index) - } - }, - ); - - let props = self.layers[layer_id].properties_mut(); - - // ensure the columns are present - for prop_id in col_mapping { - props.t_properties_mut().ensure_column(*prop_id); - } - - for (prop_id, col) in col_mapping.iter().zip(cols) { - let column = props.t_column_mut(*prop_id).unwrap(); - column.append(col, mask); - } - - props.reset_t_len(); - - let layer_est_size = self.layers[layer_id].est_size(); - self.est_size += layer_est_size.saturating_sub(est_size); - } - pub fn insert_edge_internal( &mut self, t: T, @@ -644,175 +589,6 @@ mod test { assert_eq!(segment.t_len(), 3); } - #[test] - fn test_bulk_insert_edges_internal_basic() { - let mut segment = create_test_segment(); - - // Prepare bulk insert data - let mask = BooleanArray::from(vec![true, 
true, true]); - let times = vec![1i64, 2i64, 3i64]; - let eids = vec![EID(0), EID(1), EID(2)]; - let srcs = vec![VID(1), VID(3), VID(5)]; - let dsts = vec![VID(2), VID(4), VID(6)]; - let cols: Vec> = - vec![Arc::new(StringArray::from(vec!["test1", "test2", "test3"]))]; - let col_mapping = vec![0]; // property id 0 - - // Bulk insert edges - segment.bulk_insert_edges_internal( - &mask, - ×, - 0, // time_sec_index - &eids, - &srcs, - &dsts, - 0, // layer_id - &cols, - &col_mapping, - ); - - // Verify edges exist - assert!(segment.contains_edge(LocalPOS(0), 0)); - assert!(segment.contains_edge(LocalPOS(1), 0)); - assert!(segment.contains_edge(LocalPOS(2), 0)); - - // Verify edge data - assert_eq!(segment.get_edge(LocalPOS(0), 0), Some((VID(1), VID(2)))); - assert_eq!(segment.get_edge(LocalPOS(1), 0), Some((VID(3), VID(4)))); - assert_eq!(segment.get_edge(LocalPOS(2), 0), Some((VID(5), VID(6)))); - - // Verify time length increased - assert_eq!(segment.t_len(), 3); - - for (index, local_pos) in [LocalPOS(0), LocalPOS(1), LocalPOS(2)].iter().enumerate() { - let actual = segment.layers[0] - .t_prop(*local_pos, 0) - .into_iter() - .flat_map(|p| p.iter()) - .collect::>(); - - let i = local_pos.0 as i64; - assert_eq!( - actual, - vec![( - TimeIndexEntry::new(i + 1, index), - Prop::str(format!("test{}", i + 1)) - )] - ); - } - } - - #[test] - fn test_bulk_insert_with_mask() { - let mut segment = create_test_segment(); - - // Prepare bulk insert data with selective mask - let mask = BooleanArray::from(vec![true, false, true, false]); - let times = vec![1i64, 2i64, 3i64, 4i64]; - let eids = vec![EID(0), EID(1), EID(2), EID(3)]; - let srcs = vec![VID(1), VID(3), VID(5), VID(7)]; - let dsts = vec![VID(2), VID(4), VID(6), VID(8)]; - let cols: Vec> = vec![Arc::new(StringArray::from(vec![ - "test1", "test2", "test3", "test4", - ]))]; - let col_mapping = vec![0]; - - // Bulk insert edges - segment.bulk_insert_edges_internal( - &mask, - ×, - 0, - &eids, - &srcs, - &dsts, - 0, - 
&cols, - &col_mapping, - ); - - // Only edges at positions 0 and 2 should exist (mask was true) - assert!(segment.contains_edge(LocalPOS(0), 0)); - assert!(!segment.contains_edge(LocalPOS(1), 0)); - assert!(segment.contains_edge(LocalPOS(2), 0)); - assert!(!segment.contains_edge(LocalPOS(3), 0)); - - // Verify correct edge data for existing edges - assert_eq!(segment.get_edge(LocalPOS(0), 0), Some((VID(1), VID(2)))); - assert_eq!(segment.get_edge(LocalPOS(2), 0), Some((VID(5), VID(6)))); - - // Only 2 edges should contribute to time length - assert_eq!(segment.t_len(), 2); - } - - #[test] - fn test_bulk_vs_individual_equivalence() { - let mut segment1 = create_test_segment(); - let mut segment2 = create_test_segment(); - - // Individual insertions - segment1.insert_edge_internal( - TimeIndexEntry::new(1, 0), - LocalPOS(0), - VID(1), - VID(2), - 0, - vec![(0, Prop::from("test1"))], - 1, - ); - segment1.insert_edge_internal( - TimeIndexEntry::new(2, 1), - LocalPOS(1), - VID(3), - VID(4), - 0, - vec![(0, Prop::from("test2"))], - 1, - ); - segment1.insert_edge_internal( - TimeIndexEntry::new(3, 2), - LocalPOS(2), - VID(5), - VID(6), - 0, - vec![(0, Prop::from("test3"))], - 1, - ); - - // Equivalent bulk insertion - let mask = BooleanArray::from(vec![true, true, true]); - let times = vec![1i64, 2i64, 3i64]; - let eids = vec![EID(0), EID(1), EID(2)]; - let srcs = vec![VID(1), VID(3), VID(5)]; - let dsts = vec![VID(2), VID(4), VID(6)]; - let cols: Vec> = - vec![Arc::new(StringArray::from(vec!["test1", "test2", "test3"]))]; - let col_mapping = vec![0]; - - segment2.bulk_insert_edges_internal( - &mask, - ×, - 0, - &eids, - &srcs, - &dsts, - 0, - &cols, - &col_mapping, - ); - - // Both segments should have the same edges - for pos in [LocalPOS(0), LocalPOS(1), LocalPOS(2)] { - assert_eq!( - segment1.contains_edge(pos, 0), - segment2.contains_edge(pos, 0) - ); - assert_eq!(segment1.get_edge(pos, 0), segment2.get_edge(pos, 0)); - } - - // Both should have same time length - 
assert_eq!(segment1.t_len(), segment2.t_len()); - } - #[test] fn test_interleaved_operations() { let mut segment = create_test_segment(); diff --git a/raphtory-core/src/storage/mod.rs b/raphtory-core/src/storage/mod.rs index 8807dcf115..d1673769d6 100644 --- a/raphtory-core/src/storage/mod.rs +++ b/raphtory-core/src/storage/mod.rs @@ -3,6 +3,7 @@ use crate::{ storage::lazy_vec::IllegalSet, }; use arrow_array::{ + builder::StringViewBuilder, cast::AsArray, types::{ Float32Type, Float64Type, Int32Type, Int64Type, UInt16Type, UInt32Type, UInt64Type, @@ -29,7 +30,7 @@ pub mod lazy_vec; pub mod locked_view; pub mod timeindex; -#[derive(Debug, Serialize, Deserialize, PartialEq, Default)] +#[derive(Debug, Default)] pub struct TColumns { t_props_log: Vec, num_rows: usize, @@ -128,7 +129,7 @@ impl TColumns { } } -#[derive(Debug, Serialize, Deserialize, PartialEq)] +#[derive(Debug)] pub enum PropColumn { Empty(usize), Bool(LazyVec), @@ -233,63 +234,6 @@ impl PropColumn { } } - pub fn append(&mut self, col: &dyn Array, mask: &BooleanArray) { - self.init_from_prop_type(col.data_type()); - match self { - PropColumn::Bool(v) => v.append(col.as_boolean(), mask), - PropColumn::I64(v) => v.append(col.as_primitive::(), mask), - PropColumn::U32(v) => v.append(col.as_primitive::(), mask), - PropColumn::U64(v) => v.append(col.as_primitive::(), mask), - PropColumn::F32(v) => v.append(col.as_primitive::(), mask), - PropColumn::F64(v) => v.append(col.as_primitive::(), mask), - PropColumn::Str(v) => { - let iter = col - .as_string_opt::() - .map(|iter| Either::Left(iter.into_iter())) - .or_else(|| { - col.as_string_opt::() - .map(|iter| Either::Right(iter.into_iter())) - }) - .expect("Failed to cast to StringArray"); - v.append(iter.map(|opt| opt.map(ArcStr::from)), mask) - } - PropColumn::U8(v) => v.append(col.as_primitive::(), mask), - PropColumn::U16(v) => v.append(col.as_primitive::(), mask), - PropColumn::I32(v) => v.append(col.as_primitive::(), mask), - PropColumn::NDTime(v) => 
v.append( - col.as_any() - .downcast_ref::() - .expect("Failed to cast to Timestamp") - .iter() - .map(|value| DateTime::from_timestamp_millis(value?).map(|dt| dt.naive_utc())), - mask, - ), - PropColumn::DTime(v) => v.append( - col.as_any() - .downcast_ref::() - .expect("failed to cast to Timestamp") - .iter() - .map(|value| DateTime::from_timestamp_millis(value?)), - mask, - ), - PropColumn::Decimal(v) => v.append( - // this needs a review if it actually works - col.as_any() - .downcast_ref::() - .expect("Failed to cast to Timestamp") - .iter() - .map(|bd| bd.map(BigDecimal::from)), - mask, - ), - // PropColumn::List(v) => v.append(col, mask), - // PropColumn::Map(v) => v.append(col, mask), - // - // PropColumn::Array(v) => v.append(col, mask), - // PropColumn::Empty(_) => {} - _ => { /* ignore unsupported types for now */ } - } - } - pub fn upsert(&mut self, index: usize, prop: Prop) -> Result<(), TPropColumnError> { self.init_empty_col(&prop); match (self, prop) { diff --git a/raphtory/src/io/arrow/dataframe.rs b/raphtory/src/io/arrow/dataframe.rs index 644b6f3d3f..ecf67d0d9b 100644 --- a/raphtory/src/io/arrow/dataframe.rs +++ b/raphtory/src/io/arrow/dataframe.rs @@ -47,12 +47,14 @@ where } pub(crate) fn get_index(&self, name: &str) -> Result { - self.names - .iter() - .position(|n| n == name) + self.get_index_opt(name) .ok_or_else(|| GraphError::ColumnDoesNotExist(name.to_string())) } + pub(crate) fn get_index_opt(&self, name: &str) -> Option { + self.names.iter().position(|n| n == name) + } + pub fn is_empty(&self) -> bool { self.num_rows == 0 } diff --git a/raphtory/src/io/arrow/df_loaders/edges.rs b/raphtory/src/io/arrow/df_loaders/edges.rs index 92bdd6cef3..57e70edb75 100644 --- a/raphtory/src/io/arrow/df_loaders/edges.rs +++ b/raphtory/src/io/arrow/df_loaders/edges.rs @@ -11,6 +11,10 @@ use crate::{ }, prelude::*, }; +use arrow::{ + array::{AsArray, PrimitiveArray}, + datatypes::UInt64Type, +}; use bytemuck::checked::cast_slice_mut; use 
db4_graph::WriteLockedGraph; use itertools::izip; @@ -42,24 +46,39 @@ use storage::{ Extension, }; +#[derive(Debug, Copy, Clone)] +pub struct ColumnNames<'a> { + pub time: &'a str, + pub secondary_index: Option<&'a str>, + pub src: &'a str, + pub dst: &'a str, + pub edge_id: Option<&'a str>, + pub layer_col: Option<&'a str>, +} + #[allow(clippy::too_many_arguments)] pub fn load_edges_from_df( df_view: DFView> + Send>, - time: &str, - secondary_index: Option<&str>, - src: &str, - dst: &str, + column_names: ColumnNames, properties: &[&str], metadata: &[&str], shared_metadata: Option<&HashMap>, layer: Option<&str>, - layer_col: Option<&str>, graph: &G, ) -> Result<(), GraphError> { if df_view.is_empty() { return Ok(()); } + let ColumnNames { + time, + secondary_index, + src, + dst, + edge_id, + layer_col, + } = column_names; + let properties_indices = properties .iter() .map(|name| df_view.get_index(name)) @@ -72,6 +91,7 @@ pub fn load_edges_from_df = vec![]; // exists or needs to be created let mut layer_eids_exist: Vec = vec![]; // exists or needs to be created + let resolve_ids = true; // todo add this to function params + rayon::scope(|s| { let (tx, rx) = mpsc::sync_channel(2); @@ -120,24 +142,24 @@ pub fn load_edges_from_df( + let layer = lift_layer_col(layer, layer_index, &df)?; + let layer_col_resolved = layer.resolve(graph)?; + + let (src_vids, dst_vids, gid_str_cache) = get_or_resolve_node_vids( graph, - [(&src_col, atomic_src_col), (&dst_col, atomic_dst_col)].as_ref(), + src_index, + dst_index, + &mut src_col_resolved, + &mut dst_col_resolved, + resolve_ids, + &df, + &src_col, + &dst_col, )?; let time_col = df.time_col(time_index)?; @@ -159,42 +181,61 @@ pub fn load_edges_from_df()? + .values() + .as_ref(), + ) + }); + // Generate all edge_ids + add outbound edges nodes.par_iter_mut().for_each(|locked_page| { // Zip all columns for iteration. 
let zip = izip!( - src_col_resolved.iter(), - dst_col_resolved.iter(), + src_vids.iter(), + dst_vids.iter(), time_col.iter(), secondary_index_col.iter(), layer_col_resolved.iter() ); - store_node_ids(&gid_str_cache, locked_page); + if resolve_ids { + store_node_ids(&gid_str_cache, locked_page); + } - add_and_resolve_outbound_edges( - &eids_exist, - &layer_eids_exist, - &eid_col_shared, - next_edge_id, - edges, - locked_page, - zip, - ); + if resolve_ids { + add_and_resolve_outbound_edges( + &eids_exist, + &layer_eids_exist, + &eid_col_shared, + next_edge_id, + edges, + locked_page, + zip, + ); + } else if let Some(edge_ids) = eids { + add_and_resolve_outbound_edges( + &eids_exist, + &layer_eids_exist, + &eid_col_shared, + |row| EID(edge_ids[row] as usize), + edges, + locked_page, + zip, + ); + } }); write_locked_graph.nodes.par_iter_mut().for_each(|shard| { let zip = izip!( - src_col_resolved.iter(), - dst_col_resolved.iter(), + src_vids.iter(), + dst_vids.iter(), eid_col_resolved.iter(), time_col.iter(), secondary_index_col.iter(), @@ -212,8 +253,8 @@ pub fn load_edges_from_df( + graph: &G, + src_index: usize, + dst_index: usize, + src_col_resolved: &'a mut Vec, + dst_col_resolved: &'a mut Vec, + resolve_nodes: bool, + df: &'b DFChunk, + src_col: &'a NodeCol, + dst_col: &'a NodeCol, +) -> Result< + ( + &'c [VID], + &'c [VID], + FxDashMap, (Prop, MaybeNew)>, + ), + GraphError, +> { + let (src_vids, dst_vids, gid_str_cache) = if resolve_nodes { + src_col_resolved.resize_with(df.len(), Default::default); + dst_col_resolved.resize_with(df.len(), Default::default); + + let atomic_src_col = atomic_vid_from_mut_slice(src_col_resolved); + let atomic_dst_col = atomic_vid_from_mut_slice(dst_col_resolved); + + let gid_str_cache = resolve_nodes_with_cache::( + graph, + [(src_col, atomic_src_col), (dst_col, atomic_dst_col)].as_ref(), + )?; + ( + src_col_resolved.as_slice(), + dst_col_resolved.as_slice(), + gid_str_cache, + ) + } else { + let srcs = df.chunk[src_index] + 
.as_primitive_opt::() + .ok_or_else(|| LoadError::InvalidNodeIdType(df.chunk[src_index].data_type().clone()))? + .values() + .as_ref(); + let dsts = df.chunk[dst_index] + .as_primitive_opt::() + .ok_or_else(|| LoadError::InvalidNodeIdType(df.chunk[dst_index].data_type().clone()))? + .values() + .as_ref(); + ( + bytemuck::cast_slice(srcs), + bytemuck::cast_slice(dsts), + FxDashMap::default(), + ) + }; + Ok((src_vids, dst_vids, gid_str_cache)) +} + #[inline(never)] fn update_edge_properties<'a, ES: EdgeSegmentOps>( shared_metadata: &[(usize, Prop)], diff --git a/raphtory/src/io/arrow/mod.rs b/raphtory/src/io/arrow/mod.rs index 9186b21f31..9e7d5b103c 100644 --- a/raphtory/src/io/arrow/mod.rs +++ b/raphtory/src/io/arrow/mod.rs @@ -9,7 +9,7 @@ mod test { use crate::{ io::arrow::{ dataframe::{DFChunk, DFView}, - df_loaders::*, + df_loaders::{edges::load_edges_from_df, *}, }, prelude::*, }; diff --git a/raphtory/src/io/parquet_loaders.rs b/raphtory/src/io/parquet_loaders.rs index c5ebe7fe38..ce7a87ce5b 100644 --- a/raphtory/src/io/parquet_loaders.rs +++ b/raphtory/src/io/parquet_loaders.rs @@ -3,7 +3,10 @@ use crate::{ errors::{GraphError, InvalidPathReason::PathDoesNotExist}, io::arrow::{ dataframe::*, - df_loaders::{edges::load_edges_from_df, *}, + df_loaders::{ + edges::{load_edges_from_df, ColumnNames}, + *, + }, }, prelude::{AdditionOps, DeletionOps, PropertyAdditionOps}, }; @@ -130,15 +133,18 @@ pub fn load_edges_from_parquet Serialize for ParquetTEdge<'a, G> { state.serialize_entry(TIME_COL, &t.0)?; state.serialize_entry(SECONDARY_INDEX_COL, &t.1)?; - state.serialize_entry(SRC_COL, &ParquetGID(edge.src().id()))?; - state.serialize_entry(DST_COL, &ParquetGID(edge.dst().id()))?; + state.serialize_entry(SRC_COL_ID, &edge.src().node.0)?; + state.serialize_entry(DST_COL_ID, &edge.dst().node.0)?; + state.serialize_entry(EDGE_COL_ID, &edge.edge.pid())?; state.serialize_entry(LAYER_COL, &layer)?; for (name, prop) in edge.properties().temporal().iter_latest() { diff --git 
a/raphtory/tests/df_loaders.rs b/raphtory/tests/df_loaders.rs index 5066aa004d..789b64e82a 100644 --- a/raphtory/tests/df_loaders.rs +++ b/raphtory/tests/df_loaders.rs @@ -10,7 +10,7 @@ mod io_tests { errors::GraphError, io::arrow::{ dataframe::{DFChunk, DFView}, - df_loaders::{load_edges_from_df, load_nodes_from_df}, + df_loaders::{edges::load_edges_from_df, load_nodes_from_df}, }, prelude::*, test_utils::{build_edge_list, build_edge_list_str, build_edge_list_with_secondary_index}, From 51bb16ec47dafb70d50a8d5d32167b1e34a8f3bb Mon Sep 17 00:00:00 2001 From: Fabian Murariu Date: Fri, 12 Dec 2025 13:04:24 +0000 Subject: [PATCH 16/24] load_edges_from_df supports non resolution variant --- db4-storage/src/pages/mod.rs | 46 +++++- db4-storage/src/segments/edge/segment.rs | 147 ------------------ raphtory/src/io/arrow/df_loaders/edges.rs | 9 +- raphtory/src/io/arrow/mod.rs | 19 ++- raphtory/src/io/parquet_loaders.rs | 26 ++-- raphtory/src/python/graph/graph.rs | 16 +- .../src/python/graph/graph_with_deletions.rs | 16 +- .../src/python/graph/io/pandas_loaders.rs | 1 + raphtory/src/serialise/parquet/edges.rs | 1 - raphtory/src/serialise/parquet/mod.rs | 24 ++- raphtory/tests/df_loaders.rs | 89 +++++++---- 11 files changed, 172 insertions(+), 222 deletions(-) diff --git a/db4-storage/src/pages/mod.rs b/db4-storage/src/pages/mod.rs index ed2fdd9ff7..8a74a1879a 100644 --- a/db4-storage/src/pages/mod.rs +++ b/db4-storage/src/pages/mod.rs @@ -524,9 +524,30 @@ fn read_graph_config( #[inline(always)] pub fn resolve_pos>(i: I, max_page_len: u32) -> (usize, LocalPOS) { let i = i.into(); - let chunk = i / max_page_len as usize; + let seg = i / max_page_len as usize; let pos = i % max_page_len as usize; - (chunk, LocalPOS(pos as u32)) + (seg, LocalPOS(pos as u32)) +} + +fn gen_interleave( + chunk_size: usize, + num_segments: usize, + max_seg_len: u32, +) -> impl ParallelIterator)> { + let chunk_size = (chunk_size / num_segments).max(1); + (0..max_seg_len as usize) + .into_par_iter() 
+ .chunks(chunk_size) + .enumerate() + .map(move |(chunk_id, items)| { + let iter = items.into_iter().flat_map(move |x| { + (0..num_segments).map(move |seg| -> usize { + // clamp this by the largest local pos in the segment + seg * max_seg_len as usize + x + }) + }); + (chunk_id, iter) + }) } #[cfg(test)] @@ -545,6 +566,27 @@ mod test { use proptest::prelude::*; use raphtory_api::core::entities::properties::prop::Prop; use raphtory_core::{entities::VID, storage::timeindex::TimeIndexOps}; + use rayon::iter::ParallelIterator; + + #[test] + fn test_iterleave() { + let chunk_size = 3; + let num_segments = 3; + let max_seg_len = 4; + + let actual = super::gen_interleave(chunk_size, num_segments, max_seg_len) + .map(|(c, items)| (c, items.collect::>())) + .collect::>(); + + let expected = vec![ + (0, vec![0, 4, 8]), + (1, vec![1, 5, 9]), + (2, vec![2, 6, 10]), + (3, vec![3, 7, 11]), + ]; + + assert_eq!(actual, expected); + } fn check_edges(edges: Vec<(impl Into, impl Into)>, chunk_size: u32, par_load: bool) { // Set optional layer_id to None diff --git a/db4-storage/src/segments/edge/segment.rs b/db4-storage/src/segments/edge/segment.rs index a84106ee59..164c9fb05f 100644 --- a/db4-storage/src/segments/edge/segment.rs +++ b/db4-storage/src/segments/edge/segment.rs @@ -589,153 +589,6 @@ mod test { assert_eq!(segment.t_len(), 3); } - #[test] - fn test_interleaved_operations() { - let mut segment = create_test_segment(); - - // Start with individual insertion - segment.insert_edge_internal( - TimeIndexEntry::new(1, 0), - LocalPOS(0), - VID(1), - VID(2), - 0, - vec![(0, Prop::from("individual1"))], - 1, - ); - - // Bulk insert some edges - let mask = BooleanArray::from(vec![true, true]); - let times = vec![2i64, 3i64]; - let eids = vec![EID(1), EID(2)]; - let srcs = vec![VID(3), VID(5)]; - let dsts = vec![VID(4), VID(6)]; - let cols: Vec> = vec![Arc::new(StringArray::from(vec!["bulk1", "bulk2"]))]; - let col_mapping = vec![0]; - - segment.bulk_insert_edges_internal( - &mask, 
- ×, - 1, // time_sec_index continues from previous - &eids, - &srcs, - &dsts, - 0, - &cols, - &col_mapping, - ); - - // Insert another individual edge - segment.insert_edge_internal( - TimeIndexEntry::new(4, 3), - LocalPOS(3), - VID(7), - VID(8), - 0, - vec![(0, Prop::from("individual2"))], - 1, - ); - - // Another bulk insert - let mask2 = BooleanArray::from(vec![true, false, true]); - let times2 = vec![5i64, 6i64, 7i64]; - let eids2 = vec![EID(4), EID(5), EID(6)]; - let srcs2 = vec![VID(9), VID(11), VID(13)]; - let dsts2 = vec![VID(10), VID(12), VID(14)]; - let cols2: Vec> = - vec![Arc::new(StringArray::from(vec!["bulk3", "bulk4", "bulk5"]))]; - - segment.bulk_insert_edges_internal( - &mask2, - ×2, - 4, // time_sec_index continues - &eids2, - &srcs2, - &dsts2, - 0, - &cols2, - &col_mapping, - ); - - // Verify all edges exist correctly - assert!(segment.contains_edge(LocalPOS(0), 0)); // individual1 - assert!(segment.contains_edge(LocalPOS(1), 0)); // bulk1 - assert!(segment.contains_edge(LocalPOS(2), 0)); // bulk2 - assert!(segment.contains_edge(LocalPOS(3), 0)); // individual2 - assert!(segment.contains_edge(LocalPOS(4), 0)); // bulk3 - assert!(!segment.contains_edge(LocalPOS(5), 0)); // masked out - assert!(segment.contains_edge(LocalPOS(6), 0)); // bulk5 - - // Verify edge data - assert_eq!(segment.get_edge(LocalPOS(0), 0), Some((VID(1), VID(2)))); - assert_eq!(segment.get_edge(LocalPOS(1), 0), Some((VID(3), VID(4)))); - assert_eq!(segment.get_edge(LocalPOS(2), 0), Some((VID(5), VID(6)))); - assert_eq!(segment.get_edge(LocalPOS(3), 0), Some((VID(7), VID(8)))); - assert_eq!(segment.get_edge(LocalPOS(4), 0), Some((VID(9), VID(10)))); - assert_eq!(segment.get_edge(LocalPOS(6), 0), Some((VID(13), VID(14)))); - - // Total time length should be 6 (4 individual + 2 from first bulk + 2 from second bulk) - assert_eq!(segment.t_len(), 6); - } - - #[test] - fn test_bulk_insert_multiple_layers() { - let mut segment = create_test_segment(); - - // Insert into layer 0 - 
let mask = BooleanArray::from(vec![true, true]); - let times = vec![1i64, 2i64]; - let eids = vec![EID(0), EID(1)]; - let srcs = vec![VID(1), VID(3)]; - let dsts = vec![VID(2), VID(4)]; - let cols: Vec> = - vec![Arc::new(StringArray::from(vec!["layer0_1", "layer0_2"]))]; - let col_mapping = vec![0]; - - segment.bulk_insert_edges_internal( - &mask, - ×, - 0, - &eids, - &srcs, - &dsts, - 0, // layer 0 - &cols, - &col_mapping, - ); - - // Insert into layer 1 - let mask2 = BooleanArray::from(vec![true]); - let times2 = vec![3i64]; - let eids2 = vec![EID(0)]; // same eid, different layer - let srcs2 = vec![VID(5)]; - let dsts2 = vec![VID(6)]; - let cols2: Vec> = vec![Arc::new(StringArray::from(vec!["layer1_1"]))]; - - segment.bulk_insert_edges_internal( - &mask2, - ×2, - 2, - &eids2, - &srcs2, - &dsts2, - 1, // layer 1 - &cols2, - &col_mapping, - ); - - // Verify edges in both layers - assert!(segment.contains_edge(LocalPOS(0), 0)); - assert!(segment.contains_edge(LocalPOS(1), 0)); - assert!(segment.contains_edge(LocalPOS(0), 1)); - assert!(!segment.contains_edge(LocalPOS(1), 1)); - - // Verify correct layer data - assert_eq!(segment.get_edge(LocalPOS(0), 0), Some((VID(1), VID(2)))); - assert_eq!(segment.get_edge(LocalPOS(1), 0), Some((VID(3), VID(4)))); - assert_eq!(segment.get_edge(LocalPOS(0), 1), Some((VID(5), VID(6)))); - } - #[test] fn est_size_changes() { use super::*; diff --git a/raphtory/src/io/arrow/df_loaders/edges.rs b/raphtory/src/io/arrow/df_loaders/edges.rs index 57e70edb75..17c252100c 100644 --- a/raphtory/src/io/arrow/df_loaders/edges.rs +++ b/raphtory/src/io/arrow/df_loaders/edges.rs @@ -60,6 +60,7 @@ pub struct ColumnNames<'a> { pub fn load_edges_from_df( df_view: DFView> + Send>, column_names: ColumnNames, + resolve_nodes: bool, properties: &[&str], metadata: &[&str], shared_metadata: Option<&HashMap>, @@ -116,8 +117,6 @@ pub fn load_edges_from_df = vec![]; // exists or needs to be created let mut layer_eids_exist: Vec = vec![]; // exists or needs 
to be created - let resolve_ids = true; // todo add this to function params - rayon::scope(|s| { let (tx, rx) = mpsc::sync_channel(2); @@ -156,7 +155,7 @@ pub fn load_edges_from_df( graph: &G, parquet_path: impl AsRef, - time: &str, - secondary_index: Option<&str>, - src: &str, - dst: &str, + column_names: ColumnNames, + resolve_nodes: bool, properties: &[&str], metadata: &[&str], shared_metadata: Option<&HashMap>, layer: Option<&str>, - layer_col: Option<&str>, batch_size: Option, ) -> Result<(), GraphError> { + let ColumnNames { + time, + secondary_index, + src, + dst, + layer_col, + .. + } = column_names; + let parquet_path = parquet_path.as_ref(); let mut cols_to_check = vec![src, dst, time]; cols_to_check.extend_from_slice(properties); @@ -133,14 +139,8 @@ pub fn load_edges_from_parquet>(); for edge_rows in edges .into_iter() diff --git a/raphtory/src/serialise/parquet/mod.rs b/raphtory/src/serialise/parquet/mod.rs index 4a3a1d81df..17c66826d7 100644 --- a/raphtory/src/serialise/parquet/mod.rs +++ b/raphtory/src/serialise/parquet/mod.rs @@ -4,9 +4,13 @@ use crate::{ graph::views::deletion_graph::PersistentGraph, }, errors::GraphError, - io::parquet_loaders::{ - load_edge_deletions_from_parquet, load_edge_props_from_parquet, load_edges_from_parquet, - load_graph_props_from_parquet, load_node_props_from_parquet, load_nodes_from_parquet, + io::{ + arrow::df_loaders::edges::ColumnNames, + parquet_loaders::{ + load_edge_deletions_from_parquet, load_edge_props_from_parquet, + load_edges_from_parquet, load_graph_props_from_parquet, load_node_props_from_parquet, + load_nodes_from_parquet, + }, }, prelude::*, serialise::{ @@ -547,15 +551,19 @@ fn decode_graph_storage( load_edges_from_parquet( &graph, &t_edge_path, - TIME_COL, - Some(SECONDARY_INDEX_COL), - SRC_COL_ID, - DST_COL_ID, + ColumnNames { + time: TIME_COL, + secondary_index: Some(SECONDARY_INDEX_COL), + src: SRC_COL_ID, + dst: DST_COL_ID, + layer_col: Some(LAYER_COL), + edge_id: None, + }, + false, 
&t_prop_columns, &[], None, None, - Some(LAYER_COL), batch_size, )?; } diff --git a/raphtory/tests/df_loaders.rs b/raphtory/tests/df_loaders.rs index 789b64e82a..088c63561c 100644 --- a/raphtory/tests/df_loaders.rs +++ b/raphtory/tests/df_loaders.rs @@ -10,7 +10,10 @@ mod io_tests { errors::GraphError, io::arrow::{ dataframe::{DFChunk, DFView}, - df_loaders::{edges::load_edges_from_df, load_nodes_from_df}, + df_loaders::{ + edges::{load_edges_from_df, ColumnNames}, + load_nodes_from_df, + }, }, prelude::*, test_utils::{build_edge_list, build_edge_list_str, build_edge_list_with_secondary_index}, @@ -208,7 +211,17 @@ mod io_tests { let g = Graph::new(); let props = ["str_prop", "int_prop"]; let secondary_index = None; - load_edges_from_df(df_view, "time", secondary_index,"src", "dst", &props, &[], None, None, None, &g).unwrap(); + load_edges_from_df(df_view, + ColumnNames { + time: "time", + secondary_index, + src: "src", + dst: "dst", + edge_id: None, + layer_col: None, + }, + true, + &props, &[], None, None, &g).unwrap(); let g2 = Graph::new(); @@ -243,15 +256,19 @@ mod io_tests { load_edges_from_df( df_view, - "time", - secondary_index, - "src", - "dst", + ColumnNames { + time: "time", + secondary_index, + src: "src", + dst: "dst", + edge_id: None, + layer_col: None, + }, + true, &props, &[], None, None, - None, &g, ) .unwrap(); @@ -286,7 +303,7 @@ mod io_tests { let g = Graph::new(); let props = ["str_prop", "int_prop"]; let secondary_index = None; - load_edges_from_df(df_view, "time", secondary_index, "src", "dst", &props, &[], None, None, None, &g).unwrap(); + load_edges_from_df(df_view, ColumnNames {time: "time", secondary_index, src: "src", dst: "dst", edge_id: None, layer_col:None},true, &props, &[], None, None, &g).unwrap(); let g2 = Graph::new(); @@ -310,15 +327,19 @@ mod io_tests { load_edges_from_df( df_view, - "time", - secondary_index, - "src", - "dst", + ColumnNames { + time: "time", + secondary_index, + src: "src", + dst: "dst", + edge_id: None, + 
layer_col: None, + }, + true, &props, &[], None, None, - None, &g, ) .unwrap(); @@ -351,15 +372,19 @@ mod io_tests { // Load edges from DataFrame with secondary_index load_edges_from_df( df_view, - "time", - secondary_index, - "src", - "dst", + ColumnNames { + time: "time", + secondary_index, + src: "src", + dst: "dst", + edge_id: None, + layer_col: None, + }, + true, &props, &[], None, None, - None, &g, ) .unwrap(); @@ -412,15 +437,19 @@ mod io_tests { load_edges_from_df( df_view, - "time", - secondary_index, - "src", - "dst", + ColumnNames { + time: "time", + secondary_index, + src: "src", + dst: "dst", + edge_id: None, + layer_col: None, + }, + true, &props, &[], None, None, - None, &g, ).unwrap(); @@ -562,15 +591,19 @@ mod io_tests { let secondary_index = None; load_edges_from_df( df_view, - "time", - secondary_index, - "src", - "dst", + ColumnNames { + time: "time", + secondary_index, + src: "src", + dst: "dst", + edge_id: None, + layer_col: None, + }, + true, &props, &[], None, layer.as_deref(), - None, &g, ) .unwrap(); From 991378b0eaafc7defe0e2859259a04a74602f4f3 Mon Sep 17 00:00:00 2001 From: Fabian Murariu Date: Fri, 12 Dec 2025 16:02:18 +0000 Subject: [PATCH 17/24] can encode with stable id, decode is broken --- db4-storage/src/pages/edge_store.rs | 21 +++++++++++ db4-storage/src/pages/mod.rs | 31 ++++++++-------- db4-storage/src/pages/node_store.rs | 18 +++++++++ raphtory-storage/src/graph/edges/edges.rs | 13 +------ raphtory-storage/src/graph/nodes/nodes_ref.rs | 8 ++++ raphtory/src/serialise/graph_folder.rs | 6 +++ raphtory/src/serialise/parquet/edges.rs | 14 ++++--- raphtory/src/serialise/parquet/mod.rs | 1 + raphtory/src/serialise/parquet/model.rs | 14 ++++--- raphtory/src/serialise/parquet/nodes.rs | 37 +++++++++++-------- 10 files changed, 109 insertions(+), 54 deletions(-) diff --git a/db4-storage/src/pages/edge_store.rs b/db4-storage/src/pages/edge_store.rs index e321a0d068..434f7da705 100644 --- a/db4-storage/src/pages/edge_store.rs +++ 
b/db4-storage/src/pages/edge_store.rs @@ -13,6 +13,7 @@ use crate::{ SegmentCounts, layer_counter::GraphStats, locked::edges::{LockedEdgePage, WriteLockedEdgePages}, + row_group_par_iter, }, persist::strategy::Config, segments::edge::segment::MemEdgeSegment, @@ -580,6 +581,26 @@ impl, EXT: Config> EdgeStorageInner }) } + pub fn row_groups_par_iter( + &self, + ) -> impl IndexedParallelIterator + '_)> { + row_group_par_iter( + self.max_page_len() as usize, + self.segments.count(), + self.max_page_len(), + ) + .map(|(s_id, iter)| (s_id, iter.filter(|eid| self.has_eid(*eid)))) + } + + fn has_eid(&self, eid: EID) -> bool { + let (segment_id, pos) = self.resolve_pos(eid); + segment_id < self.segments.count() + && self + .segments + .get(segment_id) + .is_some_and(|s| pos.0 < s.num_edges()) + } + pub(crate) fn segment_counts(&self) -> SegmentCounts { SegmentCounts::new( self.max_page_len(), diff --git a/db4-storage/src/pages/mod.rs b/db4-storage/src/pages/mod.rs index 8a74a1879a..ee8195d0d0 100644 --- a/db4-storage/src/pages/mod.rs +++ b/db4-storage/src/pages/mod.rs @@ -529,25 +529,24 @@ pub fn resolve_pos>(i: I, max_page_len: u32) -> (usize, Lo (seg, LocalPOS(pos as u32)) } -fn gen_interleave( +pub fn row_group_par_iter>( chunk_size: usize, num_segments: usize, max_seg_len: u32, -) -> impl ParallelIterator)> { +) -> impl IndexedParallelIterator)> { let chunk_size = (chunk_size / num_segments).max(1); - (0..max_seg_len as usize) - .into_par_iter() - .chunks(chunk_size) - .enumerate() - .map(move |(chunk_id, items)| { - let iter = items.into_iter().flat_map(move |x| { - (0..num_segments).map(move |seg| -> usize { - // clamp this by the largest local pos in the segment - seg * max_seg_len as usize + x - }) - }); - (chunk_id, iter) - }) + let num_chunks = (max_seg_len as usize + chunk_size - 1) / chunk_size; + + (0..num_chunks).into_par_iter().map(move |chunk_id| { + let start = chunk_id * chunk_size; + let end = ((chunk_id + 1) * chunk_size).min(max_seg_len as usize); + + 
let iter = (start..end).flat_map(move |x| { + (0..num_segments).map(move |seg| I::from(seg * max_seg_len as usize + x)) + }); + + (chunk_id, iter) + }) } #[cfg(test)] @@ -574,7 +573,7 @@ mod test { let num_segments = 3; let max_seg_len = 4; - let actual = super::gen_interleave(chunk_size, num_segments, max_seg_len) + let actual = super::row_group_par_iter(chunk_size, num_segments, max_seg_len) .map(|(c, items)| (c, items.collect::>())) .collect::>(); diff --git a/db4-storage/src/pages/node_store.rs b/db4-storage/src/pages/node_store.rs index b22a313e38..e39601218c 100644 --- a/db4-storage/src/pages/node_store.rs +++ b/db4-storage/src/pages/node_store.rs @@ -7,6 +7,7 @@ use crate::{ SegmentCounts, layer_counter::GraphStats, locked::nodes::{LockedNodePage, WriteLockedNodeSegments}, + row_group_par_iter, }, persist::strategy::Config, segments::node::segment::MemNodeSegment, @@ -90,6 +91,23 @@ impl, EXT: Config> ReadLockedNodeStorage impl IndexedParallelIterator + '_)> { + row_group_par_iter( + self.storage.max_segment_len() as usize, + self.locked_segments.len(), + self.storage.max_segment_len(), + ) + .map(|(s_id, iter)| (s_id, iter.filter(|vid| self.has_vid(*vid)))) + } + + fn has_vid(&self, vid: VID) -> bool { + let (segment_id, pos) = self.storage.resolve_pos(vid); + segment_id < self.locked_segments.len() + && pos.0 < self.locked_segments[segment_id].num_nodes() + } } impl NodeStorageInner { diff --git a/raphtory-storage/src/graph/edges/edges.rs b/raphtory-storage/src/graph/edges/edges.rs index 2648517b2a..b15f9c35f0 100644 --- a/raphtory-storage/src/graph/edges/edges.rs +++ b/raphtory-storage/src/graph/edges/edges.rs @@ -72,17 +72,8 @@ impl<'a> EdgesStorageRef<'a> { self, ) -> impl ParallelIterator + use<'a>)> + 'a { match self { - EdgesStorageRef::Mem(storage) => Iter2::I1( - storage - .segmented_par_iter() - .map(|(segment, iter)| (segment, Iter2::I1(iter))), - ), - EdgesStorageRef::Unlocked(edges) => Iter2::I2( - edges - .storage() - .segmented_par_iter() - 
.map(|(segment, iter)| (segment, Iter2::I2(iter))), - ), + EdgesStorageRef::Mem(storage) => Iter2::I1(storage.storage().row_groups_par_iter()), + EdgesStorageRef::Unlocked(edges) => Iter2::I2(edges.storage().row_groups_par_iter()), } } diff --git a/raphtory-storage/src/graph/nodes/nodes_ref.rs b/raphtory-storage/src/graph/nodes/nodes_ref.rs index 1aec0c1d8d..f170f8dafd 100644 --- a/raphtory-storage/src/graph/nodes/nodes_ref.rs +++ b/raphtory-storage/src/graph/nodes/nodes_ref.rs @@ -44,4 +44,12 @@ impl<'a> NodesStorageEntry<'a> { pub fn iter(&self) -> impl Iterator> { for_all_variants!(self, nodes => nodes.iter()) } + + /// Returns a parallel iterator over nodes row groups + /// the (usize) part is the row group not the segment + pub fn row_groups_par_iter( + &self, + ) -> impl ParallelIterator + '_)> { + for_all_variants!(self, nodes => nodes.row_groups_par_iter()) + } } diff --git a/raphtory/src/serialise/graph_folder.rs b/raphtory/src/serialise/graph_folder.rs index 1d1c2913a4..e83f3d3850 100644 --- a/raphtory/src/serialise/graph_folder.rs +++ b/raphtory/src/serialise/graph_folder.rs @@ -500,6 +500,12 @@ mod tests { .add_edge(4, 1, 3, [("test prop 4", true)], None) .unwrap(); + graph + .node(1) + .unwrap() + .add_updates(5, [("test node prop", 5i32)]) + .unwrap(); + let temp_folder = tempfile::TempDir::new().unwrap(); let folder = temp_folder.path().join("graph"); let graph_folder = GraphFolder::from(&folder); diff --git a/raphtory/src/serialise/parquet/edges.rs b/raphtory/src/serialise/parquet/edges.rs index 484d9211cf..28052f511c 100644 --- a/raphtory/src/serialise/parquet/edges.rs +++ b/raphtory/src/serialise/parquet/edges.rs @@ -67,12 +67,13 @@ pub(crate) fn encode_edge_deletions( g.edges().segmented_par_iter(), path, EDGES_D_PATH, - |id_type| { + |_| { vec![ Field::new(TIME_COL, DataType::Int64, false), Field::new(SECONDARY_INDEX_COL, DataType::UInt64, true), - Field::new(SRC_COL, id_type.clone(), false), - Field::new(DST_COL, id_type.clone(), false), + 
Field::new(SRC_COL_ID, DataType::UInt64, false), + Field::new(DST_COL_ID, DataType::UInt64, false), + Field::new(EDGE_COL_ID, DataType::UInt64, false), Field::new(LAYER_COL, DataType::Utf8, true), ] }, @@ -128,10 +129,11 @@ pub(crate) fn encode_edge_cprop( g.edges().segmented_par_iter(), path, EDGES_C_PATH, - |id_type| { + |_| { vec![ - Field::new(SRC_COL, id_type.clone(), false), - Field::new(DST_COL, id_type.clone(), false), + Field::new(SRC_COL_ID, DataType::UInt64, false), + Field::new(DST_COL_ID, DataType::UInt64, false), + Field::new(EDGE_COL_ID, DataType::UInt64, false), Field::new(LAYER_COL, DataType::Utf8, true), ] }, diff --git a/raphtory/src/serialise/parquet/mod.rs b/raphtory/src/serialise/parquet/mod.rs index 17c66826d7..3b9eb15ed3 100644 --- a/raphtory/src/serialise/parquet/mod.rs +++ b/raphtory/src/serialise/parquet/mod.rs @@ -185,6 +185,7 @@ pub trait ParquetDecoder: Sized { } const NODE_ID_COL: &str = "rap_node_id"; +const NODE_VID_COL: &str = "rap_node_vid"; const TYPE_COL: &str = "rap_node_type"; const TIME_COL: &str = "rap_time"; const SECONDARY_INDEX_COL: &str = "rap_secondary_index"; diff --git a/raphtory/src/serialise/parquet/model.rs b/raphtory/src/serialise/parquet/model.rs index 4d56e7c3b0..bd70dc0db5 100644 --- a/raphtory/src/serialise/parquet/model.rs +++ b/raphtory/src/serialise/parquet/model.rs @@ -7,7 +7,7 @@ use crate::{ graph::{edge::EdgeView, node::NodeView}, }, prelude::*, - serialise::parquet::{DST_COL_ID, EDGE_COL_ID, SRC_COL_ID}, + serialise::parquet::{DST_COL_ID, EDGE_COL_ID, NODE_VID_COL, SRC_COL_ID}, }; use arrow::datatypes::DataType; use raphtory_api::core::{ @@ -82,8 +82,9 @@ impl<'a, G: StaticGraphViewOps> Serialize for ParquetCEdge<'a, G> { .layer_name() .map_err(|_| S::Error::custom("Edge has no layer"))?; - state.serialize_entry(SRC_COL, &ParquetGID(edge.src().id()))?; - state.serialize_entry(DST_COL, &ParquetGID(edge.dst().id()))?; + state.serialize_entry(SRC_COL_ID, &(edge.src().node.0))?; + 
state.serialize_entry(DST_COL_ID, &(edge.dst().node.0))?; + state.serialize_entry(EDGE_COL_ID, &(edge.edge.pid().0))?; state.serialize_entry(LAYER_COL, &layer)?; for (name, prop) in edge.metadata().iter_filtered() { @@ -110,8 +111,9 @@ impl<'a, G: StaticGraphViewOps> Serialize for ParquetDelEdge<'a, G> { state.serialize_entry(TIME_COL, &self.del.0)?; state.serialize_entry(SECONDARY_INDEX_COL, &self.del.1)?; - state.serialize_entry(SRC_COL, &ParquetGID(edge.src().id()))?; - state.serialize_entry(DST_COL, &ParquetGID(edge.dst().id()))?; + state.serialize_entry(SRC_COL_ID, &(edge.src().node.0))?; + state.serialize_entry(DST_COL_ID, &(edge.dst().node.0))?; + state.serialize_entry(EDGE_COL_ID, &(edge.edge.pid().0))?; state.serialize_entry(LAYER_COL, &self.layer)?; state.end() @@ -133,6 +135,7 @@ impl<'a> Serialize for ParquetTNode<'a> { let mut state = serializer.serialize_map(None)?; state.serialize_entry(NODE_ID_COL, &ParquetGID(self.node.id()))?; + state.serialize_entry(NODE_VID_COL, &self.node.node.0)?; state.serialize_entry(TIME_COL, &self.t.0)?; state.serialize_entry(SECONDARY_INDEX_COL, &self.t.1)?; state.serialize_entry(TYPE_COL, &self.node.node_type())?; @@ -157,6 +160,7 @@ impl<'a> Serialize for ParquetCNode<'a> { let mut state = serializer.serialize_map(None)?; state.serialize_entry(NODE_ID_COL, &ParquetGID(self.node.id()))?; + state.serialize_entry(NODE_VID_COL, &self.node.node.0)?; state.serialize_entry(TYPE_COL, &self.node.node_type())?; for (name, prop) in self.node.metadata().iter_filtered() { diff --git a/raphtory/src/serialise/parquet/nodes.rs b/raphtory/src/serialise/parquet/nodes.rs index 4669b5b9fc..75784d0b02 100644 --- a/raphtory/src/serialise/parquet/nodes.rs +++ b/raphtory/src/serialise/parquet/nodes.rs @@ -4,13 +4,13 @@ use crate::{ errors::GraphError, serialise::parquet::{ model::{ParquetCNode, ParquetTNode}, - run_encode, NODES_C_PATH, NODES_T_PATH, NODE_ID_COL, SECONDARY_INDEX_COL, TIME_COL, - TYPE_COL, + run_encode_indexed, NODES_C_PATH, 
NODES_T_PATH, NODE_ID_COL, NODE_VID_COL, + SECONDARY_INDEX_COL, TIME_COL, TYPE_COL, }, }; use arrow::datatypes::{DataType, Field}; use itertools::Itertools; -use raphtory_api::{core::entities::VID, iter::IntoDynBoxed}; +use raphtory_api::iter::IntoDynBoxed; use raphtory_storage::graph::graph::GraphStorage; use std::path::Path; @@ -18,15 +18,16 @@ pub(crate) fn encode_nodes_tprop( g: &GraphStorage, path: impl AsRef, ) -> Result<(), GraphError> { - run_encode( + run_encode_indexed( g, g.node_meta().temporal_prop_mapper(), - g.unfiltered_num_nodes(), + g.nodes().row_groups_par_iter(), path, NODES_T_PATH, |id_type| { vec![ Field::new(NODE_ID_COL, id_type.clone(), false), + Field::new(NODE_VID_COL, DataType::UInt64, false), Field::new(TIME_COL, DataType::Int64, false), Field::new(SECONDARY_INDEX_COL, DataType::UInt64, true), Field::new(TYPE_COL, DataType::Utf8, true), @@ -34,21 +35,26 @@ pub(crate) fn encode_nodes_tprop( }, |nodes, g, decoder, writer| { let row_group_size = 100_000; + let nodes = nodes.collect::>(); + dbg!(&nodes); + + let nodes = nodes.into_iter(); let cols = g.node_meta().temporal_prop_mapper().all_keys(); let cols = &cols; for node_rows in nodes - .into_iter() - .map(VID) .map(|vid| NodeView::new_internal(g, vid)) .flat_map(move |node| { GenLockedIter::from(node, |node| { node.rows() - .map(|(t, props)| ParquetTNode { - node: *node, - cols, - t, - props, + .map(|(t, props)| { + dbg!(&t, &props); + ParquetTNode { + node: *node, + cols, + t, + props, + } }) .into_dyn_boxed() }) @@ -72,15 +78,16 @@ pub(crate) fn encode_nodes_cprop( g: &GraphStorage, path: impl AsRef, ) -> Result<(), GraphError> { - run_encode( + run_encode_indexed( g, g.node_meta().metadata_mapper(), - g.unfiltered_num_nodes(), + g.nodes().row_groups_par_iter(), path, NODES_C_PATH, |id_type| { vec![ Field::new(NODE_ID_COL, id_type.clone(), false), + Field::new(NODE_VID_COL, DataType::UInt64, false), Field::new(TYPE_COL, DataType::Utf8, true), ] }, @@ -88,8 +95,6 @@ pub(crate) fn 
encode_nodes_cprop( let row_group_size = 100_000; for node_rows in nodes - .into_iter() - .map(VID) .map(|vid| NodeView::new_internal(g, vid)) .map(move |node| ParquetCNode { node }) .chunks(row_group_size) From 00409fc019aecdf464eac8b0e873b77a80f7160c Mon Sep 17 00:00:00 2001 From: Fabian Murariu Date: Fri, 12 Dec 2025 16:12:55 +0000 Subject: [PATCH 18/24] move load nodes in submodule --- raphtory/src/io/arrow/df_loaders/edges.rs | 34 +- raphtory/src/io/arrow/df_loaders/mod.rs | 299 ++---------------- raphtory/src/io/arrow/df_loaders/nodes.rs | 273 ++++++++++++++++ raphtory/src/io/parquet_loaders.rs | 1 + .../src/python/graph/io/pandas_loaders.rs | 1 + raphtory/src/serialise/parquet/mod.rs | 10 +- 6 files changed, 314 insertions(+), 304 deletions(-) create mode 100644 raphtory/src/io/arrow/df_loaders/nodes.rs diff --git a/raphtory/src/io/arrow/df_loaders/edges.rs b/raphtory/src/io/arrow/df_loaders/edges.rs index 17c252100c..f36336a40e 100644 --- a/raphtory/src/io/arrow/df_loaders/edges.rs +++ b/raphtory/src/io/arrow/df_loaders/edges.rs @@ -4,17 +4,14 @@ use crate::{ errors::{into_graph_err, GraphError, LoadError}, io::arrow::{ dataframe::{DFChunk, DFView, SecondaryIndexCol}, - df_loaders::{build_progress_bar, process_shared_properties}, + df_loaders::{build_progress_bar, extract_secondary_index_col, process_shared_properties}, layer_col::lift_layer_col, node_col::NodeCol, prop_handler::*, }, prelude::*, }; -use arrow::{ - array::{AsArray, PrimitiveArray}, - datatypes::UInt64Type, -}; +use arrow::{array::AsArray, datatypes::UInt64Type}; use bytemuck::checked::cast_slice_mut; use db4_graph::WriteLockedGraph; use itertools::izip; @@ -33,7 +30,7 @@ use rayon::prelude::*; use std::{ collections::HashMap, sync::{ - atomic::{AtomicBool, AtomicU32, AtomicUsize, Ordering}, + atomic::{AtomicBool, AtomicUsize, Ordering}, mpsc, }, }; @@ -521,28 +518,3 @@ fn resolve_nodes_with_cache<'a, G: StaticGraphViewOps + PropertyAdditionOps + Ad })?; Ok(gid_str_cache) } - 
-#[inline(never)] -fn extract_secondary_index_col( - secondary_index_index: Option, - session: &::WS<'_>, - df: &DFChunk, -) -> Result { - let secondary_index_col = match secondary_index_index { - Some(col_index) => { - // Update the event_id to reflect ingesting new secondary indices. - let col = df.secondary_index_col(col_index)?; - session - .set_max_event_id(col.max()) - .map_err(into_graph_err)?; - col - } - None => { - let start_id = session - .reserve_event_ids(df.len()) - .map_err(into_graph_err)?; - SecondaryIndexCol::new_from_range(start_id, start_id + df.len()) - } - }; - Ok(secondary_index_col) -} diff --git a/raphtory/src/io/arrow/df_loaders/mod.rs b/raphtory/src/io/arrow/df_loaders/mod.rs index 40cd61e913..6de103dfbe 100644 --- a/raphtory/src/io/arrow/df_loaders/mod.rs +++ b/raphtory/src/io/arrow/df_loaders/mod.rs @@ -10,11 +10,9 @@ use crate::{ prelude::*, }; use bytemuck::checked::cast_slice_mut; -use db4_graph::WriteLockedGraph; use either::Either; use itertools::izip; use kdam::{Bar, BarBuilder, BarExt}; -use raphtory_api::atomic_extra::atomic_vid_from_mut_slice; use raphtory_api::{ atomic_extra::atomic_usize_from_mut_slice, core::{ @@ -22,7 +20,7 @@ use raphtory_api::{ properties::{meta::STATIC_GRAPH_LAYER_ID, prop::PropType}, EID, }, - storage::{dict_mapper::MaybeNew, timeindex::TimeIndexEntry, FxDashMap}, + storage::{dict_mapper::MaybeNew, timeindex::TimeIndexEntry}, }, }; use raphtory_core::{ @@ -34,13 +32,11 @@ use rayon::prelude::*; use std::{ borrow::{Borrow, Cow}, collections::HashMap, - sync::{ - atomic::{AtomicBool, AtomicU32, AtomicUsize, Ordering}, - mpsc, - }, + sync::atomic::{AtomicUsize, Ordering}, }; pub mod edges; +pub mod nodes; fn build_progress_bar(des: String, num_rows: usize) -> Result { BarBuilder::default() @@ -65,168 +61,6 @@ fn process_shared_properties( } } -pub fn load_nodes_from_df< - G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps + std::fmt::Debug, ->( - df_view: DFView>>, - time: &str, - secondary_index: 
Option<&str>, - node_id: &str, - properties: &[&str], - metadata: &[&str], - shared_metadata: Option<&HashMap>, - node_type: Option<&str>, - node_type_col: Option<&str>, - graph: &G, -) -> Result<(), GraphError> { - if df_view.is_empty() { - return Ok(()); - } - let properties_indices = properties - .iter() - .map(|name| df_view.get_index(name)) - .collect::, GraphError>>()?; - let metadata_indices = metadata - .iter() - .map(|name| df_view.get_index(name)) - .collect::, GraphError>>()?; - - let node_type_index = - node_type_col.map(|node_type_col| df_view.get_index(node_type_col.as_ref())); - let node_type_index = node_type_index.transpose()?; - - let node_id_index = df_view.get_index(node_id)?; - let time_index = df_view.get_index(time)?; - let secondary_index_index = secondary_index - .map(|col| df_view.get_index(col)) - .transpose()?; - - let session = graph.write_session().map_err(into_graph_err)?; - let shared_metadata = process_shared_properties(shared_metadata, |key, dtype| { - session - .resolve_node_property(key, dtype, true) - .map_err(into_graph_err) - })?; - - #[cfg(feature = "python")] - let mut pb = build_progress_bar("Loading nodes".to_string(), df_view.num_rows)?; - - let mut node_col_resolved = vec![]; - let mut node_type_col_resolved = vec![]; - - let mut write_locked_graph = graph.write_lock().map_err(into_graph_err)?; - - for chunk in df_view.chunks { - let df = chunk?; - let prop_cols = - combine_properties_arrow(properties, &properties_indices, &df, |key, dtype| { - session - .resolve_node_property(key, dtype, false) - .map_err(into_graph_err) - })?; - let metadata_cols = - combine_properties_arrow(metadata, &metadata_indices, &df, |key, dtype| { - session - .resolve_node_property(key, dtype, true) - .map_err(into_graph_err) - })?; - let node_type_col = lift_node_type_col(node_type, node_type_index, &df)?; - - let time_col = df.time_col(time_index)?; - let node_col = df.node_col(node_id_index)?; - - // Load the secondary index column if it 
exists, otherwise generate from start_id. - let secondary_index_col = match secondary_index_index { - Some(col_index) => { - // Update the event_id to reflect ingesting new secondary indices. - let col = df.secondary_index_col(col_index)?; - session - .set_max_event_id(col.max()) - .map_err(into_graph_err)?; - col - } - None => { - let start_id = session - .reserve_event_ids(df.len()) - .map_err(into_graph_err)?; - SecondaryIndexCol::new_from_range(start_id, start_id + df.len()) - } - }; - - node_col_resolved.resize_with(df.len(), Default::default); - node_type_col_resolved.resize_with(df.len(), Default::default); - - // TODO: Using parallel iterators results in a 5x speedup, but - // needs to be implemented such that node VID order is preserved. - // See: https://github.com/Pometry/pometry-storage/issues/81 - for (gid, resolved, node_type, node_type_resolved) in izip!( - node_col.iter(), - node_col_resolved.iter_mut(), - node_type_col.iter(), - node_type_col_resolved.iter_mut() - ) { - let (vid, res_node_type) = write_locked_graph - .graph() - .resolve_node_and_type(gid.as_node_ref(), node_type) - .map_err(|_| LoadError::FatalError)?; - - *resolved = vid; - *node_type_resolved = res_node_type; - } - - let node_stats = write_locked_graph.node_stats().clone(); - let update_time = |time: TimeIndexEntry| { - let time = time.t(); - node_stats.update_time(time); - }; - - write_locked_graph - .resize_chunks_to_num_nodes(write_locked_graph.graph().internal_num_nodes()); - - write_locked_graph - .nodes - .par_iter_mut() - .try_for_each(|shard| { - // Zip all columns for iteration. 
- let zip = izip!( - node_col_resolved.iter(), - time_col.iter(), - secondary_index_col.iter(), - node_type_col_resolved.iter(), - node_col.iter() - ); - - for (row, (vid, time, secondary_index, node_type, gid)) in zip.enumerate() { - if let Some(mut_node) = shard.resolve_pos(*vid) { - let mut writer = shard.writer(); - let t = TimeIndexEntry(time, secondary_index); - let layer_id = STATIC_GRAPH_LAYER_ID; - let lsn = 0; - - update_time(t); - writer - .store_node_id_and_node_type(mut_node, layer_id, gid, *node_type, lsn); - - let t_props = prop_cols.iter_row(row); - let c_props = metadata_cols - .iter_row(row) - .chain(shared_metadata.iter().cloned()); - - writer.add_props(t, mut_node, layer_id, t_props, lsn); - writer.update_c_props(mut_node, layer_id, c_props, lsn); - }; - } - - Ok::<_, GraphError>(()) - })?; - - #[cfg(feature = "python")] - let _ = pb.update(df.len()); - } - - Ok(()) -} - fn load_into_shard( src_col_shared: &[AtomicUsize], dst_col_shared: &[AtomicUsize], @@ -342,108 +176,6 @@ pub(crate) fn load_edge_deletions_from_df< Ok(()) } -pub(crate) fn load_node_props_from_df< - 'a, - G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps + std::fmt::Debug, ->( - df_view: DFView>>, - node_id: &str, - node_type: Option<&str>, - node_type_col: Option<&str>, - metadata: &[&str], - shared_metadata: Option<&HashMap>, - graph: &G, -) -> Result<(), GraphError> { - if df_view.is_empty() { - return Ok(()); - } - let metadata_indices = metadata - .iter() - .map(|name| df_view.get_index(name)) - .collect::, GraphError>>()?; - - let node_type_index = - node_type_col.map(|node_type_col| df_view.get_index(node_type_col.as_ref())); - let node_type_index = node_type_index.transpose()?; - - let node_id_index = df_view.get_index(node_id)?; - let session = graph.write_session().map_err(into_graph_err)?; - - let shared_metadata = process_shared_properties(shared_metadata, |key, dtype| { - session - .resolve_node_property(key, dtype, true) - .map_err(into_graph_err) - })?; - 
- #[cfg(feature = "python")] - let mut pb = build_progress_bar("Loading node properties".to_string(), df_view.num_rows)?; - - let mut node_col_resolved = vec![]; - let mut node_type_col_resolved = vec![]; - - let mut write_locked_graph = graph.write_lock().map_err(into_graph_err)?; - - for chunk in df_view.chunks { - let df = chunk?; - let metadata_cols = - combine_properties_arrow(metadata, &metadata_indices, &df, |key, dtype| { - session - .resolve_node_property(key, dtype, true) - .map_err(into_graph_err) - })?; - let node_type_col = lift_node_type_col(node_type, node_type_index, &df)?; - let node_col = df.node_col(node_id_index)?; - - node_col_resolved.resize_with(df.len(), Default::default); - node_type_col_resolved.resize_with(df.len(), Default::default); - - node_col - .iter() - .zip(node_col_resolved.iter_mut()) - .zip(node_type_col.iter()) - .zip(node_type_col_resolved.iter_mut()) - .try_for_each(|(((gid, resolved), node_type), node_type_resolved)| { - let (vid, res_node_type) = write_locked_graph - .graph() - .resolve_node_and_type(gid.as_node_ref(), node_type) - .map_err(|_| LoadError::FatalError)?; - *resolved = vid; - *node_type_resolved = res_node_type; - Ok::<(), LoadError>(()) - })?; - - write_locked_graph - .resize_chunks_to_num_nodes(write_locked_graph.graph().internal_num_nodes()); - - write_locked_graph.nodes.iter_mut().try_for_each(|shard| { - let mut c_props = vec![]; - - for (idx, ((vid, node_type), gid)) in node_col_resolved - .iter() - .zip(node_type_col_resolved.iter()) - .zip(node_col.iter()) - .enumerate() - { - if let Some(mut_node) = shard.resolve_pos(*vid) { - let mut writer = shard.writer(); - writer.store_node_id_and_node_type(mut_node, 0, gid, *node_type, 0); - - c_props.clear(); - c_props.extend(metadata_cols.iter_row(idx)); - c_props.extend_from_slice(&shared_metadata); - writer.update_c_props(mut_node, 0, c_props.drain(..), 0); - }; - } - - Ok::<_, GraphError>(()) - })?; - - #[cfg(feature = "python")] - let _ = 
pb.update(df.len()); - } - Ok(()) -} - pub(crate) fn load_edges_props_from_df< G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps, >( @@ -692,3 +424,28 @@ pub(crate) fn load_graph_props_from_df< Ok(()) } + +#[inline(never)] +pub(crate) fn extract_secondary_index_col( + secondary_index_index: Option, + session: &::WS<'_>, + df: &DFChunk, +) -> Result { + let secondary_index_col = match secondary_index_index { + Some(col_index) => { + // Update the event_id to reflect ingesting new secondary indices. + let col = df.secondary_index_col(col_index)?; + session + .set_max_event_id(col.max()) + .map_err(into_graph_err)?; + col + } + None => { + let start_id = session + .reserve_event_ids(df.len()) + .map_err(into_graph_err)?; + SecondaryIndexCol::new_from_range(start_id, start_id + df.len()) + } + }; + Ok(secondary_index_col) +} diff --git a/raphtory/src/io/arrow/df_loaders/nodes.rs b/raphtory/src/io/arrow/df_loaders/nodes.rs new file mode 100644 index 0000000000..14d3ea091b --- /dev/null +++ b/raphtory/src/io/arrow/df_loaders/nodes.rs @@ -0,0 +1,273 @@ +#[cfg(feature = "python")] +use crate::io::arrow::df_loaders::build_progress_bar; +use crate::{ + core::entities::nodes::node_ref::AsNodeRef, + db::api::view::StaticGraphViewOps, + errors::{into_graph_err, GraphError, LoadError}, + io::arrow::{ + dataframe::{DFChunk, DFView, SecondaryIndexCol}, + df_loaders::{extract_secondary_index_col, process_shared_properties}, + layer_col::lift_node_type_col, + prop_handler::*, + }, + prelude::*, +}; +use itertools::izip; +#[cfg(feature = "python")] +use kdam::BarExt; +use raphtory_api::core::{ + entities::properties::meta::STATIC_GRAPH_LAYER_ID, storage::timeindex::TimeIndexEntry, +}; +use raphtory_core::storage::timeindex::AsTime; +use raphtory_storage::mutation::addition_ops::{InternalAdditionOps, SessionAdditionOps}; +use rayon::prelude::*; +use std::collections::HashMap; + +pub fn load_nodes_from_df< + G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps + 
std::fmt::Debug, +>( + df_view: DFView>>, + time: &str, + secondary_index: Option<&str>, + node_id: &str, + properties: &[&str], + metadata: &[&str], + shared_metadata: Option<&HashMap>, + node_type: Option<&str>, + node_type_col: Option<&str>, + graph: &G, +) -> Result<(), GraphError> { + if df_view.is_empty() { + return Ok(()); + } + let properties_indices = properties + .iter() + .map(|name| df_view.get_index(name)) + .collect::, GraphError>>()?; + let metadata_indices = metadata + .iter() + .map(|name| df_view.get_index(name)) + .collect::, GraphError>>()?; + + let node_type_index = + node_type_col.map(|node_type_col| df_view.get_index(node_type_col.as_ref())); + let node_type_index = node_type_index.transpose()?; + + let node_id_index = df_view.get_index(node_id)?; + let time_index = df_view.get_index(time)?; + let secondary_index_index = secondary_index + .map(|col| df_view.get_index(col)) + .transpose()?; + + let session = graph.write_session().map_err(into_graph_err)?; + let shared_metadata = process_shared_properties(shared_metadata, |key, dtype| { + session + .resolve_node_property(key, dtype, true) + .map_err(into_graph_err) + })?; + + #[cfg(feature = "python")] + let mut pb = build_progress_bar("Loading nodes".to_string(), df_view.num_rows)?; + + let mut node_col_resolved = vec![]; + let mut node_type_col_resolved = vec![]; + + let mut write_locked_graph = graph.write_lock().map_err(into_graph_err)?; + + for chunk in df_view.chunks { + let df = chunk?; + let prop_cols = + combine_properties_arrow(properties, &properties_indices, &df, |key, dtype| { + session + .resolve_node_property(key, dtype, false) + .map_err(into_graph_err) + })?; + let metadata_cols = + combine_properties_arrow(metadata, &metadata_indices, &df, |key, dtype| { + session + .resolve_node_property(key, dtype, true) + .map_err(into_graph_err) + })?; + let node_type_col = lift_node_type_col(node_type, node_type_index, &df)?; + + let time_col = df.time_col(time_index)?; + let node_col = 
df.node_col(node_id_index)?; + + // Load the secondary index column if it exists, otherwise generate from start_id. + let secondary_index_col = + extract_secondary_index_col::(secondary_index_index, &session, &df)?; + node_col_resolved.resize_with(df.len(), Default::default); + node_type_col_resolved.resize_with(df.len(), Default::default); + + // TODO: Using parallel iterators results in a 5x speedup, but + // needs to be implemented such that node VID order is preserved. + // See: https://github.com/Pometry/pometry-storage/issues/81 + for (gid, resolved, node_type, node_type_resolved) in izip!( + node_col.iter(), + node_col_resolved.iter_mut(), + node_type_col.iter(), + node_type_col_resolved.iter_mut() + ) { + let (vid, res_node_type) = write_locked_graph + .graph() + .resolve_node_and_type(gid.as_node_ref(), node_type) + .map_err(|_| LoadError::FatalError)?; + + *resolved = vid; + *node_type_resolved = res_node_type; + } + + let node_stats = write_locked_graph.node_stats().clone(); + let update_time = |time: TimeIndexEntry| { + let time = time.t(); + node_stats.update_time(time); + }; + + write_locked_graph + .resize_chunks_to_num_nodes(write_locked_graph.graph().internal_num_nodes()); + + write_locked_graph + .nodes + .par_iter_mut() + .try_for_each(|shard| { + // Zip all columns for iteration. 
+ let zip = izip!( + node_col_resolved.iter(), + time_col.iter(), + secondary_index_col.iter(), + node_type_col_resolved.iter(), + node_col.iter() + ); + + for (row, (vid, time, secondary_index, node_type, gid)) in zip.enumerate() { + if let Some(mut_node) = shard.resolve_pos(*vid) { + let mut writer = shard.writer(); + let t = TimeIndexEntry(time, secondary_index); + let layer_id = STATIC_GRAPH_LAYER_ID; + let lsn = 0; + + update_time(t); + writer + .store_node_id_and_node_type(mut_node, layer_id, gid, *node_type, lsn); + + let t_props = prop_cols.iter_row(row); + let c_props = metadata_cols + .iter_row(row) + .chain(shared_metadata.iter().cloned()); + + writer.add_props(t, mut_node, layer_id, t_props, lsn); + writer.update_c_props(mut_node, layer_id, c_props, lsn); + }; + } + + Ok::<_, GraphError>(()) + })?; + + #[cfg(feature = "python")] + let _ = pb.update(df.len()); + } + + Ok(()) +} + +pub(crate) fn load_node_props_from_df< + 'a, + G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps + std::fmt::Debug, +>( + df_view: DFView>>, + node_id: &str, + node_type: Option<&str>, + node_type_col: Option<&str>, + metadata: &[&str], + shared_metadata: Option<&HashMap>, + graph: &G, +) -> Result<(), GraphError> { + if df_view.is_empty() { + return Ok(()); + } + let metadata_indices = metadata + .iter() + .map(|name| df_view.get_index(name)) + .collect::, GraphError>>()?; + + let node_type_index = + node_type_col.map(|node_type_col| df_view.get_index(node_type_col.as_ref())); + let node_type_index = node_type_index.transpose()?; + + let node_id_index = df_view.get_index(node_id)?; + let session = graph.write_session().map_err(into_graph_err)?; + + let shared_metadata = process_shared_properties(shared_metadata, |key, dtype| { + session + .resolve_node_property(key, dtype, true) + .map_err(into_graph_err) + })?; + + #[cfg(feature = "python")] + let mut pb = build_progress_bar("Loading node properties".to_string(), df_view.num_rows)?; + + let mut node_col_resolved = 
vec![]; + let mut node_type_col_resolved = vec![]; + + let mut write_locked_graph = graph.write_lock().map_err(into_graph_err)?; + + for chunk in df_view.chunks { + let df = chunk?; + let metadata_cols = + combine_properties_arrow(metadata, &metadata_indices, &df, |key, dtype| { + session + .resolve_node_property(key, dtype, true) + .map_err(into_graph_err) + })?; + let node_type_col = lift_node_type_col(node_type, node_type_index, &df)?; + let node_col = df.node_col(node_id_index)?; + + node_col_resolved.resize_with(df.len(), Default::default); + node_type_col_resolved.resize_with(df.len(), Default::default); + + node_col + .iter() + .zip(node_col_resolved.iter_mut()) + .zip(node_type_col.iter()) + .zip(node_type_col_resolved.iter_mut()) + .try_for_each(|(((gid, resolved), node_type), node_type_resolved)| { + let (vid, res_node_type) = write_locked_graph + .graph() + .resolve_node_and_type(gid.as_node_ref(), node_type) + .map_err(|_| LoadError::FatalError)?; + *resolved = vid; + *node_type_resolved = res_node_type; + Ok::<(), LoadError>(()) + })?; + + write_locked_graph + .resize_chunks_to_num_nodes(write_locked_graph.graph().internal_num_nodes()); + + write_locked_graph.nodes.iter_mut().try_for_each(|shard| { + let mut c_props = vec![]; + + for (idx, ((vid, node_type), gid)) in node_col_resolved + .iter() + .zip(node_type_col_resolved.iter()) + .zip(node_col.iter()) + .enumerate() + { + if let Some(mut_node) = shard.resolve_pos(*vid) { + let mut writer = shard.writer(); + writer.store_node_id_and_node_type(mut_node, 0, gid, *node_type, 0); + + c_props.clear(); + c_props.extend(metadata_cols.iter_row(idx)); + c_props.extend_from_slice(&shared_metadata); + writer.update_c_props(mut_node, 0, c_props.drain(..), 0); + }; + } + + Ok::<_, GraphError>(()) + })?; + + #[cfg(feature = "python")] + let _ = pb.update(df.len()); + } + Ok(()) +} diff --git a/raphtory/src/io/parquet_loaders.rs b/raphtory/src/io/parquet_loaders.rs index e30c61203d..1b05526dc8 100644 --- 
a/raphtory/src/io/parquet_loaders.rs +++ b/raphtory/src/io/parquet_loaders.rs @@ -5,6 +5,7 @@ use crate::{ dataframe::*, df_loaders::{ edges::{load_edges_from_df, ColumnNames}, + nodes::{load_node_props_from_df, load_nodes_from_df}, *, }, }, diff --git a/raphtory/src/python/graph/io/pandas_loaders.rs b/raphtory/src/python/graph/io/pandas_loaders.rs index 8a0a6211ad..b04b801b76 100644 --- a/raphtory/src/python/graph/io/pandas_loaders.rs +++ b/raphtory/src/python/graph/io/pandas_loaders.rs @@ -5,6 +5,7 @@ use crate::{ dataframe::*, df_loaders::{ edges::{load_edges_from_df, ColumnNames}, + nodes::{load_node_props_from_df, load_nodes_from_df}, *, }, }, diff --git a/raphtory/src/serialise/parquet/mod.rs b/raphtory/src/serialise/parquet/mod.rs index 3b9eb15ed3..99a9ec299d 100644 --- a/raphtory/src/serialise/parquet/mod.rs +++ b/raphtory/src/serialise/parquet/mod.rs @@ -489,7 +489,13 @@ fn decode_graph_storage( let t_node_path = path.as_ref().join(NODES_T_PATH); if std::fs::exists(&t_node_path)? 
{ - let exclude = vec![NODE_ID_COL, TIME_COL, SECONDARY_INDEX_COL, TYPE_COL]; + let exclude = vec![ + NODE_ID_COL, + NODE_VID_COL, + TIME_COL, + SECONDARY_INDEX_COL, + TYPE_COL, + ]; let (t_prop_columns, _) = collect_prop_columns(&t_node_path, &exclude)?; let t_prop_columns = t_prop_columns .iter() @@ -501,7 +507,7 @@ fn decode_graph_storage( &t_node_path, TIME_COL, Some(SECONDARY_INDEX_COL), - NODE_ID_COL, + NODE_VID_COL, None, Some(TYPE_COL), &t_prop_columns, From e994b2d778d5e0575a5f8eb056eeb774b9138b50 Mon Sep 17 00:00:00 2001 From: Fabian Murariu Date: Mon, 15 Dec 2025 16:38:24 +0000 Subject: [PATCH 19/24] df_loaders refactor progress --- Cargo.lock | 21 ++ db4-graph/src/lib.rs | 13 +- .../src/entities/graph/logical_to_physical.rs | 139 +------ raphtory-storage/src/graph/graph.rs | 6 + raphtory/src/db/api/view/graph.rs | 16 +- .../src/io/arrow/df_loaders/edge_props.rs | 217 +++++++++++ raphtory/src/io/arrow/df_loaders/edges.rs | 86 +---- raphtory/src/io/arrow/df_loaders/mod.rs | 339 ++++++++---------- raphtory/src/io/arrow/df_loaders/nodes.rs | 262 ++++++++++---- raphtory/src/io/arrow/layer_col.rs | 29 +- raphtory/src/io/parquet_loaders.rs | 8 + raphtory/src/python/graph/graph.rs | 4 + .../src/python/graph/graph_with_deletions.rs | 4 + .../src/python/graph/io/pandas_loaders.rs | 4 + raphtory/src/serialise/parquet/mod.rs | 55 ++- raphtory/src/serialise/parquet/model.rs | 5 +- raphtory/src/serialise/parquet/nodes.rs | 7 +- 17 files changed, 701 insertions(+), 514 deletions(-) create mode 100644 raphtory/src/io/arrow/df_loaders/edge_props.rs diff --git a/Cargo.lock b/Cargo.lock index 2bc455c67a..24703ff166 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5149,6 +5149,7 @@ dependencies = [ "tantivy", "tempfile", "thiserror 2.0.17", + "tikv-jemallocator", "tokio", "tracing", "uuid", @@ -6520,6 +6521,26 @@ dependencies = [ "ordered-float 2.10.1", ] +[[package]] +name = "tikv-jemalloc-sys" +version = "0.6.1+5.3.0-1-ge13ca993e8ccb9ba9847cc330696e02839f328f7" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd8aa5b2ab86a2cefa406d889139c162cbb230092f7d1d7cbc1716405d852a3b" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "tikv-jemallocator" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0359b4327f954e0567e69fb191cf1436617748813819c94b8cd4a431422d053a" +dependencies = [ + "libc", + "tikv-jemalloc-sys", +] + [[package]] name = "time" version = "0.3.44" diff --git a/db4-graph/src/lib.rs b/db4-graph/src/lib.rs index 90363daf43..ab972ac398 100644 --- a/db4-graph/src/lib.rs +++ b/db4-graph/src/lib.rs @@ -396,14 +396,13 @@ impl<'a, EXT: PersistentStrategy, ES = ES, GS = GS>> self.graph } - pub fn resize_chunks_to_num_nodes(&mut self, num_nodes: usize) { - if num_nodes == 0 { - return; + pub fn resize_chunks_to_num_nodes(&mut self, max_vid: Option) { + if let Some(max_vid) = max_vid { + let (chunks_needed, _) = self.graph.storage.nodes().resolve_pos(max_vid); + self.graph.storage().nodes().grow(chunks_needed + 1); + std::mem::take(&mut self.nodes); + self.nodes = self.graph.storage.nodes().write_locked(); } - let (chunks_needed, _) = self.graph.storage.nodes().resolve_pos(VID(num_nodes - 1)); - self.graph.storage().nodes().grow(chunks_needed + 1); - std::mem::take(&mut self.nodes); - self.nodes = self.graph.storage.nodes().write_locked(); } pub fn resize_chunks_to_num_edges(&mut self, num_edges: usize) { diff --git a/raphtory-core/src/entities/graph/logical_to_physical.rs b/raphtory-core/src/entities/graph/logical_to_physical.rs index ab3c6609f4..444d900270 100644 --- a/raphtory-core/src/entities/graph/logical_to_physical.rs +++ b/raphtory-core/src/entities/graph/logical_to_physical.rs @@ -1,17 +1,10 @@ -use dashmap::{mapref::entry::Entry, RwLockWriteGuard, SharedValue}; -use either::Either; -use hashbrown::raw::RawTable; +use dashmap::mapref::entry::Entry; use once_cell::sync::OnceCell; use raphtory_api::core::{ entities::{GidRef, GidType, VID}, 
storage::{dict_mapper::MaybeNew, FxDashMap}, }; -use rayon::prelude::*; use serde::{Deserialize, Deserializer, Serialize}; -use std::{ - borrow::Borrow, - hash::{BuildHasher, Hash}, -}; use thiserror::Error; #[derive(Debug, Deserialize, Serialize)] @@ -42,40 +35,6 @@ impl Map { _ => None, } } - - pub fn run_with_locked) -> Result<(), E> + Send + Sync>( - &self, - work_fn: FN, - ) -> Result<(), E> { - match self { - Map::U64(map) => { - let shards = map.shards(); - shards - .par_iter() - .enumerate() - .try_for_each(|(shard_id, shard)| { - work_fn(ResolverShard::U64(ResolverShardT::new( - shard.write(), - map, - shard_id, - ))) - }) - } - Map::Str(map) => { - let shards = map.shards(); - shards - .par_iter() - .enumerate() - .try_for_each(|(shard_id, shard)| { - work_fn(ResolverShard::Str(ResolverShardT::new( - shard.write(), - map, - shard_id, - ))) - }) - } - } - } } impl Default for Map { @@ -89,94 +48,6 @@ pub struct Mapping { map: OnceCell, } -pub enum ResolverShard<'a> { - U64(ResolverShardT<'a, u64>), - Str(ResolverShardT<'a, String>), -} - -impl<'a> ResolverShard<'a> { - pub fn shard_id(&self) -> usize { - match self { - ResolverShard::U64(ResolverShardT { shard_id, .. }) => *shard_id, - ResolverShard::Str(ResolverShardT { shard_id, .. 
}) => *shard_id, - } - } - - pub fn as_u64(&mut self) -> Option<&mut ResolverShardT<'a, u64>> { - if let ResolverShard::U64(shard) = self { - Some(shard) - } else { - None - } - } - - pub fn as_str(&mut self) -> Option<&mut ResolverShardT<'a, String>> { - if let ResolverShard::Str(shard) = self { - Some(shard) - } else { - None - } - } -} - -pub struct ResolverShardT<'a, T> { - guard: RwLockWriteGuard<'a, RawTable<(T, SharedValue)>>, - map: &'a FxDashMap, - shard_id: usize, -} - -impl<'a, T: Eq + Hash + Clone> ResolverShardT<'a, T> { - pub fn new( - guard: RwLockWriteGuard<'a, RawTable<(T, SharedValue)>>, - map: &'a FxDashMap, - shard_id: usize, - ) -> Self { - Self { - guard, - map, - shard_id, - } - } - pub fn resolve_node( - &mut self, - id: &Q, - next_id: impl FnOnce(&Q) -> Either, - ) -> Option - where - T: Borrow, - Q: Eq + Hash + ToOwned + ?Sized, - { - let shard_ind = self.map.determine_map(id.borrow()); - if shard_ind != self.shard_id { - // This shard does not contain the id, return None - return None; - } - let factory = self.map.hasher().clone(); - let hash = factory.hash_one(id); - - match self.guard.get(hash, |(k, _)| k.borrow() == id) { - Some((_, vid)) => { - // Node already exists, do nothing - Some(*(vid.get())) - } - None => { - // Node does not exist, create it - let vid = next_id(id); - - if let Either::Left(vid) = vid { - self.guard - .insert(hash, (id.borrow().to_owned(), SharedValue::new(vid)), |t| { - factory.hash_one(&t.0) - }); - Some(vid) - } else { - vid.right() - } - } - } - } -} - impl Mapping { pub fn len(&self) -> usize { self.map.get().map_or(0, |map| match map { @@ -185,14 +56,6 @@ impl Mapping { }) } - pub fn run_with_locked) -> Result<(), E> + Send + Sync>( - &self, - work_fn: FN, - ) -> Result<(), E> { - let inner_map = self.map.get().unwrap(); - inner_map.run_with_locked(work_fn) - } - pub fn is_empty(&self) -> bool { self.len() == 0 } diff --git a/raphtory-storage/src/graph/graph.rs b/raphtory-storage/src/graph/graph.rs index 
e047b0f0ce..b5108741d3 100644 --- a/raphtory-storage/src/graph/graph.rs +++ b/raphtory-storage/src/graph/graph.rs @@ -29,6 +29,12 @@ pub enum Immutable { ReadLockedImmutable, } +impl From> for GraphStorage { + fn from(value: Arc) -> Self { + Self::Unlocked(value) + } +} + impl From for GraphStorage { fn from(value: TemporalGraph) -> Self { Self::Unlocked(Arc::new(value)) diff --git a/raphtory/src/db/api/view/graph.rs b/raphtory/src/db/api/view/graph.rs index 06f5c51988..343327613d 100644 --- a/raphtory/src/db/api/view/graph.rs +++ b/raphtory/src/db/api/view/graph.rs @@ -326,15 +326,12 @@ impl<'graph, G: GraphView + 'graph> GraphViewOps<'graph> for G { .storage() .set_event_id(storage.read_event_id()); - let graph_storage = GraphStorage::from(temporal_graph); + let temporal_graph = Arc::new(temporal_graph); + + let graph_storage = GraphStorage::from(temporal_graph.clone()); { // scope for the write lock - let mut new_storage = graph_storage.write_lock()?; - new_storage.resize_chunks_to_num_nodes(self.count_nodes()); - for layer_id in &layer_map { - new_storage.nodes.ensure_layer(*layer_id); - } let mut node_map = vec![VID::default(); storage.unfiltered_num_nodes()]; let node_map_shared = @@ -345,7 +342,7 @@ impl<'graph, G: GraphView + 'graph> GraphViewOps<'graph> for G { self.nodes().par_iter().for_each(|node| { let vid = node.node; if let Some(pos) = index.index(&vid) { - let new_vid = new_storage.graph().storage().nodes().reserve_vid(pos); + let new_vid = temporal_graph.storage().nodes().reserve_vid(pos); node_map_shared[pos].store(new_vid.index(), Ordering::Relaxed); } }); @@ -356,6 +353,11 @@ impl<'graph, G: GraphView + 'graph> GraphViewOps<'graph> for G { .expect("old_vid should exist in index"); node_map[pos] }; + let mut new_storage = graph_storage.write_lock()?; + + for layer_id in &layer_map { + new_storage.nodes.ensure_layer(*layer_id); + } new_storage.nodes.par_iter_mut().try_for_each(|shard| { for node in self.nodes().iter() { diff --git 
a/raphtory/src/io/arrow/df_loaders/edge_props.rs b/raphtory/src/io/arrow/df_loaders/edge_props.rs new file mode 100644 index 0000000000..77ad9e2c42 --- /dev/null +++ b/raphtory/src/io/arrow/df_loaders/edge_props.rs @@ -0,0 +1,217 @@ +use crate::{ + db::api::view::StaticGraphViewOps, + errors::{into_graph_err, GraphError, LoadError}, + io::arrow::{ + dataframe::{DFChunk, DFView}, + df_loaders::{ + build_progress_bar, + edges::{get_or_resolve_node_vids, store_node_ids, ColumnNames}, + extract_secondary_index_col, process_shared_properties, resolve_nodes_with_cache, + GidKey, + }, + layer_col::lift_layer_col, + node_col::NodeCol, + prop_handler::*, + }, + prelude::*, +}; +use arrow::{array::AsArray, datatypes::UInt64Type}; +use bytemuck::checked::cast_slice_mut; +use db4_graph::WriteLockedGraph; +use itertools::izip; +use kdam::BarExt; +use raphtory_api::atomic_extra::atomic_vid_from_mut_slice; +use raphtory_api::{ + atomic_extra::atomic_usize_from_mut_slice, + core::{ + entities::EID, + storage::{dict_mapper::MaybeNew, timeindex::TimeIndexEntry, FxDashMap}, + }, +}; +use raphtory_core::entities::VID; +use raphtory_storage::mutation::addition_ops::SessionAdditionOps; +use rayon::prelude::*; +use std::{ + collections::HashMap, + sync::{ + atomic::{AtomicBool, AtomicUsize, Ordering}, + mpsc, + }, +}; +use storage::{ + api::{edges::EdgeSegmentOps, nodes::NodeSegmentOps}, + pages::locked::{edges::LockedEdgePage, nodes::LockedNodePage}, + Extension, +}; + +#[allow(clippy::too_many_arguments)] +pub fn load_edges_from_df( + df_view: DFView> + Send>, + column_names: ColumnNames, + resolve_nodes: bool, + metadata: &[&str], + shared_metadata: Option<&HashMap>, + layer: Option<&str>, + graph: &G, +) -> Result<(), GraphError> { + if df_view.is_empty() { + return Ok(()); + } + + let ColumnNames { + src, + dst, + layer_col, + .. 
+ } = column_names; + + let metadata_indices = metadata + .iter() + .map(|name| df_view.get_index(name)) + .collect::, GraphError>>()?; + + let src_index = df_view.get_index(src)?; + let dst_index = df_view.get_index(dst)?; + let layer_index = if let Some(layer_col) = layer_col { + Some(df_view.get_index(layer_col.as_ref())?) + } else { + None + }; + let session = graph.write_session().map_err(into_graph_err)?; + let shared_metadata = process_shared_properties(shared_metadata, |key, dtype| { + session + .resolve_edge_property(key, dtype, true) + .map_err(into_graph_err) + })?; + + // #[cfg(feature = "python")] + let mut pb = build_progress_bar("Loading edges metadata".to_string(), df_view.num_rows)?; + + let mut src_col_resolved: Vec = vec![]; + let mut dst_col_resolved: Vec = vec![]; + let mut eid_col_resolved: Vec = vec![]; + + rayon::scope(|s| { + let (tx, rx) = mpsc::sync_channel(2); + + s.spawn(move |_| { + let sender = tx; + for chunk in df_view.chunks { + sender.send(chunk).unwrap() + } + }); + + for chunk in rx.iter() { + let df = chunk?; + let metadata_cols = + combine_properties_arrow(metadata, &metadata_indices, &df, |key, dtype| { + session + .resolve_edge_property(key, dtype, true) + .map_err(into_graph_err) + })?; + // validate src and dst columns + let src_col = df.node_col(src_index)?; + src_col.validate(graph, LoadError::MissingSrcError)?; + let dst_col = df.node_col(dst_index)?; + dst_col.validate(graph, LoadError::MissingDstError)?; + let layer = lift_layer_col(layer, layer_index, &df)?; + let layer_col_resolved = layer.resolve_layer(graph)?; + + let (src_vids, dst_vids, gid_str_cache) = get_or_resolve_node_vids( + graph, + src_index, + dst_index, + &mut src_col_resolved, + &mut dst_col_resolved, + resolve_nodes, + &df, + &src_col, + &dst_col, + )?; + + let mut write_locked_graph = graph.write_lock().map_err(into_graph_err)?; + + eid_col_resolved.resize_with(df.len(), Default::default); + let eid_col_shared = 
atomic_usize_from_mut_slice(cast_slice_mut(&mut eid_col_resolved)); + + let WriteLockedGraph { nodes, .. } = &mut write_locked_graph; + + // Generate all edge_ids + add outbound edges + nodes.par_iter_mut().try_for_each(|locked_page| { + // Zip all columns for iteration. + let zip = izip!(src_vids.iter(), dst_vids.iter()); + add_and_resolve_outbound_edges(&eid_col_shared, locked_page, zip)?; + // resolve_nodes=false + // assumes we are loading our own graph, via the parquet loaders, + // so previous calls have already stored the node ids and types + if resolve_nodes { + store_node_ids(&gid_str_cache, locked_page); + } + Ok::<_, GraphError>(()) + })?; + + drop(write_locked_graph); + + let mut write_locked_graph = graph.write_lock().map_err(into_graph_err)?; + + write_locked_graph.edges.par_iter_mut().for_each(|shard| { + let zip = izip!( + src_vids.iter(), + dst_vids.iter(), + eid_col_resolved.iter(), + layer_col_resolved.iter(), + ); + update_edge_metadata(&shared_metadata, &metadata_cols, shard, zip); + }); + + // #[cfg(feature = "python")] + let _ = pb.update(df.len()); + } + Ok::<_, GraphError>(()) + })?; + // set the type of the resolver; + + Ok(()) +} + +#[inline(never)] +fn add_and_resolve_outbound_edges<'a, NS: NodeSegmentOps>( + eid_col_shared: &&mut [AtomicUsize], + locked_page: &mut LockedNodePage<'_, NS>, + zip: impl Iterator, +) -> Result<(), LoadError> { + for (row, (src, dst)) in zip.enumerate() { + if let Some(src_pos) = locked_page.resolve_pos(*src) { + let writer = locked_page.writer(); + // find the original EID in the static graph if it exists + // otherwise create a new one + if let Some(edge_id) = writer.get_out_edge(src_pos, *dst, 0) { + eid_col_shared[row].store(edge_id.0, Ordering::Relaxed); + } else { + return Err(LoadError::MissingEdgeError(*src, *dst)); + }; + } + } + Ok(()) +} + +#[inline(never)] +fn update_edge_metadata<'a, ES: EdgeSegmentOps>( + shared_metadata: &[(usize, Prop)], + metadata_cols: &PropCols, + shard: &mut 
LockedEdgePage<'_, ES>, + zip: impl Iterator, +) { + let mut c_props: Vec<(usize, Prop)> = Vec::new(); + for (row, (src, dst, eid, layer)) in zip.enumerate() { + if let Some(eid_pos) = shard.resolve_pos(*eid) { + let mut writer = shard.writer(); + + c_props.clear(); + c_props.extend(metadata_cols.iter_row(row)); + c_props.extend_from_slice(shared_metadata); + + writer.update_c_props(eid_pos, *src, *dst, *layer, c_props.drain(..)); + } + } +} diff --git a/raphtory/src/io/arrow/df_loaders/edges.rs b/raphtory/src/io/arrow/df_loaders/edges.rs index f36336a40e..27b8d41e68 100644 --- a/raphtory/src/io/arrow/df_loaders/edges.rs +++ b/raphtory/src/io/arrow/df_loaders/edges.rs @@ -1,10 +1,12 @@ use crate::{ - core::entities::nodes::node_ref::AsNodeRef, db::api::view::StaticGraphViewOps, errors::{into_graph_err, GraphError, LoadError}, io::arrow::{ - dataframe::{DFChunk, DFView, SecondaryIndexCol}, - df_loaders::{build_progress_bar, extract_secondary_index_col, process_shared_properties}, + dataframe::{DFChunk, DFView}, + df_loaders::{ + build_progress_bar, extract_secondary_index_col, process_shared_properties, + resolve_nodes_with_cache, GidKey, + }, layer_col::lift_layer_col, node_col::NodeCol, prop_handler::*, @@ -24,8 +26,8 @@ use raphtory_api::{ storage::{dict_mapper::MaybeNew, timeindex::TimeIndexEntry, FxDashMap}, }, }; -use raphtory_core::entities::{GidRef, VID}; -use raphtory_storage::mutation::addition_ops::{InternalAdditionOps, SessionAdditionOps}; +use raphtory_core::entities::VID; +use raphtory_storage::mutation::addition_ops::SessionAdditionOps; use rayon::prelude::*; use std::{ collections::HashMap, @@ -144,7 +146,7 @@ pub fn load_edges_from_df, (Prop, MaybeNew)>, + FxDashMap, (Prop, MaybeNew)>, ), GraphError, > { @@ -304,7 +311,8 @@ fn get_or_resolve_node_vids< let gid_str_cache = resolve_nodes_with_cache::( graph, - [(src_col, atomic_src_col), (dst_col, atomic_dst_col)].as_ref(), + [(src_col), (dst_col)].as_ref(), + [atomic_src_col, 
atomic_dst_col].as_ref(), )?; ( src_col_resolved.as_slice(), @@ -449,8 +457,8 @@ fn add_and_resolve_outbound_edges< } #[inline(never)] -fn store_node_ids>( - gid_str_cache: &FxDashMap, (Prop, MaybeNew)>, +pub fn store_node_ids>( + gid_str_cache: &FxDashMap)>, locked_page: &mut LockedNodePage<'_, NS>, ) { for entry in gid_str_cache.iter() { @@ -462,59 +470,3 @@ fn store_node_ids>( } } } - -#[inline(never)] -fn resolve_nodes_with_cache<'a, G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps>( - graph: &G, - cols_to_resolve: &[(&'a NodeCol, &mut [AtomicUsize])], -) -> Result, (Prop, MaybeNew)>, GraphError> { - let gid_str_cache: dashmap::DashMap, (Prop, MaybeNew), _> = - FxDashMap::default(); - let hasher_factory = gid_str_cache.hasher().clone(); - gid_str_cache - .shards() - .par_iter() - .enumerate() - .try_for_each(|(shard_idx, shard)| { - let mut shard_guard = shard.write(); - use dashmap::SharedValue; - use std::hash::BuildHasher; - - // Create hasher function for this shard - let hash_key = |key: &GidRef<'_>| -> u64 { hasher_factory.hash_one(key) }; - - let hasher_fn = |tuple: &(GidRef<'_>, SharedValue<(Prop, MaybeNew)>)| -> u64 { - hasher_factory.hash_one(tuple.0) - }; - - for (col, atomic_col) in cols_to_resolve { - // Process src_col sequentially for this shard - for (idx, gid) in col.iter().enumerate() { - // Check if this key belongs to this shard - if gid_str_cache.determine_map(&gid) != shard_idx { - continue; // Skip, not our shard - } - - let hash = hash_key(&gid); - - // Check if exists in this shard - if let Some((_, value)) = shard_guard.get(hash, |(g, _)| g == &gid) { - let (_, vid) = value.get(); - atomic_col[idx].store(vid.inner().index(), Ordering::Relaxed); - } else { - let vid = graph - .resolve_node(gid.as_node_ref()) - .map_err(|_| LoadError::FatalError)?; - - let data = (gid, SharedValue::new((Prop::from(gid), vid))); - shard_guard.insert(hash, data, hasher_fn); - - atomic_col[idx].store(vid.inner().index(), Ordering::Relaxed); - } - } 
- } - - Ok::<(), LoadError>(()) - })?; - Ok(gid_str_cache) -} diff --git a/raphtory/src/io/arrow/df_loaders/mod.rs b/raphtory/src/io/arrow/df_loaders/mod.rs index 6de103dfbe..c2fd5194b4 100644 --- a/raphtory/src/io/arrow/df_loaders/mod.rs +++ b/raphtory/src/io/arrow/df_loaders/mod.rs @@ -4,37 +4,31 @@ use crate::{ errors::{into_graph_err, GraphError, LoadError}, io::arrow::{ dataframe::{DFChunk, DFView, SecondaryIndexCol}, - layer_col::{lift_layer_col, lift_node_type_col}, + df_loaders::edges::ColumnNames, + layer_col::{lift_layer_col, LayerCol}, + node_col::NodeCol, prop_handler::*, }, prelude::*, }; use bytemuck::checked::cast_slice_mut; -use either::Either; -use itertools::izip; use kdam::{Bar, BarBuilder, BarExt}; use raphtory_api::{ atomic_extra::atomic_usize_from_mut_slice, core::{ - entities::{ - properties::{meta::STATIC_GRAPH_LAYER_ID, prop::PropType}, - EID, - }, - storage::{dict_mapper::MaybeNew, timeindex::TimeIndexEntry}, + entities::{properties::prop::PropType, EID}, + storage::{dict_mapper::MaybeNew, timeindex::TimeIndexEntry, FxDashMap}, }, }; -use raphtory_core::{ - entities::{graph::logical_to_physical::ResolverShardT, GidRef, VID}, - storage::timeindex::AsTime, -}; +use raphtory_core::entities::{GidRef, VID}; use raphtory_storage::mutation::addition_ops::{InternalAdditionOps, SessionAdditionOps}; use rayon::prelude::*; use std::{ - borrow::{Borrow, Cow}, collections::HashMap, sync::atomic::{AtomicUsize, Ordering}, }; +pub mod edge_props; pub mod edges; pub mod nodes; @@ -61,49 +55,6 @@ fn process_shared_properties( } } -fn load_into_shard( - src_col_shared: &[AtomicUsize], - dst_col_shared: &[AtomicUsize], - src_col: &super::node_col::NodeCol, - dst_col: &super::node_col::NodeCol, - node_count: &AtomicUsize, - shard: &mut ResolverShardT<'_, T>, - mut mapper_fn: impl FnMut(GidRef<'_>) -> Cow<'_, Q>, - mut fallback_fn: impl FnMut(&Q) -> Option, -) -> Result<(), LoadError> -where - T: Clone + Eq + std::hash::Hash + Borrow, - Q: Eq + std::hash::Hash 
+ ToOwned + ?Sized, -{ - let src_iter = src_col.iter().map(&mut mapper_fn).enumerate(); - - for (id, gid) in src_iter { - if let Some(vid) = shard.resolve_node(&gid, |id| { - // fallback_fn(id).map(Either::Right).unwrap_or_else(|| { - // // If the node does not exist, create a new VID - // Either::Left(VID(node_count.fetch_add(1, Ordering::Relaxed))) - // }) - Either::Left(VID(node_count.fetch_add(1, Ordering::Relaxed))) - }) { - src_col_shared[id].store(vid.0, Ordering::Relaxed); - } - } - - let dst_iter = dst_col.iter().map(mapper_fn).enumerate(); - for (id, gid) in dst_iter { - if let Some(vid) = shard.resolve_node(&gid, |id| { - // fallback_fn(id).map(Either::Right).unwrap_or_else(|| { - // // If the node does not exist, create a new VID - // Either::Left(VID(node_count.fetch_add(1, Ordering::Relaxed))) - // }) - Either::Left(VID(node_count.fetch_add(1, Ordering::Relaxed))) - }) { - dst_col_shared[id].store(vid.0, Ordering::Relaxed); - } - } - Ok::<_, LoadError>(()) -} - pub(crate) fn load_edge_deletions_from_df< G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps + DeletionOps, >( @@ -179,7 +130,7 @@ pub(crate) fn load_edge_deletions_from_df< pub(crate) fn load_edges_props_from_df< G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps, >( - df_view: DFView>>, + df_view: DFView> + Send>, src: &str, dst: &str, metadata: &[&str], @@ -187,140 +138,24 @@ pub(crate) fn load_edges_props_from_df< layer: Option<&str>, layer_col: Option<&str>, graph: &G, + resolve_nodes: bool, ) -> Result<(), GraphError> { - if df_view.is_empty() { - return Ok(()); - } - let metadata_indices = metadata - .iter() - .map(|name| df_view.get_index(name)) - .collect::, GraphError>>()?; - - let src_index = df_view.get_index(src)?; - let dst_index = df_view.get_index(dst)?; - let layer_index = if let Some(layer_col) = layer_col { - Some(df_view.get_index(layer_col.as_ref())?) 
- } else { - None - }; - let session = graph.write_session().map_err(into_graph_err)?; - let shared_metadata = process_shared_properties(shared_metadata, |key, dtype| { - session - .resolve_edge_property(key, dtype, true) - .map_err(into_graph_err) - })?; - - #[cfg(feature = "python")] - let mut pb = build_progress_bar("Loading edge properties".to_string(), df_view.num_rows)?; - #[cfg(feature = "python")] - let _ = pb.update(0); - - let mut src_col_resolved = vec![]; - let mut dst_col_resolved = vec![]; - let mut eid_col_resolved = vec![]; - - let mut write_locked_graph = graph.write_lock().map_err(into_graph_err)?; - - let g = write_locked_graph.graph; - - for chunk in df_view.chunks { - let df = chunk?; - let metadata_cols = - combine_properties_arrow(metadata, &metadata_indices, &df, |key, dtype| { - session - .resolve_edge_property(key, dtype, true) - .map_err(into_graph_err) - })?; - let layer = lift_layer_col(layer, layer_index, &df)?; - let layer_col_resolved = layer.resolve(graph)?; - - let src_col = df.node_col(src_index)?; - src_col.validate(graph, LoadError::MissingSrcError)?; - - let dst_col = df.node_col(dst_index)?; - dst_col.validate(graph, LoadError::MissingDstError)?; - - // It's our graph, no one else can change it - src_col_resolved.resize_with(df.len(), Default::default); - src_col - .par_iter() - .zip(src_col_resolved.par_iter_mut()) - .try_for_each(|(gid, resolved)| { - let gid = gid.ok_or(LoadError::FatalError)?; - let vid = g - .resolve_node_ref(gid.as_node_ref()) - .ok_or(LoadError::MissingNodeError)?; - *resolved = vid; - Ok::<(), LoadError>(()) - })?; - - dst_col_resolved.resize_with(df.len(), Default::default); - dst_col - .par_iter() - .zip(dst_col_resolved.par_iter_mut()) - .try_for_each(|(gid, resolved)| { - let gid = gid.ok_or(LoadError::FatalError)?; - let vid = g - .resolve_node_ref(gid.as_node_ref()) - .ok_or(LoadError::MissingNodeError)?; - *resolved = vid; - Ok::<(), LoadError>(()) - })?; - - write_locked_graph - 
.resize_chunks_to_num_nodes(write_locked_graph.graph().internal_num_nodes()); - - // resolve all the edges - eid_col_resolved.resize_with(df.len(), Default::default); - let eid_col_shared = atomic_usize_from_mut_slice(cast_slice_mut(&mut eid_col_resolved)); - - write_locked_graph - .nodes - .par_iter_mut() - .try_for_each(|shard| { - for (row, (src, dst)) in src_col_resolved - .iter() - .zip(dst_col_resolved.iter()) - .enumerate() - { - if let Some(src_node) = shard.resolve_pos(*src) { - let writer = shard.writer(); - let EID(eid) = writer - .get_out_edge(src_node, *dst, 0) - .ok_or(LoadError::MissingEdgeError(*src, *dst))?; - eid_col_shared[row].store(eid, Ordering::Relaxed); - } - } - Ok::<_, LoadError>(()) - })?; - - write_locked_graph - .edges - .par_iter_mut() - .try_for_each(|shard| { - let mut c_props = vec![]; - for (idx, (((eid, layer), src), dst)) in eid_col_resolved - .iter() - .zip(layer_col_resolved.iter()) - .zip(&src_col_resolved) - .zip(&dst_col_resolved) - .enumerate() - { - if let Some(eid_pos) = shard.resolve_pos(*eid) { - let mut writer = shard.writer(); - c_props.clear(); - c_props.extend(metadata_cols.iter_row(idx)); - c_props.extend_from_slice(&shared_metadata); - writer.update_c_props(eid_pos, *src, *dst, *layer, c_props.drain(..)); - } - } - Ok::<(), GraphError>(()) - })?; - - #[cfg(feature = "python")] - let _ = pb.update(df.len()); - } - Ok(()) + edge_props::load_edges_from_df( + df_view, + ColumnNames { + src, + dst, + layer_col, + time: "", + secondary_index: None, + edge_id: None, + }, + resolve_nodes, + metadata, + shared_metadata, + layer, + graph, + ) } pub(crate) fn load_graph_props_from_df< @@ -449,3 +284,127 @@ pub(crate) fn extract_secondary_index_col( }; Ok(secondary_index_col) } + +#[inline(never)] +fn resolve_nodes_with_cache<'a, G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps>( + graph: &G, + cols_to_resolve: &[&'a NodeCol], + resolved_cols: &[&mut [AtomicUsize]], +) -> Result, (Prop, MaybeNew)>, GraphError> { + 
let node_type_col = vec![None; cols_to_resolve.len()]; + resolve_nodes_with_cache_generic( + &cols_to_resolve, + &node_type_col, + |v: &(Prop, MaybeNew), idx, col_idx| { + let (_, vid) = v; + resolved_cols[col_idx][idx].store(vid.inner().0, Ordering::Relaxed); + }, + |gid, _idx| { + let GidKey { gid, .. } = gid; + let vid = graph + .resolve_node(gid.as_node_ref()) + .map_err(|_| LoadError::FatalError) + .unwrap(); + (Prop::from(gid), vid) + }, + ) +} + +#[inline(never)] +fn resolve_nodes_and_type_with_cache< + 'a, + G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps, +>( + graph: &G, + cols_to_resolve: &[&'a NodeCol], + resolved_cols: &[&mut [AtomicUsize]], + node_type_col: LayerCol<'a>, +) -> Result, (VID, usize)>, GraphError> { + let node_type_cols = vec![Some(node_type_col); cols_to_resolve.len()]; + resolve_nodes_with_cache_generic( + cols_to_resolve, + &node_type_cols, + |v: &(VID, usize), row, col_idx| { + let (vid, _) = v; + resolved_cols[col_idx][row].store(vid.index(), Ordering::Relaxed); + }, + |gid, _| { + let GidKey { gid, node_type } = gid; + let (vid, node_type) = graph + .resolve_node_and_type(gid.as_node_ref(), node_type) + .map_err(|_| LoadError::FatalError) + .unwrap(); + (vid, node_type) + }, + ) +} + +#[derive(Clone, Debug, PartialEq, Eq, Hash, Copy)] +pub struct GidKey<'a> { + gid: GidRef<'a>, + node_type: Option<&'a str>, +} + +impl<'a> GidKey<'a> { + pub fn new(gid: GidRef<'a>, node_type: Option<&'a str>) -> Self { + Self { gid, node_type } + } +} + +#[inline(always)] +fn resolve_nodes_with_cache_generic<'a, V: Send + Sync>( + cols_to_resolve: &[&'a NodeCol], + node_type_cols: &[Option>], + update_fn: impl Fn(&V, usize, usize) + Send + Sync, + new_fn: impl Fn(GidKey<'a>, usize) -> V + Send + Sync, +) -> Result, V>, GraphError> { + assert_eq!(cols_to_resolve.len(), node_type_cols.len()); + let gid_str_cache: dashmap::DashMap, V, _> = FxDashMap::default(); + let hasher_factory = gid_str_cache.hasher().clone(); + gid_str_cache + .shards() 
+ .par_iter() + .enumerate() + .try_for_each(|(shard_idx, shard)| { + let mut shard_guard = shard.write(); + use dashmap::SharedValue; + use std::hash::BuildHasher; + + // Create hasher function for this shard + let hash_key = |key: &GidKey<'_>| -> u64 { hasher_factory.hash_one(key) }; + + let hasher_fn = + |tuple: &(GidKey<'_>, SharedValue)| -> u64 { hasher_factory.hash_one(tuple.0) }; + + for (col_id, (node_col, layer_col)) in + cols_to_resolve.iter().zip(node_type_cols).enumerate() + { + // Process src_col sequentially for this shard + for (idx, gid) in node_col.iter().enumerate() { + let node_type = layer_col.as_ref().and_then(|lc| lc.get(idx)); + let gid = GidKey::new(gid, node_type); + // Check if this key belongs to this shard + if gid_str_cache.determine_map(&gid) != shard_idx { + continue; // Skip, not our shard + } + + let hash = hash_key(&gid); + + // Check if exists in this shard + if let Some((_, value)) = shard_guard.get(hash, |(g, _)| g == &gid) { + let v = value.get(); + update_fn(&v, idx, col_id); + } else { + let v = new_fn(gid, idx); + + update_fn(&v, idx, col_id); + let data = (gid, SharedValue::new(v)); + shard_guard.insert(hash, data, hasher_fn); + } + } + } + + Ok::<(), LoadError>(()) + })?; + Ok(gid_str_cache) +} diff --git a/raphtory/src/io/arrow/df_loaders/nodes.rs b/raphtory/src/io/arrow/df_loaders/nodes.rs index 14d3ea091b..ac236ddaf5 100644 --- a/raphtory/src/io/arrow/df_loaders/nodes.rs +++ b/raphtory/src/io/arrow/df_loaders/nodes.rs @@ -5,23 +5,34 @@ use crate::{ db::api::view::StaticGraphViewOps, errors::{into_graph_err, GraphError, LoadError}, io::arrow::{ - dataframe::{DFChunk, DFView, SecondaryIndexCol}, - df_loaders::{extract_secondary_index_col, process_shared_properties}, - layer_col::lift_node_type_col, + dataframe::{DFChunk, DFView}, + df_loaders::{ + extract_secondary_index_col, process_shared_properties, + resolve_nodes_and_type_with_cache, GidKey, + }, + layer_col::{lift_node_type_col, LayerCol}, + node_col::NodeCol, 
prop_handler::*, }, prelude::*, }; +use arrow::{array::AsArray, datatypes::UInt64Type}; use itertools::izip; #[cfg(feature = "python")] use kdam::BarExt; -use raphtory_api::core::{ - entities::properties::meta::STATIC_GRAPH_LAYER_ID, storage::timeindex::TimeIndexEntry, +use raphtory_api::{ + atomic_extra::atomic_vid_from_mut_slice, + core::{ + entities::properties::meta::STATIC_GRAPH_LAYER_ID, + storage::{timeindex::TimeIndexEntry, FxDashMap}, + }, }; -use raphtory_core::storage::timeindex::AsTime; +use raphtory_core::{entities::VID, storage::timeindex::AsTime}; use raphtory_storage::mutation::addition_ops::{InternalAdditionOps, SessionAdditionOps}; use rayon::prelude::*; use std::collections::HashMap; +use storage::{api::nodes::NodeSegmentOps, pages::locked::nodes::LockedNodePage, Extension}; +use zip::unstable::write; pub fn load_nodes_from_df< G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps + std::fmt::Debug, @@ -36,6 +47,7 @@ pub fn load_nodes_from_df< node_type: Option<&str>, node_type_col: Option<&str>, graph: &G, + resolve_nodes: bool, ) -> Result<(), GraphError> { if df_view.is_empty() { return Ok(()); @@ -70,9 +82,6 @@ pub fn load_nodes_from_df< let mut pb = build_progress_bar("Loading nodes".to_string(), df_view.num_rows)?; let mut node_col_resolved = vec![]; - let mut node_type_col_resolved = vec![]; - - let mut write_locked_graph = graph.write_lock().map_err(into_graph_err)?; for chunk in df_view.chunks { let df = chunk?; @@ -97,49 +106,39 @@ pub fn load_nodes_from_df< let secondary_index_col = extract_secondary_index_col::(secondary_index_index, &session, &df)?; node_col_resolved.resize_with(df.len(), Default::default); - node_type_col_resolved.resize_with(df.len(), Default::default); - - // TODO: Using parallel iterators results in a 5x speedup, but - // needs to be implemented such that node VID order is preserved. 
- // See: https://github.com/Pometry/pometry-storage/issues/81 - for (gid, resolved, node_type, node_type_resolved) in izip!( - node_col.iter(), - node_col_resolved.iter_mut(), - node_type_col.iter(), - node_type_col_resolved.iter_mut() - ) { - let (vid, res_node_type) = write_locked_graph - .graph() - .resolve_node_and_type(gid.as_node_ref(), node_type) - .map_err(|_| LoadError::FatalError)?; - - *resolved = vid; - *node_type_resolved = res_node_type; - } + let (src_vids, gid_str_cache) = get_or_resolve_node_vids::( + graph, + node_id_index, + &mut node_col_resolved, + resolve_nodes, + &df, + &node_col, + node_type_col, + )?; + + let mut write_locked_graph = graph.write_lock().map_err(into_graph_err)?; let node_stats = write_locked_graph.node_stats().clone(); let update_time = |time: TimeIndexEntry| { let time = time.t(); node_stats.update_time(time); }; - write_locked_graph - .resize_chunks_to_num_nodes(write_locked_graph.graph().internal_num_nodes()); - write_locked_graph .nodes .par_iter_mut() .try_for_each(|shard| { // Zip all columns for iteration. 
- let zip = izip!( - node_col_resolved.iter(), - time_col.iter(), - secondary_index_col.iter(), - node_type_col_resolved.iter(), - node_col.iter() - ); - - for (row, (vid, time, secondary_index, node_type, gid)) in zip.enumerate() { + let zip = izip!(src_vids.iter(), time_col.iter(), secondary_index_col.iter(),); + + // resolve_nodes=false + // assumes we are loading our own graph, via the parquet loaders, + // so previous calls have already stored the node ids and types + if resolve_nodes { + store_node_ids_and_type(&gid_str_cache, shard); + } + + for (row, (vid, time, secondary_index)) in zip.enumerate() { if let Some(mut_node) = shard.resolve_pos(*vid) { let mut writer = shard.writer(); let t = TimeIndexEntry(time, secondary_index); @@ -147,8 +146,6 @@ pub fn load_nodes_from_df< let lsn = 0; update_time(t); - writer - .store_node_id_and_node_type(mut_node, layer_id, gid, *node_type, lsn); let t_props = prop_cols.iter_row(row); let c_props = metadata_cols @@ -178,6 +175,8 @@ pub(crate) fn load_node_props_from_df< node_id: &str, node_type: Option<&str>, node_type_col: Option<&str>, + node_id_col: Option<&str>, // provided by our parquet encoder + node_type_id_col: Option<&str>, // provided by our parquet encoder metadata: &[&str], shared_metadata: Option<&HashMap>, graph: &G, @@ -193,8 +192,15 @@ pub(crate) fn load_node_props_from_df< let node_type_index = node_type_col.map(|node_type_col| df_view.get_index(node_type_col.as_ref())); let node_type_index = node_type_index.transpose()?; + let node_type_ids_col = node_type_id_col + .map(|node_type_id_col| df_view.get_index(node_type_id_col.as_ref())) + .transpose()?; - let node_id_index = df_view.get_index(node_id)?; + let node_id_index = node_id_col + .map(|node_col| df_view.get_index(node_col.as_ref())) + .transpose()?; + + let node_gid_index = df_view.get_index(node_id)?; let session = graph.write_session().map_err(into_graph_err)?; let shared_metadata = process_shared_properties(shared_metadata, |key, dtype| { @@ 
-207,9 +213,7 @@ pub(crate) fn load_node_props_from_df< let mut pb = build_progress_bar("Loading node properties".to_string(), df_view.num_rows)?; let mut node_col_resolved = vec![]; - let mut node_type_col_resolved = vec![]; - - let mut write_locked_graph = graph.write_lock().map_err(into_graph_err)?; + let mut node_type_resolved = vec![]; for chunk in df_view.chunks { let df = chunk?; @@ -220,28 +224,23 @@ pub(crate) fn load_node_props_from_df< .map_err(into_graph_err) })?; let node_type_col = lift_node_type_col(node_type, node_type_index, &df)?; - let node_col = df.node_col(node_id_index)?; - - node_col_resolved.resize_with(df.len(), Default::default); - node_type_col_resolved.resize_with(df.len(), Default::default); - - node_col - .iter() - .zip(node_col_resolved.iter_mut()) - .zip(node_type_col.iter()) - .zip(node_type_col_resolved.iter_mut()) - .try_for_each(|(((gid, resolved), node_type), node_type_resolved)| { - let (vid, res_node_type) = write_locked_graph - .graph() - .resolve_node_and_type(gid.as_node_ref(), node_type) - .map_err(|_| LoadError::FatalError)?; - *resolved = vid; - *node_type_resolved = res_node_type; - Ok::<(), LoadError>(()) - })?; - - write_locked_graph - .resize_chunks_to_num_nodes(write_locked_graph.graph().internal_num_nodes()); + let node_col = df.node_col(node_gid_index)?; + + let (node_col_resolved, node_type_col_resolved) = get_or_resolve_node_vids_no_events::( + graph, + &mut node_col_resolved, + &mut node_type_resolved, + node_type_ids_col, + node_id_index, + &df, + &node_col, + node_type_col, + )?; + + // We assume this is fast enough + let max_id = node_col_resolved.iter().map(|VID(i)| *i).max().map(VID); + let mut write_locked_graph = graph.write_lock().map_err(into_graph_err)?; + write_locked_graph.resize_chunks_to_num_nodes(max_id); write_locked_graph.nodes.iter_mut().try_for_each(|shard| { let mut c_props = vec![]; @@ -259,7 +258,9 @@ pub(crate) fn load_node_props_from_df< c_props.clear(); 
c_props.extend(metadata_cols.iter_row(idx)); c_props.extend_from_slice(&shared_metadata); - writer.update_c_props(mut_node, 0, c_props.drain(..), 0); + if !c_props.is_empty() { + writer.update_c_props(mut_node, 0, c_props.drain(..), 0); + } }; } @@ -271,3 +272,126 @@ pub(crate) fn load_node_props_from_df< } Ok(()) } + +#[allow(clippy::too_many_arguments, clippy::type_complexity)] +fn get_or_resolve_node_vids< + 'a: 'c, + 'b: 'c, + 'c, + G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps, +>( + graph: &G, + src_index: usize, + src_col_resolved: &'a mut Vec, + resolve_nodes: bool, + df: &'b DFChunk, + src_col: &'a NodeCol, + node_type_col: LayerCol<'a>, +) -> Result<(&'c [VID], FxDashMap, (VID, usize)>), GraphError> { + let (src_vids, gid_str_cache) = if resolve_nodes { + src_col_resolved.resize_with(df.len(), Default::default); + + let atomic_src_col = atomic_vid_from_mut_slice(src_col_resolved); + + let gid_str_cache = resolve_nodes_and_type_with_cache::( + graph, + [src_col].as_ref(), + [atomic_src_col].as_ref(), + node_type_col, + )?; + (src_col_resolved.as_slice(), gid_str_cache) + } else { + let srcs = df.chunk[src_index] + .as_primitive_opt::() + .ok_or_else(|| LoadError::InvalidNodeIdType(df.chunk[src_index].data_type().clone()))? 
+ .values() + .as_ref(); + (bytemuck::cast_slice(srcs), FxDashMap::default()) + }; + Ok((src_vids, gid_str_cache)) +} + +#[allow(clippy::too_many_arguments, clippy::type_complexity)] +fn get_or_resolve_node_vids_no_events< + 'a: 'c, + 'b: 'c, + 'c, + G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps, +>( + graph: &G, + node_col_resolved: &'a mut Vec, + node_type_resolved: &'a mut Vec, + node_type_ids_col: Option, + node_id_col: Option, + df: &'b DFChunk, + src_col: &'a NodeCol, + node_type_col: LayerCol<'a>, +) -> Result<(&'c [VID], &'c [usize]), GraphError> { + assert!(!(node_type_ids_col.is_none() ^ node_id_col.is_none())); // both some or both none + if let Some((node_type_index, node_id_col)) = node_type_ids_col.zip(node_id_col) { + let srcs = df.chunk[node_id_col] + .as_primitive_opt::() + .ok_or_else(|| LoadError::InvalidNodeIdType(df.chunk[node_id_col].data_type().clone()))? + .values() + .as_ref(); + + let node_types = df.chunk[node_type_index] + .as_primitive_opt::() + .ok_or_else(|| { + LoadError::InvalidNodeType(df.chunk[node_type_index].data_type().clone()) + })? 
+ .values() + .as_ref(); + + let mut locked_mapper = graph.node_meta().node_type_meta().write(); + + for (row, node_type) in node_types.iter().enumerate() { + if let Some(name) = node_type_col.get(row) { + locked_mapper.set_id(name, *node_type as usize); + } + } + + Ok((bytemuck::cast_slice(srcs), bytemuck::cast_slice(node_types))) + } else { + node_col_resolved.resize_with(df.len(), Default::default); + node_type_resolved.resize_with(df.len(), Default::default); + + let mut locked_mapper = graph.node_meta().node_type_meta().write(); + + let zip = izip!( + src_col.iter(), + node_type_col.iter(), + node_col_resolved.iter_mut(), + node_type_resolved.iter_mut() + ); + + for (gid, node_type, vid, node_type_id) in zip { + if let Some(name) = node_type { + *node_type_id = locked_mapper.get_or_create_id(name).inner(); + } + + let res_vid = graph + .resolve_node(gid.as_node_ref()) + .map_err(|_| LoadError::FatalError)?; + *vid = res_vid.inner(); + } + + Ok((node_col_resolved.as_slice(), node_type_resolved.as_slice())) + } +} + +#[inline(never)] +fn store_node_ids_and_type>( + gid_str_cache: &FxDashMap, (VID, usize)>, + locked_page: &mut LockedNodePage<'_, NS>, +) { + for entry in gid_str_cache.iter() { + let (vid, node_type) = entry.value(); + let GidKey { gid, .. } = entry.key(); + + if let Some(src_pos) = locked_page.resolve_pos(*vid) { + let mut writer = locked_page.writer(); + writer.store_node_id_and_node_type(src_pos, 0, *gid, *node_type, 0); + } + } +} diff --git a/raphtory/src/io/arrow/layer_col.rs b/raphtory/src/io/arrow/layer_col.rs index 05fa5aed1c..e2a4336ced 100644 --- a/raphtory/src/io/arrow/layer_col.rs +++ b/raphtory/src/io/arrow/layer_col.rs @@ -61,7 +61,34 @@ impl<'a> LayerCol<'a> { } } - pub fn resolve( + pub fn get(&self, row: usize) -> Option<&'a str> { + match self { + LayerCol::Name { name, .. 
} => *name, + LayerCol::Utf8 { col } => { + if col.is_valid(row) && row < col.len() { + Some(col.value(row)) + } else { + None + } + } + LayerCol::LargeUtf8 { col } => { + if col.is_valid(row) && row < col.len() { + Some(col.value(row)) + } else { + None + } + } + LayerCol::Utf8View { col } => { + if col.is_valid(row) && row < col.len() { + Some(col.value(row)) + } else { + None + } + } + } + } + + pub fn resolve_layer( self, graph: &(impl AdditionOps + Send + Sync), ) -> Result, GraphError> { diff --git a/raphtory/src/io/parquet_loaders.rs b/raphtory/src/io/parquet_loaders.rs index 1b05526dc8..f67dd525a2 100644 --- a/raphtory/src/io/parquet_loaders.rs +++ b/raphtory/src/io/parquet_loaders.rs @@ -34,6 +34,7 @@ pub fn load_nodes_from_parquet< metadata: &[&str], shared_metadata: Option<&HashMap>, batch_size: Option, + resolve_nodes: bool, ) -> Result<(), GraphError> { let mut cols_to_check = vec![id, time]; @@ -62,6 +63,7 @@ pub fn load_nodes_from_parquet< node_type, node_type_col, graph, + resolve_nodes, ) .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; } @@ -161,6 +163,8 @@ pub fn load_node_props_from_parquet< id: &str, node_type: Option<&str>, node_type_col: Option<&str>, + node_id_col: Option<&str>, // for inner parquet use only + node_type_id_col: Option<&str>, // for inner parquet use only metadata_properties: &[&str], shared_metadata: Option<&HashMap>, batch_size: Option, @@ -181,6 +185,8 @@ pub fn load_node_props_from_parquet< id, node_type, node_type_col, + node_id_col, + node_type_id_col, metadata_properties, shared_metadata, graph, @@ -201,6 +207,7 @@ pub fn load_edge_props_from_parquet, layer_col: Option<&str>, batch_size: Option, + resolve_nodes: bool, ) -> Result<(), GraphError> { let mut cols_to_check = vec![src, dst]; if let Some(ref layer_col) = layer_col { @@ -221,6 +228,7 @@ pub fn load_edge_props_from_parquet>(); - load_nodes_from_parquet( + load_node_props_from_parquet( &graph, - &t_node_path, - TIME_COL, - 
Some(SECONDARY_INDEX_COL), - NODE_VID_COL, + &c_node_path, + NODE_ID_COL, None, Some(TYPE_COL), - &t_prop_columns, - &[], + Some(NODE_VID_COL), + Some(TYPE_ID_COL), + &c_prop_columns, None, batch_size, )?; } - let c_node_path = path.as_ref().join(NODES_C_PATH); + let t_node_path = path.as_ref().join(NODES_T_PATH); - if std::fs::exists(&c_node_path)? { - let exclude = vec![NODE_ID_COL, TYPE_COL]; - let (c_prop_columns, _) = collect_prop_columns(&c_node_path, &exclude)?; - let c_prop_columns = c_prop_columns + if std::fs::exists(&t_node_path)? { + let exclude = vec![NODE_VID_COL, TIME_COL, SECONDARY_INDEX_COL]; + let (t_prop_columns, _) = collect_prop_columns(&t_node_path, &exclude)?; + let t_prop_columns = t_prop_columns .iter() .map(|s| s.as_str()) .collect::>(); - load_node_props_from_parquet( + load_nodes_from_parquet( &graph, - &c_node_path, - NODE_ID_COL, + &t_node_path, + TIME_COL, + Some(SECONDARY_INDEX_COL), + NODE_VID_COL, None, Some(TYPE_COL), - &c_prop_columns, + &t_prop_columns, + &[], None, batch_size, + false, )?; } @@ -594,7 +592,7 @@ fn decode_graph_storage( let c_edge_path = path.as_ref().join(EDGES_C_PATH); if std::fs::exists(&c_edge_path)? 
{ - let exclude = vec![SRC_COL, DST_COL, LAYER_COL]; + let exclude = vec![SRC_COL_ID, DST_COL_ID, LAYER_COL]; let (c_prop_columns, _) = collect_prop_columns(&c_edge_path, &exclude)?; let metadata = c_prop_columns .iter() @@ -611,6 +609,7 @@ fn decode_graph_storage( None, Some(LAYER_COL), batch_size, + false, )?; } diff --git a/raphtory/src/serialise/parquet/model.rs b/raphtory/src/serialise/parquet/model.rs index bd70dc0db5..456ba199de 100644 --- a/raphtory/src/serialise/parquet/model.rs +++ b/raphtory/src/serialise/parquet/model.rs @@ -7,7 +7,7 @@ use crate::{ graph::{edge::EdgeView, node::NodeView}, }, prelude::*, - serialise::parquet::{DST_COL_ID, EDGE_COL_ID, NODE_VID_COL, SRC_COL_ID}, + serialise::parquet::{DST_COL_ID, EDGE_COL_ID, NODE_VID_COL, SRC_COL_ID, TYPE_ID_COL}, }; use arrow::datatypes::DataType; use raphtory_api::core::{ @@ -134,11 +134,9 @@ impl<'a> Serialize for ParquetTNode<'a> { { let mut state = serializer.serialize_map(None)?; - state.serialize_entry(NODE_ID_COL, &ParquetGID(self.node.id()))?; state.serialize_entry(NODE_VID_COL, &self.node.node.0)?; state.serialize_entry(TIME_COL, &self.t.0)?; state.serialize_entry(SECONDARY_INDEX_COL, &self.t.1)?; - state.serialize_entry(TYPE_COL, &self.node.node_type())?; for (name, prop) in self.props.iter() { state.serialize_entry(&self.cols[*name], &SerdeProp(prop))?; @@ -162,6 +160,7 @@ impl<'a> Serialize for ParquetCNode<'a> { state.serialize_entry(NODE_ID_COL, &ParquetGID(self.node.id()))?; state.serialize_entry(NODE_VID_COL, &self.node.node.0)?; state.serialize_entry(TYPE_COL, &self.node.node_type())?; + state.serialize_entry(TYPE_ID_COL, &self.node.node_type_id())?; for (name, prop) in self.node.metadata().iter_filtered() { state.serialize_entry(&name, &SerdeProp(&prop))?; diff --git a/raphtory/src/serialise/parquet/nodes.rs b/raphtory/src/serialise/parquet/nodes.rs index 75784d0b02..236e9ae0c7 100644 --- a/raphtory/src/serialise/parquet/nodes.rs +++ b/raphtory/src/serialise/parquet/nodes.rs @@ -5,7 
+5,7 @@ use crate::{ serialise::parquet::{ model::{ParquetCNode, ParquetTNode}, run_encode_indexed, NODES_C_PATH, NODES_T_PATH, NODE_ID_COL, NODE_VID_COL, - SECONDARY_INDEX_COL, TIME_COL, TYPE_COL, + SECONDARY_INDEX_COL, TIME_COL, TYPE_COL, TYPE_ID_COL, }, }; use arrow::datatypes::{DataType, Field}; @@ -24,13 +24,11 @@ pub(crate) fn encode_nodes_tprop( g.nodes().row_groups_par_iter(), path, NODES_T_PATH, - |id_type| { + |_| { vec![ - Field::new(NODE_ID_COL, id_type.clone(), false), Field::new(NODE_VID_COL, DataType::UInt64, false), Field::new(TIME_COL, DataType::Int64, false), Field::new(SECONDARY_INDEX_COL, DataType::UInt64, true), - Field::new(TYPE_COL, DataType::Utf8, true), ] }, |nodes, g, decoder, writer| { @@ -89,6 +87,7 @@ pub(crate) fn encode_nodes_cprop( Field::new(NODE_ID_COL, id_type.clone(), false), Field::new(NODE_VID_COL, DataType::UInt64, false), Field::new(TYPE_COL, DataType::Utf8, true), + Field::new(TYPE_ID_COL, DataType::UInt64, true), ] }, |nodes, g, decoder, writer| { From ab7dd3c6d498d310f2a41c2bd50d8206eab56547 Mon Sep 17 00:00:00 2001 From: Fabian Murariu Date: Mon, 15 Dec 2025 16:48:04 +0000 Subject: [PATCH 20/24] fixes post rebase --- db4-graph/src/lib.rs | 4 ++-- raphtory-storage/src/mutation/addition_ops_ext.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/db4-graph/src/lib.rs b/db4-graph/src/lib.rs index ab972ac398..1df44e12c9 100644 --- a/db4-graph/src/lib.rs +++ b/db4-graph/src/lib.rs @@ -21,7 +21,7 @@ use storage::{ layer_counter::GraphStats, locked::{ edges::WriteLockedEdgePages, graph_props::WriteLockedGraphPropPages, - nodes::WriteLockedNodePages, + nodes::WriteLockedNodeSegments, }, }, persist::strategy::{Config, PersistentStrategy}, @@ -374,7 +374,7 @@ pub struct WriteLockedGraph<'a, EXT> where EXT: PersistentStrategy, ES = ES, GS = GS>, { - pub nodes: WriteLockedNodePages<'a, storage::NS>, + pub nodes: WriteLockedNodeSegments<'a, storage::NS>, pub edges: WriteLockedEdgePages<'a, storage::ES>, pub 
graph_props: WriteLockedGraphPropPages<'a, storage::GS>, pub graph: &'a TemporalGraph, diff --git a/raphtory-storage/src/mutation/addition_ops_ext.rs b/raphtory-storage/src/mutation/addition_ops_ext.rs index db63cda054..4ca8fd4498 100644 --- a/raphtory-storage/src/mutation/addition_ops_ext.rs +++ b/raphtory-storage/src/mutation/addition_ops_ext.rs @@ -20,10 +20,10 @@ use raphtory_core::{ }; use storage::{ pages::{node_page::writer::node_info_as_props, session::WriteSession}, - persist::strategy::PersistentStrategy, + persist::strategy::{Config, PersistentStrategy}, properties::props_meta_writer::PropsMetaWriter, resolver::GIDResolverOps, - Config, Extension, WalImpl, ES, GS, NS, + Extension, WalImpl, ES, GS, NS, }; pub struct WriteS<'a, EXT: PersistentStrategy, ES = ES, GS = GS>> { From 8f1da086cb192daf6c34af4a9ef9019008681845 Mon Sep 17 00:00:00 2001 From: Fabian Murariu Date: Wed, 17 Dec 2025 12:35:36 +0000 Subject: [PATCH 21/24] simple test gets the num nodes correctly --- db4-storage/src/state.rs | 6 +- .../entities/properties/prop/prop_enum.rs | 3 +- raphtory/src/db/task/mod.rs | 2 +- raphtory/src/errors.rs | 7 +- raphtory/src/io/arrow/dataframe.rs | 4 + .../src/io/arrow/df_loaders/edge_props.rs | 24 ++- raphtory/src/io/arrow/df_loaders/edges.rs | 68 ++++++-- raphtory/src/io/arrow/df_loaders/mod.rs | 25 +-- raphtory/src/io/arrow/df_loaders/nodes.rs | 148 +++++++++++++----- raphtory/src/io/arrow/layer_col.rs | 69 ++++++-- raphtory/src/io/arrow/mod.rs | 12 +- raphtory/src/io/arrow/node_col.rs | 8 + raphtory/src/io/parquet_loaders.rs | 11 +- raphtory/src/python/graph/graph.rs | 9 +- .../src/python/graph/graph_with_deletions.rs | 9 +- .../src/python/graph/io/pandas_loaders.rs | 1 + raphtory/src/serialise/parquet/edges.rs | 1 + raphtory/src/serialise/parquet/mod.rs | 58 +++---- raphtory/src/serialise/parquet/model.rs | 14 +- raphtory/src/serialise/parquet/nodes.rs | 14 +- raphtory/src/serialise/serialise.rs | 11 +- raphtory/tests/df_loaders.rs | 62 ++------ 22 
files changed, 333 insertions(+), 233 deletions(-) diff --git a/db4-storage/src/state.rs b/db4-storage/src/state.rs index 3b0563ab5e..b86bd23697 100644 --- a/db4-storage/src/state.rs +++ b/db4-storage/src/state.rs @@ -1,5 +1,7 @@ -use rayon::iter::plumbing::{Producer, ProducerCallback, UnindexedConsumer, bridge}; -use rayon::prelude::*; +use rayon::{ + iter::plumbing::{Producer, ProducerCallback, UnindexedConsumer, bridge}, + prelude::*, +}; use std::{ ops::{Index, IndexMut}, sync::Arc, diff --git a/raphtory-api/src/core/entities/properties/prop/prop_enum.rs b/raphtory-api/src/core/entities/properties/prop/prop_enum.rs index a076cd4b52..3e6adea4b4 100644 --- a/raphtory-api/src/core/entities/properties/prop/prop_enum.rs +++ b/raphtory-api/src/core/entities/properties/prop/prop_enum.rs @@ -23,10 +23,9 @@ use std::{ }; use thiserror::Error; -use crate::core::entities::properties::prop::prop_array::*; +use crate::core::entities::{properties::prop::prop_array::*, GID}; use arrow_array::{cast::AsArray, ArrayRef, LargeListArray, StructArray}; use arrow_schema::{DataType, Field, FieldRef}; -use crate::core::entities::GID; pub const DECIMAL_MAX: i128 = 99999999999999999999999999999999999999i128; // equivalent to parquet decimal(38, 0) diff --git a/raphtory/src/db/task/mod.rs b/raphtory/src/db/task/mod.rs index 276feb8d57..025a1544ae 100644 --- a/raphtory/src/db/task/mod.rs +++ b/raphtory/src/db/task/mod.rs @@ -89,7 +89,7 @@ mod task_tests { vec![], vec![Job::new(step1)], None, - |egs, _, _, _,_| egs.finalize(&count), + |egs, _, _, _, _| egs.finalize(&count), Some(2), 1, None, diff --git a/raphtory/src/errors.rs b/raphtory/src/errors.rs index f7d790b29b..1b6ef29ab2 100644 --- a/raphtory/src/errors.rs +++ b/raphtory/src/errors.rs @@ -19,9 +19,10 @@ use raphtory_core::{ use raphtory_storage::mutation::MutationError; use std::{ backtrace::Backtrace, + error::Error, fmt::Debug, - io, panic, - panic::Location, + io, + panic::{self, Location}, path::{PathBuf, StripPrefixError}, 
sync::Arc, time::SystemTimeError, @@ -91,8 +92,6 @@ pub enum LoadError { MissingEdgeError(VID, VID), #[error("Node IDs have the wrong type, expected {existing}, got {new}")] NodeIdTypeError { existing: GidType, new: GidType }, - #[error("Fatal load error, graph may be in a dirty state.")] - FatalError, #[error("Arrow error: {0:?}")] Arrow(#[from] ArrowError), } diff --git a/raphtory/src/io/arrow/dataframe.rs b/raphtory/src/io/arrow/dataframe.rs index ecf67d0d9b..3e2e67dbcb 100644 --- a/raphtory/src/io/arrow/dataframe.rs +++ b/raphtory/src/io/arrow/dataframe.rs @@ -178,6 +178,10 @@ impl DFChunk { self.chunk.first().map(|c| c.len()).unwrap_or(0) } + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + pub fn node_col(&self, index: usize) -> Result { lift_node_col(index, self) } diff --git a/raphtory/src/io/arrow/df_loaders/edge_props.rs b/raphtory/src/io/arrow/df_loaders/edge_props.rs index 77ad9e2c42..a2de196f94 100644 --- a/raphtory/src/io/arrow/df_loaders/edge_props.rs +++ b/raphtory/src/io/arrow/df_loaders/edge_props.rs @@ -20,9 +20,8 @@ use bytemuck::checked::cast_slice_mut; use db4_graph::WriteLockedGraph; use itertools::izip; use kdam::BarExt; -use raphtory_api::atomic_extra::atomic_vid_from_mut_slice; use raphtory_api::{ - atomic_extra::atomic_usize_from_mut_slice, + atomic_extra::{atomic_usize_from_mut_slice, atomic_vid_from_mut_slice}, core::{ entities::EID, storage::{dict_mapper::MaybeNew, timeindex::TimeIndexEntry, FxDashMap}, @@ -62,6 +61,7 @@ pub fn load_edges_from_df() + .ok_or_else(|| { + LoadError::InvalidLayerType(df.chunk[idx].data_type().clone()) + }) + .map(|array| array.values().as_ref()) + }) + .transpose()?; + let layer_col_resolved = layer.resolve_layer(layer_id_values, graph)?; let (src_vids, dst_vids, gid_str_cache) = get_or_resolve_node_vids( graph, diff --git a/raphtory/src/io/arrow/df_loaders/edges.rs b/raphtory/src/io/arrow/df_loaders/edges.rs index 27b8d41e68..55e77b777b 100644 --- a/raphtory/src/io/arrow/df_loaders/edges.rs +++ 
b/raphtory/src/io/arrow/df_loaders/edges.rs @@ -18,9 +18,8 @@ use bytemuck::checked::cast_slice_mut; use db4_graph::WriteLockedGraph; use itertools::izip; use kdam::BarExt; -use raphtory_api::atomic_extra::atomic_vid_from_mut_slice; use raphtory_api::{ - atomic_extra::atomic_usize_from_mut_slice, + atomic_extra::{atomic_usize_from_mut_slice, atomic_vid_from_mut_slice}, core::{ entities::EID, storage::{dict_mapper::MaybeNew, timeindex::TimeIndexEntry, FxDashMap}, @@ -53,13 +52,46 @@ pub struct ColumnNames<'a> { pub dst: &'a str, pub edge_id: Option<&'a str>, pub layer_col: Option<&'a str>, + pub layer_id_col: Option<&'a str>, +} + +impl<'a> ColumnNames<'a> { + pub fn new( + time: &'a str, + secondary_index: Option<&'a str>, + + src: &'a str, + dst: &'a str, + + layer_col: Option<&'a str>, + ) -> Self { + Self { + time, + secondary_index, + src, + dst, + layer_col, + edge_id: None, + layer_id_col: None, + } + } + + pub fn with_layer_id_col(mut self, layer_id_col: &'a str) -> Self { + self.layer_id_col = Some(layer_id_col); + self + } + + pub fn with_edge_id_col(mut self, edge_id: &'a str) -> Self { + self.edge_id = Some(edge_id); + self + } } #[allow(clippy::too_many_arguments)] pub fn load_edges_from_df( df_view: DFView> + Send>, column_names: ColumnNames, - resolve_nodes: bool, + resolve_nodes: bool, // this is reserved for internal parquet encoders, this cannot be exposed to users properties: &[&str], metadata: &[&str], shared_metadata: Option<&HashMap>, @@ -77,6 +109,7 @@ pub fn load_edges_from_df() + .ok_or_else(|| { + LoadError::InvalidLayerType(df.chunk[idx].data_type().clone()) + }) + .map(|array| array.values().as_ref()) + }) + .transpose()?; + let layer_col_resolved = layer.resolve_layer(layer_id_values, graph)?; let (src_vids, dst_vids, gid_str_cache) = get_or_resolve_node_vids( graph, @@ -183,7 +229,7 @@ pub fn load_edges_from_df()? 
diff --git a/raphtory/src/io/arrow/df_loaders/mod.rs b/raphtory/src/io/arrow/df_loaders/mod.rs index c2fd5194b4..3080eceef5 100644 --- a/raphtory/src/io/arrow/df_loaders/mod.rs +++ b/raphtory/src/io/arrow/df_loaders/mod.rs @@ -142,14 +142,7 @@ pub(crate) fn load_edges_props_from_df< ) -> Result<(), GraphError> { edge_props::load_edges_from_df( df_view, - ColumnNames { - src, - dst, - layer_col, - time: "", - secondary_index: None, - edge_id: None, - }, + ColumnNames::new("", None, src, dst, layer_col), resolve_nodes, metadata, shared_metadata, @@ -303,9 +296,8 @@ fn resolve_nodes_with_cache<'a, G: StaticGraphViewOps + PropertyAdditionOps + Ad let GidKey { gid, .. } = gid; let vid = graph .resolve_node(gid.as_node_ref()) - .map_err(|_| LoadError::FatalError) - .unwrap(); - (Prop::from(gid), vid) + .map_err(into_graph_err)?; + Ok((Prop::from(gid), vid)) }, ) } @@ -332,9 +324,8 @@ fn resolve_nodes_and_type_with_cache< let GidKey { gid, node_type } = gid; let (vid, node_type) = graph .resolve_node_and_type(gid.as_node_ref(), node_type) - .map_err(|_| LoadError::FatalError) - .unwrap(); - (vid, node_type) + .map_err(into_graph_err)?; + Ok((vid, node_type)) }, ) } @@ -356,7 +347,7 @@ fn resolve_nodes_with_cache_generic<'a, V: Send + Sync>( cols_to_resolve: &[&'a NodeCol], node_type_cols: &[Option>], update_fn: impl Fn(&V, usize, usize) + Send + Sync, - new_fn: impl Fn(GidKey<'a>, usize) -> V + Send + Sync, + new_fn: impl Fn(GidKey<'a>, usize) -> Result + Send + Sync, ) -> Result, V>, GraphError> { assert_eq!(cols_to_resolve.len(), node_type_cols.len()); let gid_str_cache: dashmap::DashMap, V, _> = FxDashMap::default(); @@ -395,7 +386,7 @@ fn resolve_nodes_with_cache_generic<'a, V: Send + Sync>( let v = value.get(); update_fn(&v, idx, col_id); } else { - let v = new_fn(gid, idx); + let v = new_fn(gid, idx)?; update_fn(&v, idx, col_id); let data = (gid, SharedValue::new(v)); @@ -404,7 +395,7 @@ fn resolve_nodes_with_cache_generic<'a, V: Send + Sync>( } } - Ok::<(), 
LoadError>(()) + Ok::<(), GraphError>(()) })?; Ok(gid_str_cache) } diff --git a/raphtory/src/io/arrow/df_loaders/nodes.rs b/raphtory/src/io/arrow/df_loaders/nodes.rs index ac236ddaf5..f7375fcd64 100644 --- a/raphtory/src/io/arrow/df_loaders/nodes.rs +++ b/raphtory/src/io/arrow/df_loaders/nodes.rs @@ -32,7 +32,6 @@ use raphtory_storage::mutation::addition_ops::{InternalAdditionOps, SessionAddit use rayon::prelude::*; use std::collections::HashMap; use storage::{api::nodes::NodeSegmentOps, pages::locked::nodes::LockedNodePage, Extension}; -use zip::unstable::write; pub fn load_nodes_from_df< G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps + std::fmt::Debug, @@ -217,6 +216,9 @@ pub(crate) fn load_node_props_from_df< for chunk in df_view.chunks { let df = chunk?; + if df.is_empty() { + continue; + } let metadata_cols = combine_properties_arrow(metadata, &metadata_indices, &df, |key, dtype| { session @@ -228,6 +230,7 @@ pub(crate) fn load_node_props_from_df< let (node_col_resolved, node_type_col_resolved) = get_or_resolve_node_vids_no_events::( graph, + &session, &mut node_col_resolved, &mut node_type_resolved, node_type_ids_col, @@ -319,6 +322,7 @@ fn get_or_resolve_node_vids_no_events< G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps, >( graph: &G, + session: &::WS<'_>, node_col_resolved: &'a mut Vec, node_type_resolved: &'a mut Vec, node_type_ids_col: Option, @@ -329,55 +333,115 @@ fn get_or_resolve_node_vids_no_events< ) -> Result<(&'c [VID], &'c [usize]), GraphError> { assert!(!(node_type_ids_col.is_none() ^ node_id_col.is_none())); // both some or both none if let Some((node_type_index, node_id_col)) = node_type_ids_col.zip(node_id_col) { - let srcs = df.chunk[node_id_col] - .as_primitive_opt::() - .ok_or_else(|| LoadError::InvalidNodeIdType(df.chunk[node_id_col].data_type().clone()))? 
- .values() - .as_ref(); - - let node_types = df.chunk[node_type_index] - .as_primitive_opt::() - .ok_or_else(|| { - LoadError::InvalidNodeType(df.chunk[node_type_index].data_type().clone()) - })? - .values() - .as_ref(); - - let mut locked_mapper = graph.node_meta().node_type_meta().write(); - - for (row, node_type) in node_types.iter().enumerate() { - if let Some(name) = node_type_col.get(row) { - locked_mapper.set_id(name, *node_type as usize); - } - } - - Ok((bytemuck::cast_slice(srcs), bytemuck::cast_slice(node_types))) + set_meta_for_pre_resolved_nodes_and_node_ids( + graph, + session, + df, + src_col, + node_type_col, + node_type_index, + node_id_col, + ) } else { - node_col_resolved.resize_with(df.len(), Default::default); - node_type_resolved.resize_with(df.len(), Default::default); - - let mut locked_mapper = graph.node_meta().node_type_meta().write(); - - let zip = izip!( - src_col.iter(), - node_type_col.iter(), - node_col_resolved.iter_mut(), - node_type_resolved.iter_mut() - ); + resolve_node_and_meta_for_node_col( + graph, + node_col_resolved, + node_type_resolved, + df, + src_col, + node_type_col, + ) + } +} - for (gid, node_type, vid, node_type_id) in zip { +fn resolve_node_and_meta_for_node_col< + 'a, + G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps, +>( + graph: &G, + node_col_resolved: &'a mut Vec, + node_type_resolved: &'a mut Vec, + df: &DFChunk, + src_col: &NodeCol, + node_type_col: LayerCol<'a>, +) -> Result<(&'a [VID], &'a [usize]), GraphError> { + node_col_resolved.resize_with(df.len(), Default::default); + node_type_resolved.resize_with(df.len(), Default::default); + + let mut locked_mapper = graph.node_meta().node_type_meta().write(); + + let zip = izip!( + src_col.iter(), + node_type_col.iter(), + node_col_resolved.iter_mut(), + node_type_resolved.iter_mut() + ); + + let mut last_node_type: Option<&str> = None; + for (gid, node_type, vid, node_type_id) in zip { + if last_node_type != node_type { if let Some(name) = node_type 
{ *node_type_id = locked_mapper.get_or_create_id(name).inner(); } - - let res_vid = graph - .resolve_node(gid.as_node_ref()) - .map_err(|_| LoadError::FatalError)?; - *vid = res_vid.inner(); } - Ok((node_col_resolved.as_slice(), node_type_resolved.as_slice())) + let res_vid = graph + .resolve_node(gid.as_node_ref()) + .map_err(into_graph_err)?; + *vid = res_vid.inner(); + last_node_type = node_type; + } + + Ok((node_col_resolved.as_slice(), node_type_resolved.as_slice())) +} + +fn set_meta_for_pre_resolved_nodes_and_node_ids< + 'b, + G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps, +>( + graph: &G, + session: &::WS<'_>, + df: &'b DFChunk, + src_col: &NodeCol, + node_type_col: LayerCol<'_>, + node_type_index: usize, + node_id_col: usize, +) -> Result<(&'b [VID], &'b [usize]), GraphError> { + let srcs = df.chunk[node_id_col] + .as_primitive_opt::() + .ok_or_else(|| LoadError::InvalidNodeIdType(df.chunk[node_id_col].data_type().clone()))? + .values() + .as_ref(); + + let node_types = df.chunk[node_type_index] + .as_primitive_opt::() + .ok_or_else(|| LoadError::InvalidNodeType(df.chunk[node_type_index].data_type().clone()))? 
+ .values() + .as_ref(); + + let mut locked_mapper = graph.node_meta().node_type_meta().write(); + + let zip = izip!( + src_col.iter(), + srcs.iter(), + node_type_col.iter(), + node_types.iter() + ); + + let mut last_node_type: Option<&str> = None; + + for (gid, node_id, node_type, node_type_id) in zip { + if last_node_type != node_type { + let node_type_name = node_type.unwrap_or("_default"); + locked_mapper.set_id(node_type_name, *node_type_id as usize); + } + last_node_type = node_type; + session + .set_node(gid, VID(*node_id as usize)) + .map_err(into_graph_err)?; } + + Ok((bytemuck::cast_slice(srcs), bytemuck::cast_slice(node_types))) } #[inline(never)] diff --git a/raphtory/src/io/arrow/layer_col.rs b/raphtory/src/io/arrow/layer_col.rs index e2a4336ced..963cb77691 100644 --- a/raphtory/src/io/arrow/layer_col.rs +++ b/raphtory/src/io/arrow/layer_col.rs @@ -1,3 +1,5 @@ +use std::borrow::Cow; + use crate::{ errors::{into_graph_err, GraphError, LoadError}, io::arrow::dataframe::DFChunk, @@ -88,28 +90,65 @@ impl<'a> LayerCol<'a> { } } - pub fn resolve_layer( + pub fn resolve_layer<'b>( self, + layer_id_col: Option<&'b [u64]>, graph: &(impl AdditionOps + Send + Sync), - ) -> Result, GraphError> { - match self { - LayerCol::Name { name, len } => { + ) -> Result, GraphError> { + match (self, layer_id_col) { + (LayerCol::Name { name, len }, _) => { let layer = graph.resolve_layer(name).map_err(into_graph_err)?.inner(); - Ok(vec![layer; len]) + Ok(Cow::Owned(vec![layer; len])) } - col => { - let iter = col.par_iter(); - let mut res = vec![0usize; iter.len()]; - iter.zip(res.par_iter_mut()) - .try_for_each(|(layer, entry)| { - let layer = graph.resolve_layer(layer).map_err(into_graph_err)?.inner(); - *entry = layer; - Ok::<(), GraphError>(()) - })?; - Ok(res) + (col, None) => { + let mut res = vec![0usize; col.len()]; + let mut last_name = None; + for (row, name) in col.iter().enumerate() { + if last_name == name { + continue; + } + + let layer = 
graph.resolve_layer(name).map_err(into_graph_err)?.inner(); + res[row] = layer; + last_name = name; + } + Ok(Cow::Owned(res)) + } + (col, Some(layer_ids)) => { + let mut last_pair = None; + + let edge_layer_mapper = graph.edge_meta().layer_meta(); + let node_layer_mapper = graph.node_meta().layer_meta(); + + let mut locked_edge_lm = edge_layer_mapper.write(); + let mut locked_node_lm = node_layer_mapper.write(); + + for pair @ (name, id) in col + .iter() + .map(|name| name.unwrap_or("_default")) + .zip(layer_ids) + { + if let Some(last_pair) = last_pair { + if last_pair != pair { + locked_edge_lm.set_id(name, *id as usize); + locked_node_lm.set_id(name, *id as usize); + } + } + last_pair = Some(pair); + } + Ok(Cow::Borrowed(bytemuck::cast_slice(layer_ids))) } } } + + pub fn len(&self) -> usize { + match self { + LayerCol::Name { len, .. } => *len, + LayerCol::Utf8 { col } => col.len(), + LayerCol::LargeUtf8 { col } => col.len(), + LayerCol::Utf8View { col } => col.len(), + } + } } pub(crate) fn lift_layer_col<'a>( diff --git a/raphtory/src/io/arrow/mod.rs b/raphtory/src/io/arrow/mod.rs index 005cf3ed77..b56aef26c2 100644 --- a/raphtory/src/io/arrow/mod.rs +++ b/raphtory/src/io/arrow/mod.rs @@ -11,7 +11,7 @@ mod test { dataframe::{DFChunk, DFView}, df_loaders::{ edges::{load_edges_from_df, ColumnNames}, - *, + nodes::load_nodes_from_df, }, }, prelude::*, @@ -57,14 +57,7 @@ mod test { load_edges_from_df( df, - ColumnNames { - time: "time", - secondary_index, - src: "src", - dst: "dst", - edge_id: None, - layer_col, - }, + ColumnNames::new("time", secondary_index, "src", "dst", layer_col), true, &["prop1", "prop2"], &[], @@ -168,6 +161,7 @@ mod test { Some("node_type"), None, &graph, + true, ) .expect("failed to load nodes from pretend df"); diff --git a/raphtory/src/io/arrow/node_col.rs b/raphtory/src/io/arrow/node_col.rs index 3a4c64ef56..5a07666d8a 100644 --- a/raphtory/src/io/arrow/node_col.rs +++ b/raphtory/src/io/arrow/node_col.rs @@ -228,6 +228,14 @@ impl 
NodeCol { pub fn dtype(&self) -> GidType { self.0.dtype() } + + pub fn len(&self) -> usize { + self.0.len() + } + + pub fn get(&self, i: usize) -> Option> { + self.0.get(i) + } } pub fn lift_node_col(index: usize, df: &DFChunk) -> Result { diff --git a/raphtory/src/io/parquet_loaders.rs b/raphtory/src/io/parquet_loaders.rs index f67dd525a2..31606c6b4b 100644 --- a/raphtory/src/io/parquet_loaders.rs +++ b/raphtory/src/io/parquet_loaders.rs @@ -169,12 +169,13 @@ pub fn load_node_props_from_parquet< shared_metadata: Option<&HashMap>, batch_size: Option, ) -> Result<(), GraphError> { - let mut cols_to_check = vec![id]; - cols_to_check.extend_from_slice(metadata_properties); + let mut cols_to_check = std::iter::once(id) + .chain(node_type_id_col) + .chain(node_type_col) + .chain(node_id_col) + .collect::>(); - if let Some(ref node_type_col) = node_type_col { - cols_to_check.push(node_type_col.as_ref()); - } + cols_to_check.extend_from_slice(metadata_properties); for path in get_parquet_file_paths(parquet_path)? 
{ let df_view = process_parquet_file_to_df(path.as_path(), Some(&cols_to_check), batch_size)?; diff --git a/raphtory/src/python/graph/graph.rs b/raphtory/src/python/graph/graph.rs index 81f0a3d32b..9cbccf4558 100644 --- a/raphtory/src/python/graph/graph.rs +++ b/raphtory/src/python/graph/graph.rs @@ -813,14 +813,7 @@ impl PyGraph { load_edges_from_parquet( &self.graph, parquet_path.as_path(), - ColumnNames { - time, - secondary_index, - src, - dst, - layer_col, - edge_id: None, - }, + ColumnNames::new(time, secondary_index, src, dst, layer_col), true, &properties, &metadata, diff --git a/raphtory/src/python/graph/graph_with_deletions.rs b/raphtory/src/python/graph/graph_with_deletions.rs index e700354423..49b967b9c4 100644 --- a/raphtory/src/python/graph/graph_with_deletions.rs +++ b/raphtory/src/python/graph/graph_with_deletions.rs @@ -757,14 +757,7 @@ impl PyPersistentGraph { load_edges_from_parquet( &self.graph, parquet_path.as_path(), - ColumnNames { - time, - secondary_index, - src, - dst, - layer_col, - edge_id: None, - }, + ColumnNames::new(time, secondary_index, src, dst, layer_col), true, &properties, &metadata, diff --git a/raphtory/src/python/graph/io/pandas_loaders.rs b/raphtory/src/python/graph/io/pandas_loaders.rs index 4b8ab44b9b..40edf70d3c 100644 --- a/raphtory/src/python/graph/io/pandas_loaders.rs +++ b/raphtory/src/python/graph/io/pandas_loaders.rs @@ -109,6 +109,7 @@ pub(crate) fn load_edges_from_pandas< dst, layer_col, edge_id: None, + layer_id_col: None, }, true, properties, diff --git a/raphtory/src/serialise/parquet/edges.rs b/raphtory/src/serialise/parquet/edges.rs index 28052f511c..48b1c44c21 100644 --- a/raphtory/src/serialise/parquet/edges.rs +++ b/raphtory/src/serialise/parquet/edges.rs @@ -30,6 +30,7 @@ pub(crate) fn encode_edge_tprop( Field::new(DST_COL_ID, DataType::UInt64, false), Field::new(EDGE_COL_ID, DataType::UInt64, false), Field::new(LAYER_COL, DataType::Utf8, true), + Field::new(LAYER_ID_COL, DataType::UInt64, true), ] }, 
|edges, g, decoder, writer| { diff --git a/raphtory/src/serialise/parquet/mod.rs b/raphtory/src/serialise/parquet/mod.rs index 2a17e179a6..c096bb5cb0 100644 --- a/raphtory/src/serialise/parquet/mod.rs +++ b/raphtory/src/serialise/parquet/mod.rs @@ -190,12 +190,11 @@ const TYPE_COL: &str = "rap_node_type"; const TYPE_ID_COL: &str = "rap_node_type_id"; const TIME_COL: &str = "rap_time"; const SECONDARY_INDEX_COL: &str = "rap_secondary_index"; -const SRC_COL: &str = "rap_src"; -const DST_COL: &str = "rap_dst"; const SRC_COL_ID: &str = "rap_src_id"; const DST_COL_ID: &str = "rap_dst_id"; const EDGE_COL_ID: &str = "rap_edge_id"; const LAYER_COL: &str = "rap_layer"; +const LAYER_ID_COL: &str = "rap_layer_id"; const EDGES_T_PATH: &str = "edges_t"; const EDGES_D_PATH: &str = "edges_d"; // deletions const EDGES_C_PATH: &str = "edges_c"; @@ -528,7 +527,7 @@ fn decode_graph_storage( Some(SECONDARY_INDEX_COL), NODE_VID_COL, None, - Some(TYPE_COL), + None, &t_prop_columns, &[], None, @@ -556,14 +555,15 @@ fn decode_graph_storage( load_edges_from_parquet( &graph, &t_edge_path, - ColumnNames { - time: TIME_COL, - secondary_index: Some(SECONDARY_INDEX_COL), - src: SRC_COL_ID, - dst: DST_COL_ID, - layer_col: Some(LAYER_COL), - edge_id: None, - }, + ColumnNames::new( + TIME_COL, + Some(SECONDARY_INDEX_COL), + SRC_COL_ID, + DST_COL_ID, + Some(LAYER_COL), + ) + .with_layer_id_col(LAYER_ID_COL) + .with_edge_id_col(EDGE_COL_ID), false, &t_prop_columns, &[], @@ -573,22 +573,6 @@ fn decode_graph_storage( )?; } - let d_edge_path = path.as_ref().join(EDGES_D_PATH); - - if std::fs::exists(&d_edge_path)? { - load_edge_deletions_from_parquet( - graph.core_graph(), - &d_edge_path, - TIME_COL, - Some(SECONDARY_INDEX_COL), - SRC_COL, - DST_COL, - None, - Some(LAYER_COL), - batch_size, - )?; - } - let c_edge_path = path.as_ref().join(EDGES_C_PATH); if std::fs::exists(&c_edge_path)? 
{ @@ -602,8 +586,8 @@ fn decode_graph_storage( load_edge_props_from_parquet( &graph, &c_edge_path, - SRC_COL, - DST_COL, + SRC_COL_ID, + DST_COL_ID, &metadata, None, None, @@ -613,6 +597,22 @@ fn decode_graph_storage( )?; } + let d_edge_path = path.as_ref().join(EDGES_D_PATH); + + if std::fs::exists(&d_edge_path)? { + load_edge_deletions_from_parquet( + graph.core_graph(), + &d_edge_path, + TIME_COL, + Some(SECONDARY_INDEX_COL), + SRC_COL_ID, + DST_COL_ID, + None, + Some(LAYER_COL), + batch_size, + )?; + } + Ok(graph) } diff --git a/raphtory/src/serialise/parquet/model.rs b/raphtory/src/serialise/parquet/model.rs index 456ba199de..b5772767d7 100644 --- a/raphtory/src/serialise/parquet/model.rs +++ b/raphtory/src/serialise/parquet/model.rs @@ -1,13 +1,13 @@ -use super::{ - Prop, DST_COL, LAYER_COL, NODE_ID_COL, SECONDARY_INDEX_COL, SRC_COL, TIME_COL, TYPE_COL, -}; +use super::{Prop, LAYER_COL, NODE_ID_COL, SECONDARY_INDEX_COL, TIME_COL, TYPE_COL}; use crate::{ db::{ api::view::StaticGraphViewOps, graph::{edge::EdgeView, node::NodeView}, }, prelude::*, - serialise::parquet::{DST_COL_ID, EDGE_COL_ID, NODE_VID_COL, SRC_COL_ID, TYPE_ID_COL}, + serialise::parquet::{ + DST_COL_ID, EDGE_COL_ID, LAYER_ID_COL, NODE_VID_COL, SRC_COL_ID, TYPE_ID_COL, + }, }; use arrow::datatypes::DataType; use raphtory_api::core::{ @@ -53,12 +53,18 @@ impl<'a, G: StaticGraphViewOps> Serialize for ParquetTEdge<'a, G> { .layer_name() .map_err(|_| S::Error::custom("Edge has no layer"))?; + let layer_id = edge + .edge + .layer() + .ok_or_else(|| S::Error::custom("Edge has no layer"))?; + state.serialize_entry(TIME_COL, &t.0)?; state.serialize_entry(SECONDARY_INDEX_COL, &t.1)?; state.serialize_entry(SRC_COL_ID, &edge.src().node.0)?; state.serialize_entry(DST_COL_ID, &edge.dst().node.0)?; state.serialize_entry(EDGE_COL_ID, &edge.edge.pid())?; state.serialize_entry(LAYER_COL, &layer)?; + state.serialize_entry(LAYER_ID_COL, &layer_id)?; for (name, prop) in edge.properties().temporal().iter_latest() { 
state.serialize_entry(&name, &SerdeProp(&prop))?; diff --git a/raphtory/src/serialise/parquet/nodes.rs b/raphtory/src/serialise/parquet/nodes.rs index 236e9ae0c7..d15764b19f 100644 --- a/raphtory/src/serialise/parquet/nodes.rs +++ b/raphtory/src/serialise/parquet/nodes.rs @@ -34,7 +34,6 @@ pub(crate) fn encode_nodes_tprop( |nodes, g, decoder, writer| { let row_group_size = 100_000; let nodes = nodes.collect::>(); - dbg!(&nodes); let nodes = nodes.into_iter(); @@ -45,14 +44,11 @@ pub(crate) fn encode_nodes_tprop( .flat_map(move |node| { GenLockedIter::from(node, |node| { node.rows() - .map(|(t, props)| { - dbg!(&t, &props); - ParquetTNode { - node: *node, - cols, - t, - props, - } + .map(|(t, props)| ParquetTNode { + node: *node, + cols, + t, + props, }) .into_dyn_boxed() }) diff --git a/raphtory/src/serialise/serialise.rs b/raphtory/src/serialise/serialise.rs index 88ff739193..4ccb8d60dd 100644 --- a/raphtory/src/serialise/serialise.rs +++ b/raphtory/src/serialise/serialise.rs @@ -93,15 +93,14 @@ impl StableDecode for T { path: impl Into, path_for_decoded_graph: Option<&Path>, ) -> Result { - let graph; let folder: GraphFolder = path.into(); - if folder.is_zip() { - let reader = std::fs::File::open(&folder.get_base_path())?; - graph = Self::decode_parquet_from_zip(reader, path_for_decoded_graph)?; + let graph = if folder.is_zip() { + let reader = std::fs::File::open(folder.get_base_path())?; + Self::decode_parquet_from_zip(reader, path_for_decoded_graph)? } else { - graph = Self::decode_parquet(&folder.get_graph_path(), path_for_decoded_graph)?; - } + Self::decode_parquet(folder.get_graph_path(), path_for_decoded_graph)? 
+ }; #[cfg(feature = "search")] graph.load_index(&folder)?; diff --git a/raphtory/tests/df_loaders.rs b/raphtory/tests/df_loaders.rs index 088c63561c..185ea3d0cb 100644 --- a/raphtory/tests/df_loaders.rs +++ b/raphtory/tests/df_loaders.rs @@ -12,7 +12,7 @@ mod io_tests { dataframe::{DFChunk, DFView}, df_loaders::{ edges::{load_edges_from_df, ColumnNames}, - load_nodes_from_df, + nodes::load_nodes_from_df, }, }, prelude::*, @@ -212,14 +212,7 @@ mod io_tests { let props = ["str_prop", "int_prop"]; let secondary_index = None; load_edges_from_df(df_view, - ColumnNames { - time: "time", - secondary_index, - src: "src", - dst: "dst", - edge_id: None, - layer_col: None, - }, + ColumnNames::new("time", secondary_index, "src", "dst", None), true, &props, &[], None, None, &g).unwrap(); @@ -256,14 +249,7 @@ mod io_tests { load_edges_from_df( df_view, - ColumnNames { - time: "time", - secondary_index, - src: "src", - dst: "dst", - edge_id: None, - layer_col: None, - }, + ColumnNames::new("time", secondary_index, "src", "dst", None), true, &props, &[], @@ -302,8 +288,7 @@ mod io_tests { let df_view = build_df_str(chunk_size, &edges); let g = Graph::new(); let props = ["str_prop", "int_prop"]; - let secondary_index = None; - load_edges_from_df(df_view, ColumnNames {time: "time", secondary_index, src: "src", dst: "dst", edge_id: None, layer_col:None},true, &props, &[], None, None, &g).unwrap(); + load_edges_from_df(df_view, ColumnNames::new("time", None, "src", "dst", None), true, &props, &[], None, None, &g).unwrap(); let g2 = Graph::new(); @@ -323,18 +308,9 @@ mod io_tests { let df_view = build_df_str(1, &edges); let g = Graph::new(); let props = ["str_prop", "int_prop"]; - let secondary_index = None; - load_edges_from_df( df_view, - ColumnNames { - time: "time", - secondary_index, - src: "src", - dst: "dst", - edge_id: None, - layer_col: None, - }, + ColumnNames::new("time", None, "src", "dst", None), true, &props, &[], @@ -372,14 +348,7 @@ mod io_tests { // Load edges from 
DataFrame with secondary_index load_edges_from_df( df_view, - ColumnNames { - time: "time", - secondary_index, - src: "src", - dst: "dst", - edge_id: None, - layer_col: None, - }, + ColumnNames::new("time", secondary_index, "src", "dst", None), true, &props, &[], @@ -437,14 +406,7 @@ mod io_tests { load_edges_from_df( df_view, - ColumnNames { - time: "time", - secondary_index, - src: "src", - dst: "dst", - edge_id: None, - layer_col: None, - }, + ColumnNames::new("time", secondary_index, "src", "dst", None), true, &props, &[], @@ -529,6 +491,7 @@ mod io_tests { None, None, &g, + true, ) .unwrap(); @@ -591,14 +554,7 @@ mod io_tests { let secondary_index = None; load_edges_from_df( df_view, - ColumnNames { - time: "time", - secondary_index, - src: "src", - dst: "dst", - edge_id: None, - layer_col: None, - }, + ColumnNames::new("time", secondary_index, "src", "dst", None), true, &props, &[], From 02f88c6157da39f2c8f9ee35fbd92adb3f4cf00f Mon Sep 17 00:00:00 2001 From: Fabian Murariu Date: Wed, 17 Dec 2025 14:34:07 +0000 Subject: [PATCH 22/24] fix edge count on parquet decoding --- db4-graph/src/lib.rs | 7 ++----- raphtory/src/db/api/view/graph.rs | 9 ++++++++- raphtory/src/io/arrow/dataframe.rs | 7 +++++++ raphtory/src/io/arrow/df_loaders/edges.rs | 16 ++++++++++++++-- 4 files changed, 31 insertions(+), 8 deletions(-) diff --git a/db4-graph/src/lib.rs b/db4-graph/src/lib.rs index 1df44e12c9..1a9f06289b 100644 --- a/db4-graph/src/lib.rs +++ b/db4-graph/src/lib.rs @@ -405,11 +405,8 @@ impl<'a, EXT: PersistentStrategy, ES = ES, GS = GS>> } } - pub fn resize_chunks_to_num_edges(&mut self, num_edges: usize) { - if num_edges == 0 { - return; - } - let (chunks_needed, _) = self.graph.storage.edges().resolve_pos(EID(num_edges - 1)); + pub fn resize_chunks_to_num_edges(&mut self, max_eid: EID) { + let (chunks_needed, _) = self.graph.storage.edges().resolve_pos(max_eid); self.graph.storage().edges().grow(chunks_needed + 1); std::mem::take(&mut self.edges); self.edges = 
self.graph.storage.edges().write_locked(); diff --git a/raphtory/src/db/api/view/graph.rs b/raphtory/src/db/api/view/graph.rs index 343327613d..2dc3145c99 100644 --- a/raphtory/src/db/api/view/graph.rs +++ b/raphtory/src/db/api/view/graph.rs @@ -401,7 +401,14 @@ impl<'graph, G: GraphView + 'graph> GraphViewOps<'graph> for G { Ok::<(), MutationError>(()) })?; - new_storage.resize_chunks_to_num_edges(self.count_edges()); + let mut new_eids = vec![]; + let mut max_eid = 0usize; + for (row, _) in self.edges().iter().enumerate() { + let new_eid = new_storage.graph().storage().edges().reserve_new_eid(row); + new_eids.push(new_eid); + max_eid = new_eid.0.max(max_eid); + } + new_storage.resize_chunks_to_num_edges(EID(max_eid)); for layer_id in &layer_map { new_storage.edges.ensure_layer(*layer_id); diff --git a/raphtory/src/io/arrow/dataframe.rs b/raphtory/src/io/arrow/dataframe.rs index 3e2e67dbcb..68ee14ede1 100644 --- a/raphtory/src/io/arrow/dataframe.rs +++ b/raphtory/src/io/arrow/dataframe.rs @@ -162,6 +162,13 @@ impl SecondaryIndexCol { pub fn max(&self) -> usize { self.iter().max().unwrap_or(0) } + + pub fn len(&self) -> usize { + match self { + SecondaryIndexCol::DataFrame(arr) => arr.len(), + SecondaryIndexCol::Range(range) => range.len(), + } + } } #[derive(Clone, Debug)] diff --git a/raphtory/src/io/arrow/df_loaders/edges.rs b/raphtory/src/io/arrow/df_loaders/edges.rs index 55e77b777b..3de24ecc08 100644 --- a/raphtory/src/io/arrow/df_loaders/edges.rs +++ b/raphtory/src/io/arrow/df_loaders/edges.rs @@ -43,6 +43,7 @@ use storage::{ }, Extension, }; +use zip::unstable::write; #[derive(Debug, Copy, Clone)] pub struct ColumnNames<'a> { @@ -162,6 +163,8 @@ pub fn load_edges_from_df>( let mut t_props: Vec<(usize, Prop)> = vec![]; let mut c_props: Vec<(usize, Prop)> = vec![]; - for (row, (src, dst, time, secondary_index, eid, layer, exists)) in zip.enumerate() { + for item @ (row, (src, dst, time, secondary_index, eid, layer, exists)) in zip.enumerate() { if let 
Some(eid_pos) = shard.resolve_pos(*eid) { let t = TimeIndexEntry(time, secondary_index); let mut writer = shard.writer(); From 0994310a84c22c9114fe80a270f65a99b4eee959 Mon Sep 17 00:00:00 2001 From: Fabian Murariu Date: Thu, 18 Dec 2025 13:17:55 +0000 Subject: [PATCH 23/24] fixes for various counts/comparisons on bulk loading --- db4-storage/src/gen_ts.rs | 24 +++++++++++------------ db4-storage/src/pages/edge_page/writer.rs | 1 + db4-storage/src/segments/edge/segment.rs | 1 + raphtory/src/io/arrow/df_loaders/edges.rs | 2 +- raphtory/src/serialise/graph_folder.rs | 1 + raphtory/src/serialise/parquet/mod.rs | 2 +- 6 files changed, 17 insertions(+), 14 deletions(-) diff --git a/db4-storage/src/gen_ts.rs b/db4-storage/src/gen_ts.rs index be9fdcfe60..70c20ce95f 100644 --- a/db4-storage/src/gen_ts.rs +++ b/db4-storage/src/gen_ts.rs @@ -41,7 +41,7 @@ impl<'a> From<&'a LayerIds> for LayerIter<'a> { pub struct GenericTimeOps<'a, Ref> { range: Option<(TimeIndexEntry, TimeIndexEntry)>, layer_id: LayerIter<'a>, - node: Ref, + item_ref: Ref, } impl<'a, Ref> GenericTimeOps<'a, Ref> { @@ -49,7 +49,7 @@ impl<'a, Ref> GenericTimeOps<'a, Ref> { Self { range: None, layer_id: layer_id.into(), - node, + item_ref: node, } } @@ -57,7 +57,7 @@ impl<'a, Ref> GenericTimeOps<'a, Ref> { Self { range: None, layer_id: layer_id.into(), - node, + item_ref: node, } } } @@ -296,9 +296,9 @@ where { pub fn edge_events(self) -> impl Iterator + Send + Sync + 'a { self.layer_id - .into_iter(self.node.num_layers()) + .into_iter(self.item_ref.num_layers()) .flat_map(move |layer_id| { - self.node + self.item_ref .additions_tc(layer_id, self.range) .map(|t_cell| t_cell.edge_events()) }) @@ -309,9 +309,9 @@ where self, ) -> impl Iterator + Send + Sync + 'a { self.layer_id - .into_iter(self.node.num_layers()) + .into_iter(self.item_ref.num_layers()) .flat_map(|layer_id| { - self.node + self.item_ref .additions_tc(layer_id, self.range) .map(|t_cell| t_cell.edge_events_rev()) }) @@ -323,12 +323,12 @@ impl<'a, 
Ref: WithTimeCells<'a> + 'a> GenericTimeOps<'a, Ref> { pub fn time_cells(self) -> impl Iterator + Send + Sync + 'a { let range = self.range; self.layer_id - .into_iter(self.node.num_layers()) + .into_iter(self.item_ref.num_layers()) .flat_map(move |layer_id| { - self.node.t_props_tc(layer_id, range).chain( - self.node + self.item_ref.t_props_tc(layer_id, range).chain( + self.item_ref .additions_tc(layer_id, range) - .chain(self.node.deletions_tc(layer_id, range)), + .chain(self.item_ref.deletions_tc(layer_id, range)), ) }) } @@ -356,7 +356,7 @@ impl<'a, Ref: WithTimeCells<'a> + 'a> TimeIndexOps<'a> for GenericTimeOps<'a, Re fn range(&self, w: Range) -> Self::RangeType { GenericTimeOps { range: Some((w.start, w.end)), - node: self.node, + item_ref: self.item_ref, layer_id: self.layer_id, } } diff --git a/db4-storage/src/pages/edge_page/writer.rs b/db4-storage/src/pages/edge_page/writer.rs index 0e619bebfd..4fd665444c 100644 --- a/db4-storage/src/pages/edge_page/writer.rs +++ b/db4-storage/src/pages/edge_page/writer.rs @@ -111,6 +111,7 @@ impl<'a, MP: DerefMut + std::fmt::Debug, ES: EdgeSegmen if !exists { self.increment_layer_num_edges(0); self.increment_layer_num_edges(layer_id); + self.page.increment_num_edges(); } self.writer diff --git a/db4-storage/src/segments/edge/segment.rs b/db4-storage/src/segments/edge/segment.rs index 164c9fb05f..312e48caca 100644 --- a/db4-storage/src/segments/edge/segment.rs +++ b/db4-storage/src/segments/edge/segment.rs @@ -374,6 +374,7 @@ impl LockedESegment for ArcLockedSegmentView { &'a self, layer_ids: &'b LayerIds, ) -> impl ParallelIterator> + 'a { + dbg!(layer_ids); match layer_ids { LayerIds::None => Iter4::I(rayon::iter::empty()), LayerIds::All => Iter4::J(self.edge_par_iter_layer(0)), diff --git a/raphtory/src/io/arrow/df_loaders/edges.rs b/raphtory/src/io/arrow/df_loaders/edges.rs index 3de24ecc08..9cf0c56949 100644 --- a/raphtory/src/io/arrow/df_loaders/edges.rs +++ b/raphtory/src/io/arrow/df_loaders/edges.rs @@ -408,7 
+408,7 @@ fn update_edge_properties<'a, ES: EdgeSegmentOps>( let mut t_props: Vec<(usize, Prop)> = vec![]; let mut c_props: Vec<(usize, Prop)> = vec![]; - for item @ (row, (src, dst, time, secondary_index, eid, layer, exists)) in zip.enumerate() { + for (row, (src, dst, time, secondary_index, eid, layer, exists)) in zip.enumerate() { if let Some(eid_pos) = shard.resolve_pos(*eid) { let t = TimeIndexEntry(time, secondary_index); let mut writer = shard.writer(); diff --git a/raphtory/src/serialise/graph_folder.rs b/raphtory/src/serialise/graph_folder.rs index e83f3d3850..9f66ba9451 100644 --- a/raphtory/src/serialise/graph_folder.rs +++ b/raphtory/src/serialise/graph_folder.rs @@ -317,6 +317,7 @@ mod tests { prelude::{AdditionOps, Graph, Prop, StableEncode, NO_PROPS}, }; use raphtory_api::{core::utils::logging::global_info_logger, GraphType}; + use raphtory_storage::core_ops::CoreGraphOps; /// Verify that the metadata is re-created if it does not exist. #[test] diff --git a/raphtory/src/serialise/parquet/mod.rs b/raphtory/src/serialise/parquet/mod.rs index c096bb5cb0..2521ee420b 100644 --- a/raphtory/src/serialise/parquet/mod.rs +++ b/raphtory/src/serialise/parquet/mod.rs @@ -576,7 +576,7 @@ fn decode_graph_storage( let c_edge_path = path.as_ref().join(EDGES_C_PATH); if std::fs::exists(&c_edge_path)? 
{ - let exclude = vec![SRC_COL_ID, DST_COL_ID, LAYER_COL]; + let exclude = vec![SRC_COL_ID, DST_COL_ID, LAYER_COL, EDGE_COL_ID]; let (c_prop_columns, _) = collect_prop_columns(&c_edge_path, &exclude)?; let metadata = c_prop_columns .iter() From b0eb48e2bad431212416eb5e0574a856038ab921 Mon Sep 17 00:00:00 2001 From: Fabian Murariu Date: Thu, 18 Dec 2025 14:49:41 +0000 Subject: [PATCH 24/24] first end-2-end for stable id encoding passes --- db4-storage/src/api/nodes.rs | 5 ++++ db4-storage/src/pages/node_page/writer.rs | 5 ++++ db4-storage/src/pages/node_store.rs | 33 ++++++++++++----------- db4-storage/src/segments/node/segment.rs | 9 +++++++ raphtory/src/io/arrow/df_loaders/edges.rs | 5 ++++ raphtory/src/io/arrow/df_loaders/nodes.rs | 7 +++++ raphtory/src/io/parquet_loaders.rs | 10 +++++-- raphtory/src/serialise/parquet/mod.rs | 7 ++++- 8 files changed, 63 insertions(+), 18 deletions(-) diff --git a/db4-storage/src/api/nodes.rs b/db4-storage/src/api/nodes.rs index f4af23344b..b813cbbe9b 100644 --- a/db4-storage/src/api/nodes.rs +++ b/db4-storage/src/api/nodes.rs @@ -30,6 +30,7 @@ use crate::{ LocalPOS, error::StorageError, gen_ts::LayerIter, + pages::node_store::increment_and_clamp, segments::node::segment::MemNodeSegment, utils::{Iter2, Iter3, Iter4}, }; @@ -119,6 +120,10 @@ pub trait NodeSegmentOps: Send + Sync + std::fmt::Debug + 'static { fn nodes_counter(&self) -> &AtomicU32; + fn increment_num_nodes(&self, max_page_len: u32) { + increment_and_clamp(self.nodes_counter(), max_page_len); + } + fn num_nodes(&self) -> u32 { self.nodes_counter() .load(std::sync::atomic::Ordering::Relaxed) diff --git a/db4-storage/src/pages/node_page/writer.rs b/db4-storage/src/pages/node_page/writer.rs index bb87225a0c..c6312090f6 100644 --- a/db4-storage/src/pages/node_page/writer.rs +++ b/db4-storage/src/pages/node_page/writer.rs @@ -208,6 +208,11 @@ impl<'a, MP: DerefMut + 'a, NS: NodeSegmentOps> NodeWri pub fn update_deletion_time(&mut self, t: T, node: LocalPOS, e_id: ELID, 
lsn: u64) { self.update_timestamp(t, node, e_id, lsn); } + + pub fn increment_seg_num_nodes(&mut self) { + self.page + .increment_num_nodes(self.mut_segment.max_page_len()); + } } pub fn node_info_as_props( diff --git a/db4-storage/src/pages/node_store.rs b/db4-storage/src/pages/node_store.rs index e39601218c..ad1f7e9f2a 100644 --- a/db4-storage/src/pages/node_store.rs +++ b/db4-storage/src/pages/node_store.rs @@ -22,7 +22,7 @@ use rayon::prelude::*; use std::{ collections::HashMap, path::{Path, PathBuf}, - sync::Arc, + sync::{Arc, atomic::AtomicU32}, }; // graph // (nodes|edges) // graph segments // layers // chunks @@ -251,20 +251,7 @@ impl, EXT: Config> NodeStorageInner fn reserve_segment_row(&self, segment: &Arc) -> Option { // TODO: if this becomes a hotspot, we can switch to a fetch_add followed by a fetch_min // this means when we read the counter we need to clamp it to max_page_len so the iterators don't break - segment - .nodes_counter() - .fetch_update( - std::sync::atomic::Ordering::Relaxed, - std::sync::atomic::Ordering::Relaxed, - |current| { - if current < self.max_segment_len() { - Some(current + 1) - } else { - None - } - }, - ) - .ok() + increment_and_clamp(segment.nodes_counter(), self.max_segment_len()) } fn push_new_segment(&self) -> usize { @@ -520,3 +507,19 @@ impl, EXT: Config> NodeStorageInner ) } } + +pub fn increment_and_clamp(counter: &AtomicU32, max_segment_len: u32) -> Option { + counter + .fetch_update( + std::sync::atomic::Ordering::Relaxed, + std::sync::atomic::Ordering::Relaxed, + |current| { + if current < max_segment_len { + Some(current + 1) + } else { + None + } + }, + ) + .ok() +} diff --git a/db4-storage/src/segments/node/segment.rs b/db4-storage/src/segments/node/segment.rs index 72df28f69e..bfffbd58f1 100644 --- a/db4-storage/src/segments/node/segment.rs +++ b/db4-storage/src/segments/node/segment.rs @@ -3,6 +3,7 @@ use crate::{ api::nodes::{LockedNSSegment, NodeSegmentOps}, error::StorageError, loop_lock_write, + 
pages::node_store::increment_and_clamp, persist::strategy::PersistentStrategy, segments::{ HasRow, SegmentContainer, @@ -359,6 +360,10 @@ impl MemNodeSegment { pub fn node_ref(&self, pos: LocalPOS) -> MemNodeRef<'_> { MemNodeRef::new(pos, self) } + + pub fn max_page_len(&self) -> u32 { + self.max_page_len + } } #[derive(Debug)] @@ -546,6 +551,10 @@ impl>> NodeSegmentOps for NodeSegm fn nodes_counter(&self) -> &AtomicU32 { &self.max_num_node } + + fn increment_num_nodes(&self, max_page_len: u32) { + increment_and_clamp(self.nodes_counter(), max_page_len); + } } #[cfg(test)] diff --git a/raphtory/src/io/arrow/df_loaders/edges.rs b/raphtory/src/io/arrow/df_loaders/edges.rs index 9cf0c56949..e53308b5e3 100644 --- a/raphtory/src/io/arrow/df_loaders/edges.rs +++ b/raphtory/src/io/arrow/df_loaders/edges.rs @@ -144,6 +144,11 @@ pub fn load_edges_from_df>(); + cols_to_check.extend_from_slice(properties); cols_to_check.extend_from_slice(metadata); diff --git a/raphtory/src/serialise/parquet/mod.rs b/raphtory/src/serialise/parquet/mod.rs index 2521ee420b..2cd1e29710 100644 --- a/raphtory/src/serialise/parquet/mod.rs +++ b/raphtory/src/serialise/parquet/mod.rs @@ -1,6 +1,9 @@ use crate::{ db::{ - api::{storage::storage::Storage, view::MaterializedGraph}, + api::{ + storage::storage::Storage, + view::{internal::InternalStorageOps, MaterializedGraph}, + }, graph::views::deletion_graph::PersistentGraph, }, errors::GraphError, @@ -545,6 +548,8 @@ fn decode_graph_storage( SRC_COL_ID, DST_COL_ID, LAYER_COL, + LAYER_ID_COL, + EDGE_COL_ID, ]; let (t_prop_columns, _) = collect_prop_columns(&t_edge_path, &exclude)?; let t_prop_columns = t_prop_columns