From 42417485c9cdbf3eea18d3247252c47af5d92099 Mon Sep 17 00:00:00 2001 From: Richard Kiss Date: Mon, 24 Nov 2025 14:24:51 -0800 Subject: [PATCH 1/2] Add interning API. --- fuzz/Cargo.toml | 6 + fuzz/fuzz_targets/intern.rs | 77 +++++++ src/serde/intern.rs | 403 ++++++++++++++++++++++++++++++++++++ src/serde/mod.rs | 9 +- src/serde/test_intern.rs | 118 +++++++++++ 5 files changed, 611 insertions(+), 2 deletions(-) create mode 100644 fuzz/fuzz_targets/intern.rs create mode 100644 src/serde/intern.rs create mode 100644 src/serde/test_intern.rs diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index aea819915..725c77f53 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -121,3 +121,9 @@ name = "canonical-serialization-br" path = "fuzz_targets/canonical_serialization_br.rs" test = false doc = false + +[[bin]] +name = "intern" +path = "fuzz_targets/intern.rs" +test = false +doc = false diff --git a/fuzz/fuzz_targets/intern.rs b/fuzz/fuzz_targets/intern.rs new file mode 100644 index 000000000..fde647c5e --- /dev/null +++ b/fuzz/fuzz_targets/intern.rs @@ -0,0 +1,77 @@ +#![no_main] + +use clvm_fuzzing::make_tree; +use clvmr::allocator::Allocator; +use clvmr::serde::{ObjectCache, intern, node_to_bytes, treehash}; +use libfuzzer_sys::fuzz_target; + +// Fuzzer for the interning functionality +// Verifies that: +// 1. Interning succeeds on valid nodes +// 2. The interned node serializes to the same bytes as the original +// 3. The tree hash is preserved +// 4. 
Interned nodes have fewer or equal unique atoms/pairs (deduplication works) +fuzz_target!(|data: &[u8]| { + let mut unstructured = arbitrary::Unstructured::new(data); + let mut allocator = Allocator::new(); + let (program, _) = make_tree(&mut allocator, &mut unstructured); + + // Serialize the original node + let original_serialized = match node_to_bytes(&allocator, program) { + Ok(b) => b, + Err(_) => return, + }; + + // Compute original tree hash + let mut original_cache = ObjectCache::new(treehash); + let original_tree_hash = match original_cache.get_or_calculate(&allocator, &program, None) { + Some(hash) => *hash, + None => return, + }; + + // Count original atoms and pairs before interning + let original_atoms = allocator.atom_count() + allocator.small_atom_count(); + let original_pairs = allocator.pair_count_no_ghosts(); + + // Create interned version using new API + let tree = match intern(&allocator, program) { + Ok(result) => result, + Err(_) => return, + }; + + // Serialize the interned node + let interned_serialized = match node_to_bytes(&tree.allocator, tree.root) { + Ok(b) => b, + Err(_) => panic!("Interned node should serialize successfully"), + }; + + // The serializations must match + assert_eq!( + original_serialized, interned_serialized, + "Serialized bytes differ after interning" + ); + + // Get stats and verify deduplication + let stats = tree.stats(); + + // Interning should not increase atom/pair counts (deduplication) + assert!( + stats.atom_count as usize <= original_atoms, + "Interning increased atoms: {} -> {}", + original_atoms, + stats.atom_count + ); + assert!( + stats.pair_count as usize <= original_pairs, + "Interning increased pairs: {} -> {}", + original_pairs, + stats.pair_count + ); + + // Verify tree hash is preserved + let interned_tree_hash = tree.tree_hash(); + assert_eq!( + original_tree_hash, interned_tree_hash, + "Tree hash differs after interning" + ); +}); diff --git a/src/serde/intern.rs b/src/serde/intern.rs new file 
mode 100644
index 000000000..b7f256a58
--- /dev/null
+++ b/src/serde/intern.rs
@@ -0,0 +1,403 @@
+//! CLVM tree interning - deduplicate atoms and pairs in a single pass.
+//!
+//! This module provides the core interning functionality for CLVM trees:
+//! - Deduplicate identical atoms and pairs
+//! - Collect unique nodes for cost calculation and serialization
+//! - Compute tree hash efficiently over the interned structure
+
+use std::collections::HashMap;
+
+use crate::allocator::{Allocator, Atom, NodePtr, SExp};
+use crate::error::Result;
+
+use super::bytes32::Bytes32;
+use super::object_cache::{ObjectCache, treehash};
+
+/// Statistics from an interned tree - the building blocks for cost formulas.
+///
+/// These components can be combined in different ways depending on the cost
+/// formula being used. The struct provides helper methods for common formulas.
+#[derive(Debug, Clone, Default, PartialEq, Eq)]
+pub struct InternedStats {
+    /// Number of unique atoms
+    pub atom_count: u64,
+    /// Number of unique pairs
+    pub pair_count: u64,
+    /// Sum of all unique atom byte lengths: Σ(atom_len)
+    pub atom_bytes: u64,
+    /// SHA256 blocks for atoms: Σ(⌈(atom_len + 10) / 64⌉)
+    /// The +10 accounts for: 0x01 prefix (1 byte) + SHA256 padding overhead (9 bytes)
+    pub sha_atom_blocks: u64,
+}
+
+impl InternedStats {
+    /// Total unique nodes (atoms + pairs)
+    #[inline]
+    pub fn node_count(&self) -> u64 {
+        self.atom_count + self.pair_count
+    }
+
+    /// SHA256 blocks for pairs: always 2 per pair.
+    /// Each pair hashes: 0x02 (1) + left_hash (32) + right_hash (32) = 65 bytes
+    /// With padding: 74 bytes → always 2 SHA256 blocks
+    #[inline]
+    pub fn sha_pair_blocks(&self) -> u64 {
+        2 * self.pair_count
+    }
+
+    /// Total SHA256 blocks needed for tree hashing (atom blocks + pair blocks)
+    #[inline]
+    pub fn sha_blocks(&self) -> u64 {
+        self.sha_atom_blocks + self.sha_pair_blocks()
+    }
+
+    /// Total SHA256 invocations needed (one per unique node)
+    #[inline]
+    pub fn sha_invocations(&self) -> u64 {
+        self.atom_count + self.pair_count
+    }
+}
+
+/// Result of interning a CLVM tree.
+///
+/// Contains the deduplicated tree structure and lists of unique nodes,
+/// enabling efficient cost calculation, tree hashing, and serialization.
+#[derive(Debug)]
+pub struct InternedTree {
+    /// Allocator containing only unique (deduplicated) nodes
+    pub allocator: Allocator,
+    /// Root node in the interned allocator
+    pub root: NodePtr,
+    /// All unique atoms, in insertion order
+    pub atoms: Vec<NodePtr>,
+    /// All unique pairs, in post-order (children before parents)
+    pub pairs: Vec<NodePtr>,
+}
+
+impl InternedTree {
+    /// Compute statistics for this interned tree.
+    ///
+    /// This is O(atoms.len()) - it iterates the atom list once to sum byte lengths.
+    pub fn stats(&self) -> InternedStats {
+        let mut stats = InternedStats {
+            atom_count: self.atoms.len() as u64,
+            pair_count: self.pairs.len() as u64,
+            atom_bytes: 0,
+            sha_atom_blocks: 0,
+        };
+
+        for &atom in &self.atoms {
+            let len = self.allocator.atom_len(atom) as u64;
+            stats.atom_bytes += len;
+            // SHA256 blocks: ceil((len + 10) / 64) = (len + 73) / 64
+            stats.sha_atom_blocks += (len + 73) / 64;
+        }
+
+        stats
+    }
+
+    /// Compute SHA256 tree hash for the interned tree.
+    ///
+    /// This is efficient because each unique node is only hashed once,
+    /// and the ObjectCache handles memoization automatically.
+    pub fn tree_hash(&self) -> Bytes32 {
+        let mut cache: ObjectCache<Bytes32> = ObjectCache::new(treehash);
+        *cache
+            .get_or_calculate(&self.allocator, &self.root, None)
+            .expect("treehash should not fail on valid tree")
+    }
+
+    /// Get a mapping from NodePtr to index for serialization.
+    ///
+    /// Returns (atom_to_index, pair_to_index) where:
+    /// - Atom indices are 0, 1, 2, ... (non-negative)
+    /// - Pair indices are -1, -2, -3, ... (negative, 1-based)
+    ///
+    /// This is useful for serialization formats that reference nodes by index.
+    pub fn node_indices(&self) -> (HashMap<NodePtr, i32>, HashMap<NodePtr, i32>) {
+        let mut atom_to_index = HashMap::with_capacity(self.atoms.len());
+        let mut pair_to_index = HashMap::with_capacity(self.pairs.len());
+
+        for (i, &atom) in self.atoms.iter().enumerate() {
+            atom_to_index.insert(atom, i as i32);
+        }
+        for (i, &pair) in self.pairs.iter().enumerate() {
+            pair_to_index.insert(pair, -(i as i32 + 1));
+        }
+
+        (atom_to_index, pair_to_index)
+    }
+}
+
+/// Intern a CLVM tree: deduplicate atoms and pairs in a single pass.
+///
+/// This function traverses the source tree once, building a new allocator
+/// with deduplicated nodes. It tracks:
+/// - Atoms by content (identical byte sequences share one node)
+/// - Pairs by their (left, right) tuple in the interned allocator
+///
+/// The resulting `InternedTree` contains:
+/// - A new allocator with only unique nodes
+/// - The root node in the new allocator
+/// - Lists of unique atoms and pairs for cost/serialization
+///
+/// # Algorithm
+///
+/// Uses an iterative post-order traversal with explicit stack:
+/// 1. Push root to stack
+/// 2. For each node:
+///    - If atom: deduplicate by content, add to atoms list if new
+///    - If pair: wait for children to be processed, then deduplicate by (left, right)
+/// 3. Pairs are naturally collected in post-order (children before parents)
+///
+/// # Errors
+///
+/// Returns an error if allocator limits are exceeded when creating new nodes.
+pub fn intern(allocator: &Allocator, node: NodePtr) -> Result<InternedTree> {
+    let mut new_allocator = Allocator::new();
+    let mut atoms: Vec<NodePtr> = Vec::new();
+    let mut pairs: Vec<NodePtr> = Vec::new();
+
+    // Maps from source allocator to interned allocator
+    let mut node_to_interned: HashMap<NodePtr, NodePtr> = HashMap::new();
+    // Maps atom content to interned NodePtr (for deduplication)
+    let mut atom_to_interned: HashMap<Atom, NodePtr> = HashMap::new();
+    // Maps (left_interned, right_interned) to interned pair NodePtr
+    let mut pair_to_interned: HashMap<(NodePtr, NodePtr), NodePtr> = HashMap::new();
+
+    let mut stack = vec![node];
+
+    while let Some(current) = stack.pop() {
+        // Skip if already processed
+        if node_to_interned.contains_key(&current) {
+            continue;
+        }
+
+        match allocator.sexp(current) {
+            SExp::Atom => {
+                let atom = allocator.atom(current);
+                let interned = if let Some(&existing) = atom_to_interned.get(atom.as_ref()) {
+                    existing
+                } else {
+                    let new_node = new_allocator.new_atom(atom.as_ref())?;
+                    atom_to_interned.insert(atom, new_node);
+                    atoms.push(new_node);
+                    new_node
+                };
+                node_to_interned.insert(current, interned);
+            }
+            SExp::Pair(left, right) => {
+                // Check if children are processed
+                let left_interned = node_to_interned.get(&left);
+                let right_interned = node_to_interned.get(&right);
+
+                match (left_interned, right_interned) {
+                    (Some(&l), Some(&r)) => {
+                        // Both children processed, create or reuse pair
+                        let interned = if let Some(&existing) = pair_to_interned.get(&(l, r)) {
+                            existing
+                        } else {
+                            let new_node = new_allocator.new_pair(l, r)?;
+                            pair_to_interned.insert((l, r), new_node);
+                            pairs.push(new_node);
+                            new_node
+                        };
+                        node_to_interned.insert(current, interned);
+                    }
+                    _ => {
+                        // Need to process children first
+                        stack.push(current);
+                        if right_interned.is_none() {
+                            stack.push(right);
+                        }
+                        if left_interned.is_none() {
+                            stack.push(left);
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    let root = node_to_interned[&node];
+    Ok(InternedTree {
+        allocator: new_allocator,
+        root,
+        atoms,
+        pairs,
+    })
+}
+#[cfg(test)] +mod tests { + use super::*; + use crate::serde::node_from_bytes; + + #[test] + fn test_intern_single_atom() { + let mut allocator = Allocator::new(); + let node = allocator.new_atom(&[1, 2, 3]).unwrap(); + + let tree = intern(&allocator, node).unwrap(); + + assert_eq!(tree.atoms.len(), 1); + assert_eq!(tree.pairs.len(), 0); + assert_eq!(tree.allocator.atom(tree.root).as_ref(), &[1, 2, 3]); + } + + #[test] + fn test_intern_simple_pair() { + let mut allocator = Allocator::new(); + let left = allocator.new_atom(&[1]).unwrap(); + let right = allocator.new_atom(&[2]).unwrap(); + let node = allocator.new_pair(left, right).unwrap(); + + let tree = intern(&allocator, node).unwrap(); + + assert_eq!(tree.atoms.len(), 2); + assert_eq!(tree.pairs.len(), 1); + } + + #[test] + fn test_intern_deduplicates_atoms() { + // Create (A . A) where A has same content + let mut allocator = Allocator::new(); + let a1 = allocator.new_atom(&[42]).unwrap(); + let a2 = allocator.new_atom(&[42]).unwrap(); // Same content, different NodePtr + let node = allocator.new_pair(a1, a2).unwrap(); + + let tree = intern(&allocator, node).unwrap(); + + // Should have only 1 unique atom + assert_eq!(tree.atoms.len(), 1); + assert_eq!(tree.pairs.len(), 1); + } + + #[test] + fn test_intern_deduplicates_pairs() { + // Create ((A . B) . (A . B)) + let mut allocator = Allocator::new(); + let a = allocator.new_atom(&[1]).unwrap(); + let b = allocator.new_atom(&[2]).unwrap(); + let p1 = allocator.new_pair(a, b).unwrap(); + let p2 = allocator.new_pair(a, b).unwrap(); // Same structure, different NodePtr + let node = allocator.new_pair(p1, p2).unwrap(); + + let tree = intern(&allocator, node).unwrap(); + + // Should have 2 atoms, 2 pairs (inner pair deduplicated) + assert_eq!(tree.atoms.len(), 2); + assert_eq!(tree.pairs.len(), 2); // (A . B) and ((A.B) . 
(A.B)) + } + + #[test] + fn test_stats() { + let mut allocator = Allocator::new(); + let a = allocator.new_atom(&[1, 2, 3, 4, 5]).unwrap(); // 5 bytes + let b = allocator.new_atom(&[6, 7, 8]).unwrap(); // 3 bytes + let node = allocator.new_pair(a, b).unwrap(); + + let tree = intern(&allocator, node).unwrap(); + let stats = tree.stats(); + + assert_eq!(stats.atom_count, 2); + assert_eq!(stats.pair_count, 1); + assert_eq!(stats.atom_bytes, 8); + assert_eq!(stats.sha_pair_blocks(), 2); + } + + #[test] + fn test_tree_hash_deterministic() { + let mut alloc1 = Allocator::new(); + let a1 = alloc1.new_atom(&[1, 2, 3]).unwrap(); + let b1 = alloc1.new_atom(&[4, 5, 6]).unwrap(); + let node1 = alloc1.new_pair(a1, b1).unwrap(); + + let mut alloc2 = Allocator::new(); + let a2 = alloc2.new_atom(&[1, 2, 3]).unwrap(); + let b2 = alloc2.new_atom(&[4, 5, 6]).unwrap(); + let node2 = alloc2.new_pair(a2, b2).unwrap(); + + let tree1 = intern(&alloc1, node1).unwrap(); + let tree2 = intern(&alloc2, node2).unwrap(); + + assert_eq!(tree1.tree_hash(), tree2.tree_hash()); + } + + #[test] + fn test_pairs_in_post_order() { + // Create (A . (B . C)) + let mut allocator = Allocator::new(); + let a = allocator.new_atom(&[1]).unwrap(); + let b = allocator.new_atom(&[2]).unwrap(); + let c = allocator.new_atom(&[3]).unwrap(); + let inner = allocator.new_pair(b, c).unwrap(); + let outer = allocator.new_pair(a, inner).unwrap(); + + let tree = intern(&allocator, outer).unwrap(); + + // Post-order: inner pair before outer pair + assert_eq!(tree.pairs.len(), 2); + // The inner pair (B . C) should come before the outer pair (A . (B . C)) + // because children must be processed before parents + + // Verify the ordering: inner pair should be first, outer pair should be second + let inner_pair = tree.pairs[0]; + let outer_pair = tree.pairs[1]; + + // Verify that inner_pair is actually the (B . 
C) pair + match tree.allocator.sexp(inner_pair) { + SExp::Pair(left, right) => { + assert_eq!(tree.allocator.atom(left).as_ref(), &[2]); + assert_eq!(tree.allocator.atom(right).as_ref(), &[3]); + } + _ => panic!("Expected inner_pair to be a pair"), + } + + // Verify that outer_pair is actually the (A . (B . C)) pair + match tree.allocator.sexp(outer_pair) { + SExp::Pair(left, right) => { + assert_eq!(tree.allocator.atom(left).as_ref(), &[1]); + assert_eq!( + right, inner_pair, + "Outer pair's right child should be the inner pair" + ); + } + _ => panic!("Expected outer_pair to be a pair"), + } + } + + #[test] + fn test_stats_values() { + let mut allocator = Allocator::new(); + // 2 atoms (10 bytes total) and 3 pairs + let a = allocator.new_atom(&[1, 2, 3, 4, 5]).unwrap(); + let b = allocator.new_atom(&[6, 7, 8, 9, 10]).unwrap(); + let p1 = allocator.new_pair(a, b).unwrap(); + let p2 = allocator.new_pair(p1, a).unwrap(); + let p3 = allocator.new_pair(p2, b).unwrap(); + + let tree = intern(&allocator, p3).unwrap(); + let stats = tree.stats(); + + assert_eq!(stats.atom_count, 2); + assert_eq!(stats.pair_count, 3); + assert_eq!(stats.atom_bytes, 10); + assert_eq!(stats.node_count(), 5); + assert_eq!(stats.sha_invocations(), 5); + } + + #[test] + fn test_from_serialized_bytes() { + // ff8568656c6c6f85776f726c64 = ("hello" . 
"world") + let bytes = hex::decode("ff8568656c6c6f85776f726c64").unwrap(); + let mut allocator = Allocator::new(); + let node = node_from_bytes(&mut allocator, &bytes).unwrap(); + + let tree = intern(&allocator, node).unwrap(); + let stats = tree.stats(); + + assert_eq!(stats.atom_count, 2); + assert_eq!(stats.pair_count, 1); + assert_eq!(stats.atom_bytes, 10); // "hello" (5) + "world" (5) + } +} diff --git a/src/serde/mod.rs b/src/serde/mod.rs index 361412c42..b18e3d582 100644 --- a/src/serde/mod.rs +++ b/src/serde/mod.rs @@ -1,11 +1,12 @@ mod bitset; -mod bytes32; +pub(crate) mod bytes32; mod de; mod de_br; mod de_tree; mod identity_hash; mod incremental; -mod object_cache; +pub mod intern; +pub(crate) mod object_cache; mod parse_atom; mod path_builder; mod read_cache_lookup; @@ -19,13 +20,17 @@ pub mod write_atom; #[cfg(test)] mod test; +#[cfg(test)] +mod test_intern; pub use bitset::BitSet; +pub use bytes32::Bytes32; pub use de::node_from_bytes; pub use de_br::{node_from_bytes_backrefs, node_from_bytes_backrefs_old}; pub use de_tree::{ParsedTriple, parse_triples}; pub use identity_hash::RandomState; pub use incremental::{Serializer, UndoState}; +pub use intern::{InternedStats, InternedTree, intern}; pub use object_cache::{ObjectCache, serialized_length, treehash}; pub use path_builder::{ChildPos, PathBuilder}; pub use read_cache_lookup::ReadCacheLookup; diff --git a/src/serde/test_intern.rs b/src/serde/test_intern.rs new file mode 100644 index 000000000..db96694d7 --- /dev/null +++ b/src/serde/test_intern.rs @@ -0,0 +1,118 @@ +use crate::allocator::{Allocator, NodePtr}; +use crate::error::Result; +use crate::serde::bytes32::Bytes32; +use crate::serde::intern::intern; +use crate::serde::node_from_bytes_backrefs; +use crate::serde::node_to_bytes; +use crate::serde::object_cache::{ObjectCache, treehash}; + +fn treehash_for_node(allocator: &Allocator, node: NodePtr) -> Bytes32 { + let mut object_cache = ObjectCache::new(treehash); + *object_cache + 
.get_or_calculate(allocator, &node, None) + .unwrap() +} + +/// Helper to convert hex string to bytes +fn hex_to_bytes(hex: &str) -> Vec { + let hex_clean = hex.trim().replace([' ', '\n'], ""); + hex_clean + .chars() + .collect::>() + .chunks(2) + .map(|chunk| { + let s: String = chunk.iter().collect(); + u8::from_str_radix(&s, 16).expect("invalid hex") + }) + .collect() +} + +/// Helper to convert hex string directly to a node +fn hex_to_node(allocator: &mut Allocator, hex: &str) -> Result { + let bytes = hex_to_bytes(hex); + node_from_bytes_backrefs(allocator, &bytes) +} + +/// Helper to deserialize hex and create interned version, returning intern stats +fn test_hex_interning(hex: &str, expected_atoms: usize, expected_pairs: usize) -> Result<()> { + let mut allocator = Allocator::new(); + + // Deserialize from hex + let node = hex_to_node(&mut allocator, hex)?; + + // Create interned version using the new API + let tree = intern(&allocator, node)?; + + // Ensure interned node serializes to same bytes + let original_serialized = node_to_bytes(&allocator, node)?; + let new_serialized = node_to_bytes(&tree.allocator, tree.root)?; + assert_eq!( + original_serialized, new_serialized, + "Serialized bytes do not match after interning." + ); + + // Ensure treehashes match + let original_treehash = treehash_for_node(&allocator, node); + let new_treehash = tree.tree_hash(); + assert_eq!( + original_treehash, new_treehash, + "Treehashes do not match after interning." 
+ ); + + // Verify unique atom and pair counts + assert_eq!( + tree.atoms.len(), + expected_atoms, + "Atom count doesn't match expected.\nGot: {:?}\nExpected: {:?}", + tree.atoms.len(), + expected_atoms + ); + assert_eq!( + tree.pairs.len(), + expected_pairs, + "Pair count doesn't match expected.\nGot: {:?}\nExpected: {:?}", + tree.pairs.len(), + expected_pairs + ); + + Ok(()) +} + +// ============================================================================ +// Hex-based test cases with intern statistics verification +// ============================================================================ + +#[test] +fn test_interning() -> Result<()> { + // Simple atom with value 1: 1 atom, 0 pairs + test_hex_interning("01", 1, 0)?; + + // Atom with value 10: 1 atom, 0 pairs + test_hex_interning("0a", 1, 0)?; + + // Pair of identical atoms (1 . 1): 1 atom (deduplicated), 1 pair + test_hex_interning("ff0101", 1, 1)?; + + // Pair of different atoms (1 . 10): 2 atoms, 1 pair + test_hex_interning("ff010a", 2, 1)?; + + // Nested structure (1 . (1 . 1)): 1 atom (deduplicated), 2 pairs + test_hex_interning("ff01ff0101", 1, 2)?; + + // Nested structure ((42 . 42) . 42): 1 atom (42 deduplicated), 2 pairs + test_hex_interning("ffff2a2a2a", 1, 2)?; + + // Deep nesting: (1 . (2 . (3 . 1))): 3 atoms (1,2,3 with 1 repeated), 3 pairs + test_hex_interning("ff01ff02ff0301", 3, 3)?; + + // Three-element chain: (1 . (2 . (3 . nil))): 4 atoms (1,2,3,nil), 3 pairs + test_hex_interning("ff01ff02ff0300", 4, 3)?; + + // Pair of different atoms at each level: (1 . (2 . (3 . 4))) + test_hex_interning("ff01ff02ff0304", 4, 3)?; + + // Mixed atoms with one repeated: (1 . (2 . (1 . 3))) + test_hex_interning("ff01ff02ff0103", 3, 3)?; + + Ok(()) +} From 69c1161b18dd1110578edcc105054f9a7da2f3cc Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 26 Jan 2026 21:17:34 +0000 Subject: [PATCH 2/2] Initial plan