diff --git a/opencog/opencl/atomspace/gpu-cosine.cl b/opencog/opencl/atomspace/gpu-cosine.cl new file mode 100644 index 0000000..467d936 --- /dev/null +++ b/opencog/opencl/atomspace/gpu-cosine.cl @@ -0,0 +1,380 @@ +/* + * gpu-cosine.cl -- Cosine similarity and candidate generation on GPU + * + * Computes pairwise cosine similarity between words based on their + * section (disjunct) vectors, entirely on GPU-resident data. + * + * Pipeline: + * 1. compute_word_norms — ||v||² for each word + * 2. build_disjunct_chains — reverse index: djh → linked list of sections + * 3. accumulate_dot_products — walk chains, accumulate dot(w1,w2) + * 4. compute_cosines — cosine = dot / (||v1|| × ||v2||) + * 5. filter_candidates — compact output above threshold + * + * No CPU↔GPU data transfer between stages. + * + * Requires capacity macros (set via -D flags): + * DJH_HT_CAPACITY — disjunct reverse index hash table size (power of 2) + * CANDIDATE_CAPACITY — max candidate pairs + * CANDIDATE_HT_CAPACITY — candidate pair hash table size (power of 2) + * + * Appended after gpu-hashtable.cl, gpu-atomspace.cl at load time. + * (gpu-sections.cl optional — only needed for full pipeline tests) + */ + +/* ═══════════════════════════════════════════════════════════════ + * STEP 1: COMPUTE WORD NORMS + * + * One thread per section. Atomically adds count² to the word's + * norm accumulator. After all sections processed: + * word_norm_sq[w] = Σ_d count(w,d)² + * + * where d ranges over all disjuncts for word w. + * ═══════════════════════════════════════════════════════════════ */ + +__kernel void compute_word_norms( + __global const uint* sec_word, + __global const double* sec_count, + __global volatile double* word_norm_sq, + const uint num_sections) +{ + uint tid = get_global_id(0); + if (tid >= num_sections) return; + + double count = sec_count[tid]; + if (count < 0.5) return; + + uint word = sec_word[tid]; + atomic_add_double(&word_norm_sq[word], count * count); +} + +/* ═══════════════════════════════════════════════════════════════ + * STEP 2: BUILD DISJUNCT REVERSE INDEX (linked list chains) + * + * For each unique disjunct_hash, build a linked list of all + * sections that share that disjunct. Uses lock-free atomic + * prepend to a per-disjunct chain. + * + * Data structures: + * djh_ht_keys[DJH_HT_CAPACITY] — disjunct hash values + * djh_ht_values[DJH_HT_CAPACITY] — chain head (section index) + * sec_chain_next[SECTION_CAPACITY] — per-section next pointer + * + * Chain traversal: start at djh_ht_values[slot], follow + * sec_chain_next[] until HT_EMPTY_VALUE (end of chain). + * ═══════════════════════════════════════════════════════════════ */ + +__kernel void build_disjunct_chains( + __global const ulong* sec_disjunct_hash, + __global const double* sec_count, + __global volatile ulong* djh_ht_keys, + __global volatile uint* djh_ht_values, + __global uint* sec_chain_next, + const uint num_sections) +{ + uint tid = get_global_id(0); + if (tid >= num_sections) return; + + /* Skip empty sections */ + if (sec_count[tid] < 0.5) { + sec_chain_next[tid] = HT_EMPTY_VALUE; + return; + } + + ulong djh = sec_disjunct_hash[tid]; + if (djh == HT_EMPTY_KEY) { + sec_chain_next[tid] = HT_EMPTY_VALUE; + return; + } + + ulong cap = DJH_HT_CAPACITY; + ulong mask = cap - 1; + ulong slot = ht_hash(djh) & mask; + + for (uint probe = 0; probe < HT_MAX_PROBES; probe++) + { + ulong prev = atom_cmpxchg(&djh_ht_keys[slot], HT_EMPTY_KEY, djh); + + if (prev == HT_EMPTY_KEY || prev == djh) + { + /* Atomically prepend this section to the chain. + * atomic_xchg returns the old head (initially HT_EMPTY_VALUE + * for a fresh slot, which serves as the end-of-chain sentinel). + * Lock-free: concurrent prepends produce a valid chain. */ + uint old_head = atomic_xchg(&djh_ht_values[slot], tid); + sec_chain_next[tid] = old_head; + return; + } + + slot = (slot + 1) & mask; + } + + /* Hash table full — orphan this section */ + sec_chain_next[tid] = HT_EMPTY_VALUE; +} + +/* ═══════════════════════════════════════════════════════════════ + * STEP 3: ACCUMULATE DOT PRODUCTS + * + * One thread per section. For each section (word_i, djh_i, count_i): + * - Look up djh_i in the disjunct reverse index + * - Pre-count chain length; skip if > MAX_CHAIN_LEN + * (very common disjuncts are uninformative, like stopwords) + * - Walk the chain of sections sharing this disjunct + * - For each other section (word_j, djh_i, count_j) where word_j != word_i: + * - Find-or-create candidate pair (word_i, word_j) + * - Atomically add count_i × count_j to the candidate's dot product + * + * Uses canonical ordering (word_i < word_j) to avoid double-counting: + * each shared disjunct contributes exactly once to each pair's dot product. + * + * Candidate pairs are stored in a hash table + pool (same pattern + * as atom pools from gpu-atomspace.cl). + * ═══════════════════════════════════════════════════════════════ */ + +#ifndef MAX_CHAIN_LEN +#define MAX_CHAIN_LEN 200 +#endif + +__kernel void accumulate_dot_products( + /* Section data */ + __global const uint* sec_word, + __global const ulong* sec_disjunct_hash, + __global const double* sec_count, + /* Disjunct reverse index */ + __global const ulong* djh_ht_keys, + __global const uint* djh_ht_values, + __global const uint* sec_chain_next, + /* Candidate hash table */ + __global volatile ulong* cand_ht_keys, + __global volatile uint* cand_ht_values, + /* Candidate pool SoA */ + __global uint* cand_word_a, + __global uint* cand_word_b, + __global volatile double* cand_dot, + __global volatile uint* cand_next_free, + /* Size */ + const uint num_sections) +{ + uint tid = get_global_id(0); + if (tid >= num_sections) return; + + uint my_word = sec_word[tid]; + double my_count = sec_count[tid]; + if (my_count < 0.5) return; + + ulong my_djh = sec_disjunct_hash[tid]; + if (my_djh == HT_EMPTY_KEY) return; + + /* ── Look up disjunct in reverse index ── */ + + ulong cap = DJH_HT_CAPACITY; + ulong mask = cap - 1; + ulong slot = ht_hash(my_djh) & mask; + uint chain_head = HT_EMPTY_VALUE; + + for (uint probe = 0; probe < HT_MAX_PROBES; probe++) + { + ulong k = djh_ht_keys[slot]; + if (k == my_djh) { + chain_head = djh_ht_values[slot]; + break; + } + if (k == HT_EMPTY_KEY) break; + slot = (slot + 1) & mask; + } + + if (chain_head == HT_EMPTY_VALUE) return; + + /* ── Pre-count chain length — skip overly common disjuncts ── */ + /* Disjuncts appearing in > MAX_CHAIN_LEN sections are too common + * to be discriminative (like stopwords in information retrieval). + * Skipping them entirely avoids O(K²) explosion for popular chains + * and matches the CPU reverse-index filter. */ + + uint chain_len = 0; + uint pre = chain_head; + while (pre != HT_EMPTY_VALUE && chain_len <= MAX_CHAIN_LEN) + { + chain_len++; + pre = sec_chain_next[pre]; + } + if (chain_len > MAX_CHAIN_LEN) return; + + /* ── Walk chain, accumulate dot products ── */ + + uint cur = chain_head; + + while (cur != HT_EMPTY_VALUE) + { + uint other_word = sec_word[cur]; + double other_count = sec_count[cur]; + + /* Only accumulate once per pair (canonical: my_word < other_word) */ + if (other_word != my_word && my_word < other_word && other_count >= 0.5) + { + /* ── Find or create candidate pair ── */ + + uint lo = my_word; /* already < other_word */ + uint hi = other_word; + ulong cand_key = ((ulong)lo << 32) | (ulong)hi; + + ulong c_cap = CANDIDATE_HT_CAPACITY; + ulong c_mask = c_cap - 1; + ulong c_slot = ht_hash(cand_key) & c_mask; + uint cand_idx = HT_EMPTY_VALUE; + + for (uint p = 0; p < HT_MAX_PROBES; p++) + { + ulong prev = atom_cmpxchg( + &cand_ht_keys[c_slot], HT_EMPTY_KEY, cand_key); + + if (prev == HT_EMPTY_KEY) + { + /* New candidate — allocate from pool */ + uint idx = atomic_add(cand_next_free, 1U); + if (idx < CANDIDATE_CAPACITY) { + cand_word_a[idx] = lo; + cand_word_b[idx] = hi; + cand_dot[idx] = 0.0; + mem_fence(CLK_GLOBAL_MEM_FENCE); + cand_ht_values[c_slot] = idx; + cand_idx = idx; + } + break; + } + if (prev == cand_key) + { + /* Existing candidate — spin for value */ + uint val = cand_ht_values[c_slot]; + while (val == HT_EMPTY_VALUE) { + val = cand_ht_values[c_slot]; + } + cand_idx = val; + break; + } + + c_slot = (c_slot + 1) & c_mask; + } + + if (cand_idx != HT_EMPTY_VALUE) + { + atomic_add_double(&cand_dot[cand_idx], + my_count * other_count); + } + } + + cur = sec_chain_next[cur]; + } +} + +/* ═══════════════════════════════════════════════════════════════ + * STEP 4: COMPUTE COSINES + * + * One thread per candidate. Converts accumulated dot products + * to cosine similarity using pre-computed word norms. + * + * cosine(w1, w2) = dot(w1, w2) / (||w1|| × ||w2||) + * ═══════════════════════════════════════════════════════════════ */ + +/* MIN_NORM_SQ filters rare words: a word with N sections each count=1 + * has norm_sq = N. Setting MIN_NORM_SQ = 5.0 requires >= 5 section types. + * This prevents hapax legomena from getting cosine=1.0 by chance. */ +#ifndef MIN_NORM_SQ +#define MIN_NORM_SQ 50.0 +#endif + +__kernel void compute_cosines( + __global const uint* cand_word_a, + __global const uint* cand_word_b, + __global const double* cand_dot, + __global double* cand_cosine, + __global const double* word_norm_sq, + const uint num_candidates) +{ + uint tid = get_global_id(0); + if (tid >= num_candidates) return; + + uint wa = cand_word_a[tid]; + uint wb = cand_word_b[tid]; + double dot = cand_dot[tid]; + + double na = word_norm_sq[wa]; + double nb = word_norm_sq[wb]; + + /* Skip rare words — not enough data for meaningful cosine */ + if (na < MIN_NORM_SQ || nb < MIN_NORM_SQ) { + cand_cosine[tid] = 0.0; + return; + } + + double denom = sqrt(na) * sqrt(nb); + double cos_val = (denom > 1e-10) ? (dot / denom) : 0.0; + + /* Clamp to valid range — prevents transient GPU state issues */ + if (cos_val > 1.0) cos_val = 1.0; + if (cos_val < -1.0) cos_val = -1.0; + + cand_cosine[tid] = cos_val; +} + +/* ═══════════════════════════════════════════════════════════════ + * STEP 5: FILTER CANDIDATES + * + * Compact candidates above a cosine threshold into contiguous + * output arrays. Returns (word_a, word_b, cosine) for each + * passing pair. + * + * out_count must be initialized to 0 before kernel launch. + * ═══════════════════════════════════════════════════════════════ */ + +__kernel void filter_candidates( + __global const uint* cand_word_a, + __global const uint* cand_word_b, + __global const double* cand_cosine, + const uint num_candidates, + const double threshold, + __global uint* out_word_a, + __global uint* out_word_b, + __global double* out_cosine, + __global volatile uint* out_count, + const uint max_output) +{ + uint tid = get_global_id(0); + if (tid >= num_candidates) return; + + double cos_val = cand_cosine[tid]; + if (cos_val > threshold) + { + uint idx = atomic_add(out_count, 1U); + if (idx < max_output) { + out_word_a[idx] = cand_word_a[tid]; + out_word_b[idx] = cand_word_b[tid]; + out_cosine[idx] = cos_val; + } + } +} + +/* ═══════════════════════════════════════════════════════════════ + * READBACK: Dump candidate data for verification + * ═══════════════════════════════════════════════════════════════ */ + +__kernel void read_candidates( + __global const uint* cand_word_a, + __global const uint* cand_word_b, + __global const double* cand_dot, + __global const double* cand_cosine, + __global uint* out_word_a, + __global uint* out_word_b, + __global double* out_dot, + __global double* out_cosine, + const uint num_candidates) +{ + uint tid = get_global_id(0); + if (tid >= num_candidates) return; + + out_word_a[tid] = cand_word_a[tid]; + out_word_b[tid] = cand_word_b[tid]; + out_dot[tid] = cand_dot[tid]; + out_cosine[tid] = cand_cosine[tid]; +} diff --git a/opencog/opencl/atomspace/gpu-sections.cl b/opencog/opencl/atomspace/gpu-sections.cl new file mode 100644 index 0000000..83641b8 --- /dev/null +++ b/opencog/opencl/atomspace/gpu-sections.cl @@ -0,0 +1,317 @@ +/* + * gpu-sections.cl -- Section extraction from MST/PMFG edges on GPU + * + * Given MST edges for a batch of sentences (as position pairs within + * each sentence), extract sections (word + disjunct) and increment + * SectionPool counts. Disjunct hashing is done entirely on GPU — + * no strings involved. + * + * A section = (word, disjunct), where the disjunct is the set of + * connectors for that word in the parse. A connector = (partner_word, + * direction), where direction is '+' (right) or '-' (left). + * + * On GPU: connector = (word_pool_idx, direction_bit). + * Disjunct = FNV-1a hash of sorted connector sequence. + * + * Appended after gpu-hashtable.cl, gpu-atomspace.cl at load time. + */ + +/* Maximum connectors per word position (degree limit). + * PMFG on a 20-word sentence has at most 54 edges; per word, + * planar graph average degree < 6. 32 is very safe. */ +#define MAX_CONNECTORS 32 + +/* Connector direction bits */ +#define DIR_LEFT 0U /* partner is to the LEFT of this word */ +#define DIR_RIGHT 1U /* partner is to the RIGHT of this word */ + +/* ═══════════════════════════════════════════════════════════════ + * FNV-1a HASH for disjunct encoding + * + * Hash a sorted sequence of (word_pool_idx, direction) pairs + * into a single 64-bit disjunct hash. + * ═══════════════════════════════════════════════════════════════ */ + +inline ulong fnv1a_init(void) +{ + return 0xcbf29ce484222325UL; +} + +inline ulong fnv1a_mix(ulong hash, ulong val) +{ + hash ^= val; + hash *= 0x100000001b3UL; + return hash; +} + +inline ulong hash_disjunct(uint* conn_words, uint* conn_dirs, uint count) +{ + ulong h = fnv1a_init(); + for (uint i = 0; i < count; i++) { + /* Encode: (word_pool_idx << 1) | direction_bit */ + ulong encoded = ((ulong)conn_words[i] << 1) | (ulong)conn_dirs[i]; + h = fnv1a_mix(h, encoded); + } + /* Ensure it's never the hash table sentinel */ + if (h == HT_EMPTY_KEY) h = 0; + return h; +} + +/* ═══════════════════════════════════════════════════════════════ + * INSERTION SORT for connectors + * + * Sort by: direction first (LEFT=0 before RIGHT=1), + * then by word_pool_idx (ascending). + * + * This gives a deterministic disjunct representation: + * all left connectors (sorted by partner) then all right + * connectors (sorted by partner). + * ═══════════════════════════════════════════════════════════════ */ + +inline void sort_connectors(uint* conn_words, uint* conn_dirs, uint count) +{ + /* Insertion sort — count is small (typically < 10) */ + for (uint i = 1; i < count; i++) { + uint w = conn_words[i]; + uint d = conn_dirs[i]; + uint j = i; + while (j > 0) { + /* Compare: direction first, then word index */ + int swap = 0; + if (conn_dirs[j-1] > d) + swap = 1; + else if (conn_dirs[j-1] == d && conn_words[j-1] > w) + swap = 1; + + if (!swap) break; + + conn_words[j] = conn_words[j-1]; + conn_dirs[j] = conn_dirs[j-1]; + j--; + } + conn_words[j] = w; + conn_dirs[j] = d; + } +} + +/* ═══════════════════════════════════════════════════════════════ + * EXTRACT SECTIONS FROM MST EDGES + * + * One thread per word position across all sentences. + * Each thread: + * 1. Finds its sentence (binary search) + * 2. Scans MST edges for this sentence + * 3. Collects connectors (partner word + direction) + * 4. Sorts connectors deterministically + * 5. Hashes sorted sequence → disjunct_hash + * 6. Find-or-create section in SectionPool + * 7. Atomically increments section count + * + * Words with no edges (isolated) produce no section. + * + * Args: + * -- Sentence data -- + * flat_words - word pool indices [total_words] + * sent_offsets - start of each sentence in flat_words [num_sentences] + * sent_lengths - length of each sentence [num_sentences] + * num_sentences - number of sentences + * total_words - total words across all sentences + * + * -- MST edge data -- + * edge_p1 - position 1 within sentence [total_edges] + * edge_p2 - position 2 within sentence [total_edges] + * edge_offsets - start of each sentence's edges [num_sentences] + * edge_counts - number of edges per sentence [num_sentences] + * + * -- Section pool + hash table -- + * sht_keys, sht_values - section hash table + * sec_word - section pool: word indices + * sec_disjunct_hash - section pool: disjunct hashes + * sec_count - section pool: counts + * sec_next_free - section bump allocator + * + * -- Stats -- + * total_sections_created - atomic counter for new sections + * ═══════════════════════════════════════════════════════════════ */ + +__kernel void extract_sections( + /* sentence data */ + __global const uint* flat_words, + __global const uint* sent_offsets, + __global const uint* sent_lengths, + const uint num_sentences, + const uint total_words, + /* MST edge data */ + __global const uint* edge_p1, + __global const uint* edge_p2, + __global const uint* edge_offsets, + __global const uint* edge_counts, + /* section hash table */ + __global volatile ulong* sht_keys, + __global volatile uint* sht_values, + /* section pool SoA */ + __global uint* sec_word, + __global ulong* sec_disjunct_hash, + __global volatile double* sec_count, + __global volatile uint* sec_next_free, + /* stats */ + __global volatile uint* total_sections_created) +{ + uint tid = get_global_id(0); + if (tid >= total_words) return; + + /* ── Find which sentence this word belongs to (binary search) ── */ + + uint lo_s = 0, hi_s = num_sentences; + while (lo_s < hi_s) { + uint mid = (lo_s + hi_s) / 2; + if (sent_offsets[mid] + sent_lengths[mid] <= tid) + lo_s = mid + 1; + else + hi_s = mid; + } + uint sent_idx = lo_s; + if (sent_idx >= num_sentences) return; + + uint sent_start = sent_offsets[sent_idx]; + uint sent_len = sent_lengths[sent_idx]; + uint pos_in_sent = tid - sent_start; + + /* Verify we're in this sentence */ + if (pos_in_sent >= sent_len) return; + + uint my_word = flat_words[tid]; + + /* ── Collect connectors from MST edges ── */ + + uint conn_words[MAX_CONNECTORS]; + uint conn_dirs[MAX_CONNECTORS]; + uint conn_count = 0; + + uint e_start = edge_offsets[sent_idx]; + uint e_count = edge_counts[sent_idx]; + + for (uint e = 0; e < e_count && conn_count < MAX_CONNECTORS; e++) + { + uint p1 = edge_p1[e_start + e]; + uint p2 = edge_p2[e_start + e]; + + if (p1 == pos_in_sent) + { + /* Edge goes from me to p2 → p2 is to the right */ + uint partner_word = flat_words[sent_start + p2]; + conn_words[conn_count] = partner_word; + conn_dirs[conn_count] = (p2 > pos_in_sent) ? DIR_RIGHT : DIR_LEFT; + conn_count++; + } + else if (p2 == pos_in_sent) + { + /* Edge goes from p1 to me → p1 is to the left */ + uint partner_word = flat_words[sent_start + p1]; + conn_words[conn_count] = partner_word; + conn_dirs[conn_count] = (p1 < pos_in_sent) ? DIR_LEFT : DIR_RIGHT; + conn_count++; + } + } + + /* No connectors = isolated word → no section */ + if (conn_count == 0) return; + + /* ── Sort connectors deterministically ── */ + + sort_connectors(conn_words, conn_dirs, conn_count); + + /* ── Hash sorted connectors → disjunct_hash ── */ + + ulong djh = hash_disjunct(conn_words, conn_dirs, conn_count); + + /* ── Find or create section in SectionPool ── */ + + ulong key = section_key(my_word, djh); + + ulong ht_cap = SECTION_HT_CAPACITY; + ulong mask = ht_cap - 1; + ulong slot = ht_hash(key) & mask; + uint sec_idx = HT_EMPTY_VALUE; + + for (uint probe = 0; probe < HT_MAX_PROBES; probe++) + { + ulong prev = atom_cmpxchg(&sht_keys[slot], HT_EMPTY_KEY, key); + + if (prev == HT_EMPTY_KEY) + { + /* New section — allocate from pool */ + uint idx = atomic_add(sec_next_free, 1U); + if (idx < SECTION_CAPACITY) { + sec_word[idx] = my_word; + sec_disjunct_hash[idx] = djh; + sec_count[idx] = 0.0; + mem_fence(CLK_GLOBAL_MEM_FENCE); + sht_values[slot] = idx; + sec_idx = idx; + atomic_add(total_sections_created, 1U); + } + break; + } + if (prev == key) + { + /* Existing section — spin for value */ + uint val = sht_values[slot]; + while (val == HT_EMPTY_VALUE) { + val = sht_values[slot]; + } + sec_idx = val; + break; + } + + slot = (slot + 1) & mask; + } + + if (sec_idx == HT_EMPTY_VALUE) return; + + /* ── Increment section count ── */ + + atomic_add_double(&sec_count[sec_idx], 1.0); +} + +/* ═══════════════════════════════════════════════════════════════ + * READBACK: Dump section data for verification + * + * Reads section pool entries into flat output arrays. + * ═══════════════════════════════════════════════════════════════ */ + +__kernel void read_sections( + __global const uint* sec_word, + __global const ulong* sec_disjunct_hash, + __global const double* sec_count, + __global uint* out_word, + __global ulong* out_disjunct_hash, + __global double* out_count, + const uint num_sections) +{ + uint tid = get_global_id(0); + if (tid >= num_sections) return; + + out_word[tid] = sec_word[tid]; + out_disjunct_hash[tid] = sec_disjunct_hash[tid]; + out_count[tid] = sec_count[tid]; +} + +/* ═══════════════════════════════════════════════════════════════ + * EXTRACT SECTIONS + COUNT PAIRS (combined kernel) + * + * Same as extract_sections but also counts section pairs + * within a window (for Level 1 vocabulary learning). + * + * Each word position produces one section. Adjacent sections + * (within pair_window) form section pairs. Section pair counts + * are stored via the pair pool (reusing pair_find_or_create + * with section indices instead of word indices). + * + * This is Phase 4's full pipeline: edges → sections → section pairs + * all in one kernel launch. + * ═══════════════════════════════════════════════════════════════ */ + +/* Note: Section pair counting is deferred to Phase 5, where the + * section pool is used for cosine similarity. The basic extract_sections + * kernel above is sufficient for Phase 4. */ diff --git a/opencog/opencl/atomspace/test-cosine.c b/opencog/opencl/atomspace/test-cosine.c new file mode 100644 index 0000000..134d0f3 --- /dev/null +++ b/opencog/opencl/atomspace/test-cosine.c @@ -0,0 +1,954 @@ +/* + * test-cosine.c -- Test GPU cosine similarity and candidate generation + * + * Compile: gcc -O2 -o test-cosine test-cosine.c -lOpenCL -lm + * Run: ./test-cosine + * + * Tests: + * 1. Known cosine (2 words, 4 sections, exact verification) + * 2. Three words — all pairwise cosines + * 3. Identical vectors → cosine = 1.0 + * 4. No shared disjuncts → 0 candidates + * 5. Filter candidates above threshold + * 6. Benchmark: 1000 sentences → sections → cosines (full pipeline) + */ + +#include +#include +#include +#include +#include +#include +#include + +/* ─── Pool capacities ─── */ + +#define WORD_CAPACITY (128 * 1024) +#define PAIR_CAPACITY (4 * 1024 * 1024) +#define SECTION_CAPACITY (1024 * 1024) +#define WORD_HT_CAPACITY (256 * 1024) +#define PAIR_HT_CAPACITY (8 * 1024 * 1024) +#define SECTION_HT_CAPACITY (2 * 1024 * 1024) + +/* Phase 5 capacities */ +#define DJH_HT_CAPACITY (2 * 1024 * 1024) +#define CANDIDATE_CAPACITY (512 * 1024) +#define CANDIDATE_HT_CAPACITY (1024 * 1024) + +#define HT_EMPTY_KEY 0xFFFFFFFFFFFFFFFFULL +#define HT_EMPTY_VALUE 0xFFFFFFFFU + +/* ─── Error checking ─── */ + +#define CL_CHECK(err, msg) do { \ + if ((err) != CL_SUCCESS) { \ + fprintf(stderr, "OpenCL error %d at %s:%d: %s\n", \ + (err), __FILE__, __LINE__, (msg)); \ + exit(1); \ + } \ +} while(0) + +/* ─── Read file ─── */ + +char* read_file(const char* path, size_t* len) +{ + FILE* f = fopen(path, "r"); + if (!f) { fprintf(stderr, "Cannot open %s\n", path); exit(1); } + fseek(f, 0, SEEK_END); + *len = ftell(f); + fseek(f, 0, SEEK_SET); + char* buf = malloc(*len + 1); + size_t n = fread(buf, 1, *len, f); + buf[n] = '\0'; + *len = n; + fclose(f); + return buf; +} + +/* ─── Timing ─── */ + +double now_ms(void) +{ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec * 1000.0 + ts.tv_nsec / 1000000.0; +} + +/* ─── Helper: Concatenate sources ─── */ + +char* concat_sources(const char** srcs, size_t* lens, int n, size_t* total) +{ + *total = 0; + for (int i = 0; i < n; i++) *total += lens[i] + 1; + char* buf = malloc(*total + 1); + size_t pos = 0; + for (int i = 0; i < n; i++) { + memcpy(buf + pos, srcs[i], lens[i]); + pos += lens[i]; + buf[pos++] = '\n'; + } + buf[pos] = '\0'; + *total = pos; + return buf; +} + +/* ─── Helper: reset cosine pipeline buffers ─── */ + +void reset_cosine_buffers(cl_command_queue queue, + cl_mem djh_ht_keys, cl_mem djh_ht_values, + cl_mem sec_chain_next, cl_mem word_norm_sq, + cl_mem cand_ht_keys, cl_mem cand_ht_values, + cl_mem cand_dot, cl_mem cand_cosine, + cl_mem cand_next_free) +{ + uint8_t pat_ff = 0xFF; + uint8_t pat_00 = 0x00; + uint32_t zero = 0; + + /* Disjunct reverse index HT */ + clEnqueueFillBuffer(queue, djh_ht_keys, &pat_ff, 1, 0, + sizeof(uint64_t) * DJH_HT_CAPACITY, 0, NULL, NULL); + clEnqueueFillBuffer(queue, djh_ht_values, &pat_ff, 1, 0, + sizeof(uint32_t) * DJH_HT_CAPACITY, 0, NULL, NULL); + + /* Section chain pointers */ + clEnqueueFillBuffer(queue, sec_chain_next, &pat_ff, 1, 0, + sizeof(uint32_t) * SECTION_CAPACITY, 0, NULL, NULL); + + /* Word norms */ + clEnqueueFillBuffer(queue, word_norm_sq, &pat_00, 1, 0, + sizeof(double) * WORD_CAPACITY, 0, NULL, NULL); + + /* Candidate HT */ + clEnqueueFillBuffer(queue, cand_ht_keys, &pat_ff, 1, 0, + sizeof(uint64_t) * CANDIDATE_HT_CAPACITY, 0, NULL, NULL); + clEnqueueFillBuffer(queue, cand_ht_values, &pat_ff, 1, 0, + sizeof(uint32_t) * CANDIDATE_HT_CAPACITY, 0, NULL, NULL); + + /* Candidate pool */ + clEnqueueFillBuffer(queue, cand_dot, &pat_00, 1, 0, + sizeof(double) * CANDIDATE_CAPACITY, 0, NULL, NULL); + clEnqueueFillBuffer(queue, cand_cosine, &pat_00, 1, 0, + sizeof(double) * CANDIDATE_CAPACITY, 0, NULL, NULL); + clEnqueueWriteBuffer(queue, cand_next_free, CL_FALSE, 0, + sizeof(uint32_t), &zero, 0, NULL, NULL); + + clFinish(queue); +} + +/* ─── Helper: reset section pool ─── */ + +void reset_section_pool(cl_command_queue queue, + cl_mem sht_keys, cl_mem sht_values, + cl_mem sec_count, cl_mem sec_next_free) +{ + uint8_t pat_ff = 0xFF; + uint8_t pat_00 = 0x00; + uint32_t zero = 0; + + clEnqueueFillBuffer(queue, sht_keys, &pat_ff, 1, 0, + sizeof(uint64_t) * SECTION_HT_CAPACITY, 0, NULL, NULL); + clEnqueueFillBuffer(queue, sht_values, &pat_ff, 1, 0, + sizeof(uint32_t) * SECTION_HT_CAPACITY, 0, NULL, NULL); + clEnqueueFillBuffer(queue, sec_count, &pat_00, 1, 0, + sizeof(double) * SECTION_CAPACITY, 0, NULL, NULL); + clEnqueueWriteBuffer(queue, sec_next_free, CL_FALSE, 0, + sizeof(uint32_t), &zero, 0, NULL, NULL); + clFinish(queue); +} + +/* ─── Helper: manually populate sections ─── */ + +void upload_sections(cl_command_queue queue, + cl_mem sec_word, cl_mem sec_disjunct_hash, cl_mem sec_count, + cl_mem sec_next_free, + uint32_t* words, uint64_t* djhs, double* counts, uint32_t n) +{ + clEnqueueWriteBuffer(queue, sec_word, CL_FALSE, 0, + sizeof(uint32_t) * n, words, 0, NULL, NULL); + clEnqueueWriteBuffer(queue, sec_disjunct_hash, CL_FALSE, 0, + sizeof(uint64_t) * n, djhs, 0, NULL, NULL); + clEnqueueWriteBuffer(queue, sec_count, CL_FALSE, 0, + sizeof(double) * n, counts, 0, NULL, NULL); + clEnqueueWriteBuffer(queue, sec_next_free, CL_FALSE, 0, + sizeof(uint32_t), &n, 0, NULL, NULL); + clFinish(queue); +} + +/* ─── Helper: run cosine pipeline ─── */ + +void run_cosine_pipeline(cl_command_queue queue, + cl_kernel k_norms, cl_kernel k_chains, + cl_kernel k_dots, cl_kernel k_cosines, + uint32_t num_sections, + cl_mem cand_next_free, uint32_t* out_num_candidates) +{ + size_t local = 256; + size_t gs; + cl_int err; + + /* Step 1: Word norms */ + gs = ((num_sections + local - 1) / local) * local; + err = clEnqueueNDRangeKernel(queue, k_norms, 1, NULL, + &gs, &local, 0, NULL, NULL); + CL_CHECK(err, "enqueue compute_word_norms"); + + /* Step 2: Disjunct chains */ + err = clEnqueueNDRangeKernel(queue, k_chains, 1, NULL, + &gs, &local, 0, NULL, NULL); + CL_CHECK(err, "enqueue build_disjunct_chains"); + + /* Step 3: Dot products */ + err = clEnqueueNDRangeKernel(queue, k_dots, 1, NULL, + &gs, &local, 0, NULL, NULL); + CL_CHECK(err, "enqueue accumulate_dot_products"); + + clFinish(queue); + + /* Read candidate count */ + clEnqueueReadBuffer(queue, cand_next_free, CL_TRUE, 0, + sizeof(uint32_t), out_num_candidates, 0, NULL, NULL); + + if (*out_num_candidates == 0) return; + + /* Step 4: Cosines */ + gs = ((*out_num_candidates + local - 1) / local) * local; + cl_uint nc = *out_num_candidates; + clSetKernelArg(k_cosines, 5, sizeof(cl_uint), &nc); + err = clEnqueueNDRangeKernel(queue, k_cosines, 1, NULL, + &gs, &local, 0, NULL, NULL); + CL_CHECK(err, "enqueue compute_cosines"); + + clFinish(queue); +} + +/* ─── Main ─── */ + +int main(int argc, char** argv) +{ + cl_int err; + int pass_count = 0, fail_count = 0; + + printf("=== GPU Cosine Similarity Test ===\n\n"); + + /* ─── OpenCL setup ─── */ + + cl_platform_id platform; + err = clGetPlatformIDs(1, &platform, NULL); + CL_CHECK(err, "platform"); + + cl_device_id device; + err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); + CL_CHECK(err, "device"); + + char dev_name[256]; + clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(dev_name), dev_name, NULL); + printf("GPU: %s\n", dev_name); + + cl_context ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &err); + CL_CHECK(err, "context"); + + cl_command_queue queue = clCreateCommandQueue(ctx, device, 0, &err); + CL_CHECK(err, "queue"); + + /* ─── Load and concatenate kernel sources ─── */ + + size_t len_ht, len_as, len_sc, len_cos; + char* src_ht = read_file("gpu-hashtable.cl", &len_ht); + char* src_as = read_file("gpu-atomspace.cl", &len_as); + char* src_sc = read_file("gpu-sections.cl", &len_sc); + char* src_cos = read_file("gpu-cosine.cl", &len_cos); + + const char* srcs[] = {src_ht, src_as, src_sc, src_cos}; + size_t lens[] = {len_ht, len_as, len_sc, len_cos}; + size_t total_len; + char* combined = concat_sources(srcs, lens, 4, &total_len); + + cl_program program = clCreateProgramWithSource(ctx, 1, + (const char**)&combined, &total_len, &err); + CL_CHECK(err, "create program"); + + char build_opts[1024]; + snprintf(build_opts, sizeof(build_opts), + "-cl-std=CL1.2 " + "-DWORD_CAPACITY=%d " + "-DPAIR_CAPACITY=%d " + "-DSECTION_CAPACITY=%d " + "-DWORD_HT_CAPACITY=%d " + "-DPAIR_HT_CAPACITY=%d " + "-DSECTION_HT_CAPACITY=%d " + "-DDJH_HT_CAPACITY=%d " + "-DCANDIDATE_CAPACITY=%d " + "-DCANDIDATE_HT_CAPACITY=%d", + WORD_CAPACITY, PAIR_CAPACITY, SECTION_CAPACITY, + WORD_HT_CAPACITY, PAIR_HT_CAPACITY, SECTION_HT_CAPACITY, + DJH_HT_CAPACITY, CANDIDATE_CAPACITY, CANDIDATE_HT_CAPACITY); + + err = clBuildProgram(program, 1, &device, build_opts, NULL, NULL); + if (err != CL_SUCCESS) { + char log[16384]; + clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, + sizeof(log), log, NULL); + fprintf(stderr, "Build error:\n%s\n", log); + return 1; + } + printf("Kernels compiled successfully\n\n"); + + /* ─── Create kernels ─── */ + + cl_kernel k_norms = clCreateKernel(program, "compute_word_norms", &err); + CL_CHECK(err, "kernel compute_word_norms"); + cl_kernel k_chains = clCreateKernel(program, "build_disjunct_chains", &err); + CL_CHECK(err, "kernel build_disjunct_chains"); + cl_kernel k_dots = clCreateKernel(program, "accumulate_dot_products", &err); + CL_CHECK(err, "kernel accumulate_dot_products"); + cl_kernel k_cosines = clCreateKernel(program, "compute_cosines", &err); + CL_CHECK(err, "kernel compute_cosines"); + cl_kernel k_filter = clCreateKernel(program, "filter_candidates", &err); + CL_CHECK(err, "kernel filter_candidates"); + cl_kernel k_extract = clCreateKernel(program, "extract_sections", &err); + CL_CHECK(err, "kernel extract_sections"); + + /* ─── Allocate GPU buffers ─── */ + + printf("Allocating GPU buffers...\n"); + uint32_t zero = 0; + + /* Section pool */ + cl_mem sec_word = clCreateBuffer(ctx, CL_MEM_READ_WRITE, + sizeof(uint32_t) * SECTION_CAPACITY, NULL, &err); + cl_mem sec_disjunct_hash = clCreateBuffer(ctx, CL_MEM_READ_WRITE, + sizeof(uint64_t) * SECTION_CAPACITY, NULL, &err); + cl_mem sec_count = clCreateBuffer(ctx, CL_MEM_READ_WRITE, + sizeof(double) * SECTION_CAPACITY, NULL, &err); + cl_mem sec_next_free = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t), &zero, &err); + + /* Section hash table (for extract_sections) */ + cl_mem sht_keys = clCreateBuffer(ctx, CL_MEM_READ_WRITE, + sizeof(uint64_t) * SECTION_HT_CAPACITY, NULL, &err); + cl_mem sht_values = clCreateBuffer(ctx, CL_MEM_READ_WRITE, + sizeof(uint32_t) * SECTION_HT_CAPACITY, NULL, &err); + + /* Total sections counter (for extract_sections) */ + cl_mem total_sections_created = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t), &zero, &err); + + /* Disjunct reverse index HT */ + cl_mem djh_ht_keys = clCreateBuffer(ctx, CL_MEM_READ_WRITE, + sizeof(uint64_t) * DJH_HT_CAPACITY, NULL, &err); + CL_CHECK(err, "djh_ht_keys"); + cl_mem djh_ht_values = clCreateBuffer(ctx, CL_MEM_READ_WRITE, + sizeof(uint32_t) * DJH_HT_CAPACITY, NULL, &err); + CL_CHECK(err, "djh_ht_values"); + + /* Section chain pointers */ + cl_mem sec_chain_next = clCreateBuffer(ctx, CL_MEM_READ_WRITE, + sizeof(uint32_t) * SECTION_CAPACITY, NULL, &err); + CL_CHECK(err, "sec_chain_next"); + + /* Word norms */ + cl_mem word_norm_sq = clCreateBuffer(ctx, CL_MEM_READ_WRITE, + sizeof(double) * WORD_CAPACITY, NULL, &err); + CL_CHECK(err, "word_norm_sq"); + + /* Candidate HT */ + cl_mem cand_ht_keys = clCreateBuffer(ctx, CL_MEM_READ_WRITE, + sizeof(uint64_t) * CANDIDATE_HT_CAPACITY, NULL, &err); + cl_mem cand_ht_values = clCreateBuffer(ctx, CL_MEM_READ_WRITE, + sizeof(uint32_t) * CANDIDATE_HT_CAPACITY, NULL, &err); + + /* Candidate pool */ + cl_mem cand_word_a = clCreateBuffer(ctx, CL_MEM_READ_WRITE, + sizeof(uint32_t) * CANDIDATE_CAPACITY, NULL, &err); + cl_mem cand_word_b = clCreateBuffer(ctx, CL_MEM_READ_WRITE, + sizeof(uint32_t) * CANDIDATE_CAPACITY, NULL, &err); + cl_mem cand_dot = clCreateBuffer(ctx, CL_MEM_READ_WRITE, + sizeof(double) * CANDIDATE_CAPACITY, NULL, &err); + cl_mem cand_cosine = clCreateBuffer(ctx, CL_MEM_READ_WRITE, + sizeof(double) * CANDIDATE_CAPACITY, NULL, &err); + cl_mem cand_next_free = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t), &zero, &err); + + /* ─── Set kernel args (persistent across tests) ─── */ + + /* compute_word_norms: (sec_word, sec_count, word_norm_sq, num_sections) */ + clSetKernelArg(k_norms, 0, sizeof(cl_mem), &sec_word); + clSetKernelArg(k_norms, 1, sizeof(cl_mem), &sec_count); + clSetKernelArg(k_norms, 2, sizeof(cl_mem), &word_norm_sq); + /* arg 3 = num_sections — set per test */ + + /* build_disjunct_chains: (sec_djh, sec_count, djh_ht_keys, djh_ht_values, + * sec_chain_next, num_sections) */ + clSetKernelArg(k_chains, 0, sizeof(cl_mem), &sec_disjunct_hash); + clSetKernelArg(k_chains, 1, sizeof(cl_mem), &sec_count); + clSetKernelArg(k_chains, 2, sizeof(cl_mem), &djh_ht_keys); + clSetKernelArg(k_chains, 3, sizeof(cl_mem), &djh_ht_values); + clSetKernelArg(k_chains, 4, sizeof(cl_mem), &sec_chain_next); + /* arg 5 = num_sections — set per test */ + + /* accumulate_dot_products: 14 args */ + clSetKernelArg(k_dots, 0, sizeof(cl_mem), &sec_word); + clSetKernelArg(k_dots, 1, sizeof(cl_mem), &sec_disjunct_hash); + clSetKernelArg(k_dots, 2, sizeof(cl_mem), &sec_count); + clSetKernelArg(k_dots, 3, sizeof(cl_mem), &djh_ht_keys); + clSetKernelArg(k_dots, 4, sizeof(cl_mem), &djh_ht_values); + clSetKernelArg(k_dots, 5, sizeof(cl_mem), &sec_chain_next); + clSetKernelArg(k_dots, 6, sizeof(cl_mem), &cand_ht_keys); + clSetKernelArg(k_dots, 7, sizeof(cl_mem), &cand_ht_values); + clSetKernelArg(k_dots, 8, sizeof(cl_mem), &cand_word_a); + clSetKernelArg(k_dots, 9, sizeof(cl_mem), &cand_word_b); + clSetKernelArg(k_dots, 10, sizeof(cl_mem), &cand_dot); + clSetKernelArg(k_dots, 11, sizeof(cl_mem), &cand_next_free); + /* arg 12 = num_sections — set per test */ + + /* compute_cosines: (cand_word_a, cand_word_b, cand_dot, cand_cosine, + * word_norm_sq, num_candidates) */ + clSetKernelArg(k_cosines, 0, sizeof(cl_mem), &cand_word_a); + clSetKernelArg(k_cosines, 1, sizeof(cl_mem), &cand_word_b); + clSetKernelArg(k_cosines, 2, sizeof(cl_mem), &cand_dot); + clSetKernelArg(k_cosines, 3, sizeof(cl_mem), &cand_cosine); + clSetKernelArg(k_cosines, 4, sizeof(cl_mem), &word_norm_sq); + /* arg 5 = num_candidates — set in pipeline */ + + /* filter_candidates */ + clSetKernelArg(k_filter, 0, sizeof(cl_mem), &cand_word_a); + clSetKernelArg(k_filter, 1, sizeof(cl_mem), &cand_word_b); + clSetKernelArg(k_filter, 2, sizeof(cl_mem), &cand_cosine); + /* args 3-8 set per test */ + + printf("GPU buffers ready\n\n"); + + /* ═══════════════════════════════════════════════════════════════ + * TEST 1: Known cosine (2 words, 4 sections) + * + * Section 0: word=0, djh=0x100, count=3.0 + * Section 1: word=0, djh=0x200, count=4.0 + * Section 2: word=1, djh=0x100, count=5.0 + * Section 3: word=1, djh=0x300, count=2.0 + * + * word 0: {0x100: 3, 0x200: 4} → norm² = 25, norm = 5 + * word 1: {0x100: 5, 0x300: 2} → norm² = 29, norm = √29 + * Shared: 0x100 → dot = 3×5 = 15 + * Cosine = 15 / (5 × √29) = 0.5571 + * ═══════════════════════════════════════════════════════════════ */ + + printf("--- Test 1: Known cosine (2 words, 4 sections) ---\n"); + + reset_cosine_buffers(queue, djh_ht_keys, djh_ht_values, + sec_chain_next, word_norm_sq, cand_ht_keys, cand_ht_values, + cand_dot, cand_cosine, cand_next_free); + + { + uint32_t sw[] = {0, 0, 1, 1}; + uint64_t sd[] = {0x100, 0x200, 0x100, 0x300}; + double sc[] = {3.0, 4.0, 5.0, 2.0}; + cl_uint ns = 4; + + upload_sections(queue, sec_word, sec_disjunct_hash, sec_count, + sec_next_free, sw, sd, sc, ns); + + /* Set num_sections for each kernel */ + clSetKernelArg(k_norms, 3, sizeof(cl_uint), &ns); + clSetKernelArg(k_chains, 5, sizeof(cl_uint), &ns); + clSetKernelArg(k_dots, 12, sizeof(cl_uint), &ns); + + uint32_t num_cands = 0; + double t0 = now_ms(); + run_cosine_pipeline(queue, k_norms, k_chains, k_dots, k_cosines, + ns, cand_next_free, &num_cands); + double t1 = now_ms(); + + /* Read results */ + uint32_t h_wa[4], h_wb[4]; + double h_dot[4], h_cos[4]; + if (num_cands > 0) { + clEnqueueReadBuffer(queue, cand_word_a, CL_TRUE, 0, + sizeof(uint32_t) * num_cands, h_wa, 0, NULL, NULL); + clEnqueueReadBuffer(queue, cand_word_b, CL_TRUE, 0, + sizeof(uint32_t) * num_cands, h_wb, 0, NULL, NULL); + clEnqueueReadBuffer(queue, cand_dot, CL_TRUE, 0, + sizeof(double) * num_cands, h_dot, 0, NULL, NULL); + clEnqueueReadBuffer(queue, cand_cosine, CL_TRUE, 0, + sizeof(double) * num_cands, h_cos, 0, NULL, NULL); + } + + /* Expected: 1 candidate pair (0,1) with dot=15.0, cosine≈0.5571 */ + double expected_cos = 15.0 / (5.0 * sqrt(29.0)); + + printf(" Candidates: %u (expected 1)\n", num_cands); + if (num_cands > 0) { + printf(" Pair: (%u, %u) dot=%.1f cosine=%.4f\n", + h_wa[0], h_wb[0], h_dot[0], h_cos[0]); + printf(" Expected: (0, 1) dot=15.0 cosine=%.4f\n", expected_cos); + } + printf(" Time: %.2f ms\n", t1 - t0); + + int pass = (num_cands == 1) && + (h_wa[0] == 0) && (h_wb[0] == 1) && + (fabs(h_dot[0] - 15.0) < 0.01) && + (fabs(h_cos[0] - expected_cos) < 0.001); + printf(" %s\n\n", pass ? "PASS" : "FAIL"); + if (pass) pass_count++; else fail_count++; + } + + /* ═══════════════════════════════════════════════════════════════ + * TEST 2: Three words — all pairwise cosines + * + * word 0: {X=0x10: 1, Y=0x20: 2} norm² = 5 + * word 1: {X=0x10: 3, Z=0x30: 1} norm² = 10 + * word 2: {Y=0x20: 2, Z=0x30: 4} norm² = 20 + * + * dot(0,1) = 1×3 = 3 cos = 3/√50 ≈ 0.4243 + * dot(0,2) = 2×2 = 4 cos = 4/√100 = 0.4000 + * dot(1,2) = 1×4 = 4 cos = 4/√200 ≈ 0.2828 + * ═══════════════════════════════════════════════════════════════ */ + + printf("--- Test 2: Three words, all pairwise cosines ---\n"); + + reset_cosine_buffers(queue, djh_ht_keys, djh_ht_values, + sec_chain_next, word_norm_sq, cand_ht_keys, cand_ht_values, + cand_dot, cand_cosine, cand_next_free); + + { + uint32_t sw[] = {0, 0, 1, 1, 2, 2}; + uint64_t sd[] = {0x10, 0x20, 0x10, 0x30, 0x20, 0x30}; + double sc[] = {1.0, 2.0, 3.0, 1.0, 2.0, 4.0}; + cl_uint ns = 6; + + upload_sections(queue, sec_word, sec_disjunct_hash, sec_count, + sec_next_free, sw, sd, sc, ns); + + clSetKernelArg(k_norms, 3, sizeof(cl_uint), &ns); + clSetKernelArg(k_chains, 5, sizeof(cl_uint), &ns); + clSetKernelArg(k_dots, 12, sizeof(cl_uint), &ns); + + uint32_t num_cands = 0; + double t0 = now_ms(); + run_cosine_pipeline(queue, k_norms, k_chains, k_dots, k_cosines, + ns, cand_next_free, &num_cands); + double t1 = now_ms(); + + uint32_t h_wa[8], h_wb[8]; + double h_dot[8], h_cos[8]; + if (num_cands > 0) { + clEnqueueReadBuffer(queue, cand_word_a, CL_TRUE, 0, + sizeof(uint32_t) * num_cands, h_wa, 0, NULL, NULL); + clEnqueueReadBuffer(queue, cand_word_b, CL_TRUE, 0, + sizeof(uint32_t) * num_cands, h_wb, 0, NULL, NULL); + clEnqueueReadBuffer(queue, cand_dot, CL_TRUE, 0, + sizeof(double) * num_cands, h_dot, 0, NULL, NULL); + clEnqueueReadBuffer(queue, cand_cosine, CL_TRUE, 0, + sizeof(double) * num_cands, h_cos, 0, NULL, NULL); + } + + double exp_01 = 3.0 / sqrt(50.0); + double exp_02 = 4.0 / sqrt(100.0); + double exp_12 = 4.0 / sqrt(200.0); + + printf(" Candidates: %u (expected 3)\n", num_cands); + + /* Find each pair in results */ + double got_01 = -1, got_02 = -1, got_12 = -1; + double got_dot_01 = -1, got_dot_02 = -1, got_dot_12 = -1; + for (uint32_t i = 0; i < num_cands; i++) { + if (h_wa[i] == 0 && h_wb[i] == 1) + { got_01 = h_cos[i]; got_dot_01 = h_dot[i]; } + if (h_wa[i] == 0 && h_wb[i] == 2) + { got_02 = h_cos[i]; got_dot_02 = h_dot[i]; } + if (h_wa[i] == 1 && h_wb[i] == 2) + { got_12 = h_cos[i]; got_dot_12 = h_dot[i]; } + } + + printf(" (0,1): dot=%.1f cos=%.4f (exp dot=3.0 cos=%.4f)\n", + got_dot_01, got_01, exp_01); + printf(" (0,2): dot=%.1f cos=%.4f (exp dot=4.0 cos=%.4f)\n", + got_dot_02, got_02, exp_02); + printf(" (1,2): dot=%.1f cos=%.4f (exp dot=4.0 cos=%.4f)\n", + got_dot_12, got_12, exp_12); + printf(" Time: %.2f ms\n", t1 - t0); + + int pass = (num_cands == 3) && + (fabs(got_01 - exp_01) < 0.001) && + (fabs(got_02 - exp_02) < 0.001) && + (fabs(got_12 - exp_12) < 0.001); + printf(" %s\n\n", pass ? "PASS" : "FAIL"); + if (pass) pass_count++; else fail_count++; + } + + /* ═══════════════════════════════════════════════════════════════ + * TEST 3: Identical vectors → cosine = 1.0 + * + * word 0: {X=0x10: 3, Y=0x20: 4} + * word 1: {X=0x10: 3, Y=0x20: 4} + * + * dot = 9+16 = 25, norms = 5 each, cosine = 25/25 = 1.0 + * ═══════════════════════════════════════════════════════════════ */ + + printf("--- Test 3: Identical vectors → cosine = 1.0 ---\n"); + + reset_cosine_buffers(queue, djh_ht_keys, djh_ht_values, + sec_chain_next, word_norm_sq, cand_ht_keys, cand_ht_values, + cand_dot, cand_cosine, cand_next_free); + + { + uint32_t sw[] = {0, 0, 1, 1}; + uint64_t sd[] = {0x10, 0x20, 0x10, 0x20}; + double sc[] = {3.0, 4.0, 3.0, 4.0}; + cl_uint ns = 4; + + upload_sections(queue, sec_word, sec_disjunct_hash, sec_count, + sec_next_free, sw, sd, sc, ns); + + clSetKernelArg(k_norms, 3, sizeof(cl_uint), &ns); + clSetKernelArg(k_chains, 5, sizeof(cl_uint), &ns); + clSetKernelArg(k_dots, 12, sizeof(cl_uint), &ns); + + uint32_t num_cands = 0; + run_cosine_pipeline(queue, k_norms, k_chains, k_dots, k_cosines, + ns, cand_next_free, &num_cands); + + double h_cos = 0; + if (num_cands > 0) { + clEnqueueReadBuffer(queue, cand_cosine, CL_TRUE, 0, + sizeof(double), &h_cos, 0, NULL, NULL); + } + + printf(" Candidates: %u (expected 1)\n", num_cands); + printf(" Cosine: %.4f (expected 1.0000)\n", h_cos); + + int pass = (num_cands == 1) && (fabs(h_cos - 1.0) < 0.001); + printf(" %s\n\n", pass ? "PASS" : "FAIL"); + if (pass) pass_count++; else fail_count++; + } + + /* ═══════════════════════════════════════════════════════════════ + * TEST 4: No shared disjuncts → 0 candidates + * + * word 0: {X=0x10: 1} + * word 1: {Y=0x20: 1} + * + * No shared disjuncts → no chain overlap → 0 candidates + * ═══════════════════════════════════════════════════════════════ */ + + printf("--- Test 4: No shared disjuncts → 0 candidates ---\n"); + + reset_cosine_buffers(queue, djh_ht_keys, djh_ht_values, + sec_chain_next, word_norm_sq, cand_ht_keys, cand_ht_values, + cand_dot, cand_cosine, cand_next_free); + + { + uint32_t sw[] = {0, 1}; + uint64_t sd[] = {0x10, 0x20}; + double sc[] = {1.0, 1.0}; + cl_uint ns = 2; + + upload_sections(queue, sec_word, sec_disjunct_hash, sec_count, + sec_next_free, sw, sd, sc, ns); + + clSetKernelArg(k_norms, 3, sizeof(cl_uint), &ns); + clSetKernelArg(k_chains, 5, sizeof(cl_uint), &ns); + clSetKernelArg(k_dots, 12, sizeof(cl_uint), &ns); + + uint32_t num_cands = 0; + run_cosine_pipeline(queue, k_norms, k_chains, k_dots, k_cosines, + ns, cand_next_free, &num_cands); + + printf(" Candidates: %u (expected 0)\n", num_cands); + + int pass = (num_cands == 0); + printf(" %s\n\n", pass ? "PASS" : "FAIL"); + if (pass) pass_count++; else fail_count++; + } + + /* ═══════════════════════════════════════════════════════════════ + * TEST 5: Filter candidates above threshold + * + * Reuse test 2's scenario (3 words): + * cos(0,1) ≈ 0.4243 + * cos(0,2) = 0.4000 + * cos(1,2) ≈ 0.2828 + * + * Filter at 0.35 → should get 2 candidates (0,1) and (0,2) + * ═══════════════════════════════════════════════════════════════ */ + + printf("--- Test 5: Filter candidates above threshold ---\n"); + + reset_cosine_buffers(queue, djh_ht_keys, djh_ht_values, + sec_chain_next, word_norm_sq, cand_ht_keys, cand_ht_values, + cand_dot, cand_cosine, cand_next_free); + + { + uint32_t sw[] = {0, 0, 1, 1, 2, 2}; + uint64_t sd[] = {0x10, 0x20, 0x10, 0x30, 0x20, 0x30}; + double sc[] = {1.0, 2.0, 3.0, 1.0, 2.0, 4.0}; + cl_uint ns = 6; + + upload_sections(queue, sec_word, sec_disjunct_hash, sec_count, + sec_next_free, sw, sd, sc, ns); + + clSetKernelArg(k_norms, 3, sizeof(cl_uint), &ns); + clSetKernelArg(k_chains, 5, sizeof(cl_uint), &ns); + clSetKernelArg(k_dots, 12, sizeof(cl_uint), &ns); + + uint32_t num_cands = 0; + run_cosine_pipeline(queue, k_norms, k_chains, k_dots, k_cosines, + ns, cand_next_free, &num_cands); + + /* Now filter */ + cl_double threshold = 0.35; + cl_uint max_output = 64; + cl_mem out_wa = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, + sizeof(uint32_t) * max_output, NULL, &err); + cl_mem out_wb = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, + sizeof(uint32_t) * max_output, NULL, &err); + cl_mem out_cos = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, + sizeof(double) * max_output, NULL, &err); + cl_mem out_count = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t), &zero, &err); + + clSetKernelArg(k_filter, 3, sizeof(cl_uint), &num_cands); + clSetKernelArg(k_filter, 4, sizeof(cl_double), &threshold); + clSetKernelArg(k_filter, 5, sizeof(cl_mem), &out_wa); + clSetKernelArg(k_filter, 6, sizeof(cl_mem), &out_wb); + clSetKernelArg(k_filter, 7, sizeof(cl_mem), &out_cos); + clSetKernelArg(k_filter, 8, sizeof(cl_mem), &out_count); + clSetKernelArg(k_filter, 9, sizeof(cl_uint), &max_output); + + size_t local = 256; + size_t gs = ((num_cands + local - 1) / local) * local; + err = clEnqueueNDRangeKernel(queue, k_filter, 1, NULL, + &gs, &local, 0, NULL, NULL); + CL_CHECK(err, "enqueue filter"); + clFinish(queue); + + uint32_t n_filtered; + clEnqueueReadBuffer(queue, out_count, CL_TRUE, 0, + sizeof(uint32_t), &n_filtered, 0, NULL, NULL); + + uint32_t fwa[8], fwb[8]; + double fcos[8]; + if (n_filtered > 0) { + clEnqueueReadBuffer(queue, out_wa, CL_TRUE, 0, + sizeof(uint32_t) * n_filtered, fwa, 0, NULL, NULL); + clEnqueueReadBuffer(queue, out_wb, CL_TRUE, 0, + sizeof(uint32_t) * n_filtered, fwb, 0, NULL, NULL); + clEnqueueReadBuffer(queue, out_cos, CL_TRUE, 0, + sizeof(double) * n_filtered, fcos, 0, NULL, NULL); + } + + printf(" Total candidates: %u, filtered (>0.35): %u (expected 2)\n", + num_cands, n_filtered); + for (uint32_t i = 0; i < n_filtered; i++) { + printf(" (%u, %u) cos=%.4f\n", fwa[i], fwb[i], fcos[i]); + } + + /* cos(1,2) ≈ 0.2828 should be filtered out */ + int pass = (n_filtered == 2); + printf(" %s\n\n", pass ? "PASS" : "FAIL"); + if (pass) pass_count++; else fail_count++; + + clReleaseMemObject(out_wa); + clReleaseMemObject(out_wb); + clReleaseMemObject(out_cos); + clReleaseMemObject(out_count); + } + + /* ═══════════════════════════════════════════════════════════════ + * TEST 6: Benchmark — 1000 sentences → sections → cosines + * + * Full pipeline: + * extract_sections (Phase 4) → cosine pipeline (Phase 5) + * + * 1000 sentences, 10-20 words each, chain MST parse. + * 500 word vocabulary for realistic disjunct sharing. + * ═══════════════════════════════════════════════════════════════ */ + + printf("--- Test 6: Benchmark (1000 sentences, full pipeline) ---\n"); + + /* Reset everything */ + reset_section_pool(queue, sht_keys, sht_values, sec_count, sec_next_free); + reset_cosine_buffers(queue, djh_ht_keys, djh_ht_values, + sec_chain_next, word_norm_sq, cand_ht_keys, cand_ht_values, + cand_dot, cand_cosine, cand_next_free); + { + uint32_t tsec_zero = 0; + clEnqueueWriteBuffer(queue, total_sections_created, CL_TRUE, 0, + sizeof(uint32_t), &tsec_zero, 0, NULL, NULL); + } + + srand(42); + uint32_t bench_ns = 1000; + uint32_t vocab_size = 500; + + uint32_t max_words = bench_ns * 25; + uint32_t max_edges = bench_ns * 25; + uint32_t* b_words = malloc(sizeof(uint32_t) * max_words); + uint32_t* b_sent_offsets = malloc(sizeof(uint32_t) * bench_ns); + uint32_t* b_sent_lengths = malloc(sizeof(uint32_t) * bench_ns); + uint32_t* b_edge_p1 = malloc(sizeof(uint32_t) * max_edges); + uint32_t* b_edge_p2 = malloc(sizeof(uint32_t) * max_edges); + uint32_t* b_edge_offsets = malloc(sizeof(uint32_t) * bench_ns); + uint32_t* b_edge_counts = malloc(sizeof(uint32_t) * bench_ns); + + uint32_t word_pos = 0, edge_pos = 0; + for (uint32_t s = 0; s < bench_ns; s++) { + uint32_t slen = 10 + (rand() % 11); + b_sent_offsets[s] = word_pos; + b_sent_lengths[s] = slen; + b_edge_offsets[s] = edge_pos; + b_edge_counts[s] = slen - 1; + + for (uint32_t w = 0; w < slen; w++) + b_words[word_pos++] = rand() % vocab_size; + for (uint32_t e = 0; e < slen - 1; e++) { + b_edge_p1[edge_pos] = e; + b_edge_p2[edge_pos] = e + 1; + edge_pos++; + } + } + + printf(" Sentences: %u, words: %u, edges: %u\n", bench_ns, word_pos, edge_pos); + + cl_mem d_flat_words = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * word_pos, b_words, &err); + cl_mem d_sent_offsets = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * bench_ns, b_sent_offsets, &err); + cl_mem d_sent_lengths = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * bench_ns, b_sent_lengths, &err); + cl_mem d_edge_p1 = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * edge_pos, b_edge_p1, &err); + cl_mem d_edge_p2 = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * edge_pos, b_edge_p2, &err); + cl_mem d_edge_offsets = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * bench_ns, b_edge_offsets, &err); + cl_mem d_edge_counts = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * bench_ns, b_edge_counts, &err); + + /* Set extract_sections args */ + cl_uint tw = word_pos; + clSetKernelArg(k_extract, 0, sizeof(cl_mem), &d_flat_words); + clSetKernelArg(k_extract, 1, sizeof(cl_mem), &d_sent_offsets); + clSetKernelArg(k_extract, 2, sizeof(cl_mem), &d_sent_lengths); + clSetKernelArg(k_extract, 3, sizeof(cl_uint), &bench_ns); + clSetKernelArg(k_extract, 4, sizeof(cl_uint), &tw); + clSetKernelArg(k_extract, 5, sizeof(cl_mem), &d_edge_p1); + clSetKernelArg(k_extract, 6, sizeof(cl_mem), &d_edge_p2); + clSetKernelArg(k_extract, 7, sizeof(cl_mem), &d_edge_offsets); + clSetKernelArg(k_extract, 8, sizeof(cl_mem), &d_edge_counts); + clSetKernelArg(k_extract, 9, sizeof(cl_mem), &sht_keys); + clSetKernelArg(k_extract, 10, sizeof(cl_mem), &sht_values); + clSetKernelArg(k_extract, 11, sizeof(cl_mem), &sec_word); + clSetKernelArg(k_extract, 12, sizeof(cl_mem), &sec_disjunct_hash); + clSetKernelArg(k_extract, 13, sizeof(cl_mem), &sec_count); + clSetKernelArg(k_extract, 14, sizeof(cl_mem), &sec_next_free); + clSetKernelArg(k_extract, 15, sizeof(cl_mem), &total_sections_created); + + size_t local = 256; + size_t gs; + + /* Phase 4: Extract sections */ + double t_start = now_ms(); + + gs = ((tw + local - 1) / local) * local; + err = clEnqueueNDRangeKernel(queue, k_extract, 1, NULL, + &gs, &local, 0, NULL, NULL); + CL_CHECK(err, "enqueue extract"); + clFinish(queue); + + double t_extract = now_ms(); + + /* Read section count */ + uint32_t h_num_sections; + clEnqueueReadBuffer(queue, sec_next_free, CL_TRUE, 0, + sizeof(uint32_t), &h_num_sections, 0, NULL, NULL); + + printf(" Sections extracted: %u (%.2f ms)\n", + h_num_sections, t_extract - t_start); + + /* Phase 5: Cosine pipeline */ + clSetKernelArg(k_norms, 3, sizeof(cl_uint), &h_num_sections); + clSetKernelArg(k_chains, 5, sizeof(cl_uint), &h_num_sections); + clSetKernelArg(k_dots, 12, sizeof(cl_uint), &h_num_sections); + + double t_cos_start = now_ms(); + + uint32_t num_cands = 0; + run_cosine_pipeline(queue, k_norms, k_chains, k_dots, k_cosines, + h_num_sections, cand_next_free, &num_cands); + + double t_cos_end = now_ms(); + + double total_time = t_cos_end - t_start; + double cos_time = t_cos_end - t_cos_start; + + printf(" Candidate pairs: %u\n", num_cands); + printf(" Cosine pipeline: %.2f ms\n", cos_time); + printf(" Full pipeline (extract + cosine): %.2f ms\n", total_time); + printf(" Throughput: %.0f sentences/sec\n", + bench_ns / (total_time / 1000.0)); + + if (num_cands > 0) { + /* Read a few top cosines to verify */ + uint32_t peek = (num_cands < 8) ? num_cands : 8; + double h_cos[8]; + clEnqueueReadBuffer(queue, cand_cosine, CL_TRUE, 0, + sizeof(double) * peek, h_cos, 0, NULL, NULL); + + /* Find max cosine */ + double max_cos = 0; + for (uint32_t i = 0; i < peek; i++) + if (h_cos[i] > max_cos) max_cos = h_cos[i]; + printf(" Max cosine (first %u): %.4f\n", peek, max_cos); + } + + int t6_pass = (h_num_sections > 0) && (num_cands > 0) && (total_time < 5000.0); + printf(" %s\n\n", t6_pass ? "PASS" : "FAIL"); + if (t6_pass) pass_count++; else fail_count++; + + /* ─── Summary ─── */ + + printf("=== Results: %d PASS, %d FAIL ===\n", pass_count, fail_count); + + /* Cleanup */ + free(b_words); free(b_sent_offsets); free(b_sent_lengths); + free(b_edge_p1); free(b_edge_p2); free(b_edge_offsets); free(b_edge_counts); + free(src_ht); free(src_as); free(src_sc); free(src_cos); free(combined); + + clReleaseMemObject(d_flat_words); + clReleaseMemObject(d_sent_offsets); + clReleaseMemObject(d_sent_lengths); + clReleaseMemObject(d_edge_p1); + clReleaseMemObject(d_edge_p2); + clReleaseMemObject(d_edge_offsets); + clReleaseMemObject(d_edge_counts); + clReleaseMemObject(sec_word); + clReleaseMemObject(sec_disjunct_hash); + clReleaseMemObject(sec_count); + clReleaseMemObject(sec_next_free); + clReleaseMemObject(sht_keys); + clReleaseMemObject(sht_values); + clReleaseMemObject(total_sections_created); + clReleaseMemObject(djh_ht_keys); + clReleaseMemObject(djh_ht_values); + clReleaseMemObject(sec_chain_next); + clReleaseMemObject(word_norm_sq); + clReleaseMemObject(cand_ht_keys); + clReleaseMemObject(cand_ht_values); + clReleaseMemObject(cand_word_a); + clReleaseMemObject(cand_word_b); + clReleaseMemObject(cand_dot); + clReleaseMemObject(cand_cosine); + clReleaseMemObject(cand_next_free); + clReleaseKernel(k_norms); + clReleaseKernel(k_chains); + clReleaseKernel(k_dots); + clReleaseKernel(k_cosines); + clReleaseKernel(k_filter); + clReleaseKernel(k_extract); + clReleaseProgram(program); + clReleaseCommandQueue(queue); + clReleaseContext(ctx); + + return fail_count > 0 ? 1 : 0; +} diff --git a/opencog/opencl/atomspace/test-sections.c b/opencog/opencl/atomspace/test-sections.c new file mode 100644 index 0000000..06472fb --- /dev/null +++ b/opencog/opencl/atomspace/test-sections.c @@ -0,0 +1,883 @@ +/* + * test-sections.c -- Test GPU section extraction kernel + * + * Compile: gcc -O2 -o test-sections test-sections.c -lOpenCL -lm + * Run: ./test-sections + * + * Tests: + * 1. Simple MST: 3-word sentence with 2 edges → 3 sections + * 2. Star parse: all edges from one root → verify disjuncts + * 3. Multi-sentence batch (no cross-boundary sections) + * 4. Duplicate disjuncts: same parse seen twice → counts accumulate + * 5. Readback kernel verification + * 6. Benchmark: 1000 sentences with random MST edges + */ + +#include +#include +#include +#include +#include +#include +#include + +/* ─── Pool capacities ─── */ + +#define WORD_CAPACITY (128 * 1024) +#define PAIR_CAPACITY (4 * 1024 * 1024) +#define SECTION_CAPACITY (1024 * 1024) +#define WORD_HT_CAPACITY (256 * 1024) +#define PAIR_HT_CAPACITY (8 * 1024 * 1024) +#define SECTION_HT_CAPACITY (2 * 1024 * 1024) + +#define HT_EMPTY_KEY 0xFFFFFFFFFFFFFFFFULL +#define HT_EMPTY_VALUE 0xFFFFFFFFU + +/* ─── Error checking ─── */ + +#define CL_CHECK(err, msg) do { \ + if ((err) != CL_SUCCESS) { \ + fprintf(stderr, "OpenCL error %d at %s:%d: %s\n", \ + (err), __FILE__, __LINE__, (msg)); \ + exit(1); \ + } \ +} while(0) + +/* ─── Read file ─── */ + +char* read_file(const char* path, size_t* len) +{ + FILE* f = fopen(path, "r"); + if (!f) { fprintf(stderr, "Cannot open %s\n", path); exit(1); } + fseek(f, 0, SEEK_END); + *len = ftell(f); + fseek(f, 0, SEEK_SET); + char* buf = malloc(*len + 1); + size_t n = fread(buf, 1, *len, f); + buf[n] = '\0'; + *len = n; + fclose(f); + return buf; +} + +/* ─── Timing ─── */ + +double now_ms(void) +{ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec * 1000.0 + ts.tv_nsec / 1000000.0; +} + +/* ─── CPU-side disjunct hash (must match GPU) ─── */ + +static uint64_t cpu_fnv1a_init(void) { return 0xcbf29ce484222325ULL; } + +static uint64_t cpu_fnv1a_mix(uint64_t hash, uint64_t val) +{ + hash ^= val; + hash *= 0x100000001b3ULL; + return hash; +} + +static uint64_t cpu_hash_disjunct(uint32_t* words, uint32_t* dirs, uint32_t count) +{ + uint64_t h = cpu_fnv1a_init(); + for (uint32_t i = 0; i < count; i++) { + uint64_t encoded = ((uint64_t)words[i] << 1) | (uint64_t)dirs[i]; + h = cpu_fnv1a_mix(h, encoded); + } + if (h == HT_EMPTY_KEY) h = 0; + return h; +} + +/* CPU-side section_key (must match GPU) */ +static uint64_t cpu_section_key(uint32_t word_idx, uint64_t disjunct_hash) +{ + uint64_t key = disjunct_hash ^ ((uint64_t)word_idx * 0x9E3779B97F4A7C15ULL); + if (key == HT_EMPTY_KEY) key = 0; + return key; +} + +/* ─── Helper: reset section pool and hash table ─── */ + +void reset_section_pool(cl_command_queue queue, + cl_mem sht_keys, cl_mem sht_values, + cl_mem sec_count, + cl_mem sec_next_free, cl_mem total_sections) +{ + uint8_t pat_ff = 0xFF; + uint8_t pat_00 = 0x00; + uint32_t zero = 0; + + clEnqueueFillBuffer(queue, sht_keys, &pat_ff, 1, 0, + sizeof(uint64_t) * SECTION_HT_CAPACITY, 0, NULL, NULL); + clEnqueueFillBuffer(queue, sht_values, &pat_ff, 1, 0, + sizeof(uint32_t) * SECTION_HT_CAPACITY, 0, NULL, NULL); + clEnqueueFillBuffer(queue, sec_count, &pat_00, 1, 0, + sizeof(double) * SECTION_CAPACITY, 0, NULL, NULL); + clEnqueueWriteBuffer(queue, sec_next_free, CL_FALSE, 0, + sizeof(uint32_t), &zero, 0, NULL, NULL); + clEnqueueWriteBuffer(queue, total_sections, CL_FALSE, 0, + sizeof(uint32_t), &zero, 0, NULL, NULL); + clFinish(queue); +} + +/* ─── Main ─── */ + +int main(int argc, char** argv) +{ + cl_int err; + int pass_count = 0, fail_count = 0; + + printf("=== GPU Section Extraction Test ===\n\n"); + + /* ─── OpenCL setup ─── */ + + cl_platform_id platform; + err = clGetPlatformIDs(1, &platform, NULL); + CL_CHECK(err, "platform"); + + cl_device_id device; + err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); + CL_CHECK(err, "device"); + + char dev_name[256]; + clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(dev_name), dev_name, NULL); + printf("GPU: %s\n", dev_name); + + cl_context ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &err); + CL_CHECK(err, "context"); + + cl_command_queue queue = clCreateCommandQueue(ctx, device, 0, &err); + CL_CHECK(err, "queue"); + + /* ─── Load and concatenate kernel sources ─── */ + + size_t ht_len, as_len, sc_len; + char* ht_src = read_file("gpu-hashtable.cl", &ht_len); + char* as_src = read_file("gpu-atomspace.cl", &as_len); + char* sc_src = read_file("gpu-sections.cl", &sc_len); + + size_t total_len = ht_len + 1 + as_len + 1 + sc_len; + char* combined = malloc(total_len + 1); + memcpy(combined, ht_src, ht_len); + combined[ht_len] = '\n'; + memcpy(combined + ht_len + 1, as_src, as_len); + combined[ht_len + 1 + as_len] = '\n'; + memcpy(combined + ht_len + 1 + as_len + 1, sc_src, sc_len); + combined[total_len] = '\0'; + + cl_program program = clCreateProgramWithSource(ctx, 1, + (const char**)&combined, &total_len, &err); + CL_CHECK(err, "create program"); + + char build_opts[512]; + snprintf(build_opts, sizeof(build_opts), + "-cl-std=CL1.2 " + "-DWORD_CAPACITY=%d " + "-DPAIR_CAPACITY=%d " + "-DSECTION_CAPACITY=%d " + "-DWORD_HT_CAPACITY=%d " + "-DPAIR_HT_CAPACITY=%d " + "-DSECTION_HT_CAPACITY=%d", + WORD_CAPACITY, PAIR_CAPACITY, SECTION_CAPACITY, + WORD_HT_CAPACITY, PAIR_HT_CAPACITY, SECTION_HT_CAPACITY); + + err = clBuildProgram(program, 1, &device, build_opts, NULL, NULL); + if (err != CL_SUCCESS) { + char log[16384]; + clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, + sizeof(log), log, NULL); + fprintf(stderr, "Build error:\n%s\n", log); + return 1; + } + printf("Kernels compiled successfully\n\n"); + + /* ─── Create kernels ─── */ + + cl_kernel k_extract = clCreateKernel(program, "extract_sections", &err); + CL_CHECK(err, "kernel extract_sections"); + cl_kernel k_read = clCreateKernel(program, "read_sections", &err); + CL_CHECK(err, "kernel read_sections"); + + size_t local_size = 256; + + /* ─── Allocate GPU buffers ─── */ + + printf("Allocating GPU buffers...\n"); + uint32_t zero = 0; + + /* Section hash table */ + cl_mem sht_keys = clCreateBuffer(ctx, CL_MEM_READ_WRITE, + sizeof(uint64_t) * SECTION_HT_CAPACITY, NULL, &err); + CL_CHECK(err, "sht_keys"); + cl_mem sht_values = clCreateBuffer(ctx, CL_MEM_READ_WRITE, + sizeof(uint32_t) * SECTION_HT_CAPACITY, NULL, &err); + CL_CHECK(err, "sht_values"); + + /* Section pool SoA */ + cl_mem sec_word = clCreateBuffer(ctx, CL_MEM_READ_WRITE, + sizeof(uint32_t) * SECTION_CAPACITY, NULL, &err); + cl_mem sec_disjunct_hash = clCreateBuffer(ctx, CL_MEM_READ_WRITE, + sizeof(uint64_t) * SECTION_CAPACITY, NULL, &err); + cl_mem sec_count = clCreateBuffer(ctx, CL_MEM_READ_WRITE, + sizeof(double) * SECTION_CAPACITY, NULL, &err); + + /* Section bump allocator */ + cl_mem sec_next_free = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t), &zero, &err); + + /* Total sections created (stats counter) */ + cl_mem total_sections = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t), &zero, &err); + + /* Initial reset */ + reset_section_pool(queue, sht_keys, sht_values, sec_count, + sec_next_free, total_sections); + + printf("GPU buffers ready\n\n"); + + /* ═══════════════════════════════════════════════════════════════ + * TEST 1: Simple MST — 3-word sentence with chain parse + * + * Sentence: words [10, 20, 30] (word pool indices) + * MST edges: (0,1), (1,2) — chain: 10—20—30 + * + * Expected sections: + * Word 10 (pos 0): connectors = [(20, RIGHT)] + * disjunct = "20+" + * Word 20 (pos 1): connectors = [(10, LEFT), (30, RIGHT)] + * disjunct = "10- 30+" + * Word 30 (pos 2): connectors = [(20, LEFT)] + * disjunct = "20-" + * + * = 3 unique sections + * ═══════════════════════════════════════════════════════════════ */ + + printf("--- Test 1: Simple chain parse (3 words, 2 edges) ---\n"); + + uint32_t t1_words[] = {10, 20, 30}; + uint32_t t1_sent_offsets[] = {0}; + uint32_t t1_sent_lengths[] = {3}; + uint32_t t1_edge_p1[] = {0, 1}; + uint32_t t1_edge_p2[] = {1, 2}; + uint32_t t1_edge_offsets[] = {0}; + uint32_t t1_edge_counts[] = {2}; + cl_uint t1_ns = 1, t1_tw = 3; + + cl_mem d_flat_words = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * t1_tw, t1_words, &err); + cl_mem d_sent_offsets = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * t1_ns, t1_sent_offsets, &err); + cl_mem d_sent_lengths = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * t1_ns, t1_sent_lengths, &err); + cl_mem d_edge_p1 = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * 2, t1_edge_p1, &err); + cl_mem d_edge_p2 = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * 2, t1_edge_p2, &err); + cl_mem d_edge_offsets = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * t1_ns, t1_edge_offsets, &err); + cl_mem d_edge_counts = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * t1_ns, t1_edge_counts, &err); + + /* Set kernel args */ + clSetKernelArg(k_extract, 0, sizeof(cl_mem), &d_flat_words); + clSetKernelArg(k_extract, 1, sizeof(cl_mem), &d_sent_offsets); + clSetKernelArg(k_extract, 2, sizeof(cl_mem), &d_sent_lengths); + clSetKernelArg(k_extract, 3, sizeof(cl_uint), &t1_ns); + clSetKernelArg(k_extract, 4, sizeof(cl_uint), &t1_tw); + clSetKernelArg(k_extract, 5, sizeof(cl_mem), &d_edge_p1); + clSetKernelArg(k_extract, 6, sizeof(cl_mem), &d_edge_p2); + clSetKernelArg(k_extract, 7, sizeof(cl_mem), &d_edge_offsets); + clSetKernelArg(k_extract, 8, sizeof(cl_mem), &d_edge_counts); + clSetKernelArg(k_extract, 9, sizeof(cl_mem), &sht_keys); + clSetKernelArg(k_extract, 10, sizeof(cl_mem), &sht_values); + clSetKernelArg(k_extract, 11, sizeof(cl_mem), &sec_word); + clSetKernelArg(k_extract, 12, sizeof(cl_mem), &sec_disjunct_hash); + clSetKernelArg(k_extract, 13, sizeof(cl_mem), &sec_count); + clSetKernelArg(k_extract, 14, sizeof(cl_mem), &sec_next_free); + clSetKernelArg(k_extract, 15, sizeof(cl_mem), &total_sections); + + double t0 = now_ms(); + size_t gs = ((t1_tw + local_size - 1) / local_size) * local_size; + err = clEnqueueNDRangeKernel(queue, k_extract, 1, NULL, + &gs, &local_size, 0, NULL, NULL); + CL_CHECK(err, "enqueue extract"); + clFinish(queue); + double t1 = now_ms(); + + /* Read back results */ + uint32_t h_num_sections, h_total_created; + clEnqueueReadBuffer(queue, sec_next_free, CL_TRUE, 0, + sizeof(uint32_t), &h_num_sections, 0, NULL, NULL); + clEnqueueReadBuffer(queue, total_sections, CL_TRUE, 0, + sizeof(uint32_t), &h_total_created, 0, NULL, NULL); + + /* Read section data */ + uint32_t h_sec_words[8]; + uint64_t h_sec_djh[8]; + double h_sec_counts[8]; + clEnqueueReadBuffer(queue, sec_word, CL_TRUE, 0, + sizeof(uint32_t) * h_num_sections, h_sec_words, 0, NULL, NULL); + clEnqueueReadBuffer(queue, sec_disjunct_hash, CL_TRUE, 0, + sizeof(uint64_t) * h_num_sections, h_sec_djh, 0, NULL, NULL); + clEnqueueReadBuffer(queue, sec_count, CL_TRUE, 0, + sizeof(double) * h_num_sections, h_sec_counts, 0, NULL, NULL); + + /* Compute expected disjunct hashes on CPU */ + /* Word 10 (pos 0): [(20, RIGHT=1)] */ + uint32_t cw0[] = {20}; uint32_t cd0[] = {1}; + uint64_t exp_djh_0 = cpu_hash_disjunct(cw0, cd0, 1); + /* Word 20 (pos 1): [(10, LEFT=0), (30, RIGHT=1)] — already sorted */ + uint32_t cw1[] = {10, 30}; uint32_t cd1[] = {0, 1}; + uint64_t exp_djh_1 = cpu_hash_disjunct(cw1, cd1, 2); + /* Word 30 (pos 2): [(20, LEFT=0)] */ + uint32_t cw2[] = {20}; uint32_t cd2[] = {0}; + uint64_t exp_djh_2 = cpu_hash_disjunct(cw2, cd2, 1); + + printf(" Sections created: %u (expected 3)\n", h_num_sections); + printf(" Stats counter: %u (expected 3)\n", h_total_created); + + /* Verify each section exists with correct data */ + int found_10 = 0, found_20 = 0, found_30 = 0; + for (uint32_t i = 0; i < h_num_sections; i++) { + if (h_sec_words[i] == 10 && h_sec_djh[i] == exp_djh_0 && + fabs(h_sec_counts[i] - 1.0) < 0.01) found_10 = 1; + if (h_sec_words[i] == 20 && h_sec_djh[i] == exp_djh_1 && + fabs(h_sec_counts[i] - 1.0) < 0.01) found_20 = 1; + if (h_sec_words[i] == 30 && h_sec_djh[i] == exp_djh_2 && + fabs(h_sec_counts[i] - 1.0) < 0.01) found_30 = 1; + } + + printf(" Section (word=10, djh=0x%016llx): %s\n", + (unsigned long long)exp_djh_0, found_10 ? "found" : "MISSING"); + printf(" Section (word=20, djh=0x%016llx): %s\n", + (unsigned long long)exp_djh_1, found_20 ? "found" : "MISSING"); + printf(" Section (word=30, djh=0x%016llx): %s\n", + (unsigned long long)exp_djh_2, found_30 ? "found" : "MISSING"); + printf(" Time: %.2f ms\n", t1 - t0); + + int t1_pass = (h_num_sections == 3) && (h_total_created == 3) && + found_10 && found_20 && found_30; + printf(" %s\n\n", t1_pass ? "PASS" : "FAIL"); + if (t1_pass) pass_count++; else fail_count++; + + /* ═══════════════════════════════════════════════════════════════ + * TEST 2: Star parse — root word connected to all others + * + * Sentence: words [100, 101, 102, 103, 104] (5 words) + * MST edges: (2,0), (2,1), (2,3), (2,4) — word 102 is root + * + * Expected sections: + * Word 100 (pos 0): [(102, RIGHT)] — 1 connector + * Word 101 (pos 1): [(102, RIGHT)] — 1 connector + * Word 102 (pos 2): [(100, LEFT), (101, LEFT), (103, RIGHT), (104, RIGHT)] + * Word 103 (pos 3): [(102, LEFT)] — 1 connector + * Word 104 (pos 4): [(102, LEFT)] — 1 connector + * + * = 5 sections (4 unique disjuncts: leaf-left, leaf-right, root-4conn) + * But words 100 and 101 have same disjunct hash ONLY if their + * partner word pool index is the same (both connect to 102) AND + * direction is the same (both RIGHT). So disjunct hash matches! + * → 100 and 101 have same disjunct but different words → 2 sections + * Similarly 103 and 104 connect LEFT to 102 → same disjunct → 2 sections + * + * Total unique sections (word, disjunct) pairs: 5 + * ═══════════════════════════════════════════════════════════════ */ + + printf("--- Test 2: Star parse (5 words, root at center) ---\n"); + + reset_section_pool(queue, sht_keys, sht_values, sec_count, + sec_next_free, total_sections); + + uint32_t t2_words[] = {100, 101, 102, 103, 104}; + uint32_t t2_sent_offsets[] = {0}; + uint32_t t2_sent_lengths[] = {5}; + uint32_t t2_edge_p1[] = {2, 2, 2, 2}; + uint32_t t2_edge_p2[] = {0, 1, 3, 4}; + uint32_t t2_edge_offsets[] = {0}; + uint32_t t2_edge_counts[] = {4}; + cl_uint t2_ns = 1, t2_tw = 5; + + clReleaseMemObject(d_flat_words); + clReleaseMemObject(d_sent_offsets); + clReleaseMemObject(d_sent_lengths); + clReleaseMemObject(d_edge_p1); + clReleaseMemObject(d_edge_p2); + clReleaseMemObject(d_edge_offsets); + clReleaseMemObject(d_edge_counts); + + d_flat_words = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * t2_tw, t2_words, &err); + d_sent_offsets = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * t2_ns, t2_sent_offsets, &err); + d_sent_lengths = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * t2_ns, t2_sent_lengths, &err); + d_edge_p1 = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * 4, t2_edge_p1, &err); + d_edge_p2 = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * 4, t2_edge_p2, &err); + d_edge_offsets = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * t2_ns, t2_edge_offsets, &err); + d_edge_counts = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * t2_ns, t2_edge_counts, &err); + + /* Set kernel args */ + clSetKernelArg(k_extract, 0, sizeof(cl_mem), &d_flat_words); + clSetKernelArg(k_extract, 1, sizeof(cl_mem), &d_sent_offsets); + clSetKernelArg(k_extract, 2, sizeof(cl_mem), &d_sent_lengths); + clSetKernelArg(k_extract, 3, sizeof(cl_uint), &t2_ns); + clSetKernelArg(k_extract, 4, sizeof(cl_uint), &t2_tw); + clSetKernelArg(k_extract, 5, sizeof(cl_mem), &d_edge_p1); + clSetKernelArg(k_extract, 6, sizeof(cl_mem), &d_edge_p2); + clSetKernelArg(k_extract, 7, sizeof(cl_mem), &d_edge_offsets); + clSetKernelArg(k_extract, 8, sizeof(cl_mem), &d_edge_counts); + + t0 = now_ms(); + gs = ((t2_tw + local_size - 1) / local_size) * local_size; + err = clEnqueueNDRangeKernel(queue, k_extract, 1, NULL, + &gs, &local_size, 0, NULL, NULL); + CL_CHECK(err, "enqueue extract t2"); + clFinish(queue); + t1 = now_ms(); + + clEnqueueReadBuffer(queue, sec_next_free, CL_TRUE, 0, + sizeof(uint32_t), &h_num_sections, 0, NULL, NULL); + clEnqueueReadBuffer(queue, total_sections, CL_TRUE, 0, + sizeof(uint32_t), &h_total_created, 0, NULL, NULL); + + /* Compute expected disjunct for root word 102 (pos 2): + * Connectors: (100, LEFT=0), (101, LEFT=0), (103, RIGHT=1), (104, RIGHT=1) + * Sorted: dir 0 first sorted by word → (100,0), (101,0), (103,1), (104,1) */ + uint32_t root_cw[] = {100, 101, 103, 104}; + uint32_t root_cd[] = {0, 0, 1, 1}; + uint64_t exp_root_djh = cpu_hash_disjunct(root_cw, root_cd, 4); + + /* Read back all sections */ + uint32_t h2_sec_words[8]; + uint64_t h2_sec_djh[8]; + double h2_sec_counts[8]; + uint32_t n_read = (h_num_sections < 8) ? h_num_sections : 8; + clEnqueueReadBuffer(queue, sec_word, CL_TRUE, 0, + sizeof(uint32_t) * n_read, h2_sec_words, 0, NULL, NULL); + clEnqueueReadBuffer(queue, sec_disjunct_hash, CL_TRUE, 0, + sizeof(uint64_t) * n_read, h2_sec_djh, 0, NULL, NULL); + clEnqueueReadBuffer(queue, sec_count, CL_TRUE, 0, + sizeof(double) * n_read, h2_sec_counts, 0, NULL, NULL); + + printf(" Sections created: %u (expected 5)\n", h_num_sections); + + /* Find root section */ + int found_root = 0; + for (uint32_t i = 0; i < n_read; i++) { + if (h2_sec_words[i] == 102 && h2_sec_djh[i] == exp_root_djh && + fabs(h2_sec_counts[i] - 1.0) < 0.01) { + found_root = 1; + } + } + printf(" Root section (word=102, 4 connectors): %s\n", + found_root ? "found" : "MISSING"); + printf(" Time: %.2f ms\n", t1 - t0); + + int t2_pass = (h_num_sections == 5) && found_root; + printf(" %s\n\n", t2_pass ? "PASS" : "FAIL"); + if (t2_pass) pass_count++; else fail_count++; + + /* ═══════════════════════════════════════════════════════════════ + * TEST 3: Multi-sentence batch (no cross-boundary sections) + * + * Sentence 1: [10, 20, 30] edges: (0,1), (1,2) — chain + * Sentence 2: [40, 50, 60] edges: (0,1), (0,2) — star from 40 + * + * flat_words = [10, 20, 30, 40, 50, 60] + * flat_edges = [(0,1), (1,2), (0,1), (0,2)] + * edge_offsets = [0, 2] + * edge_counts = [2, 2] + * + * Expected: 6 sections (3 per sentence), none spanning boundary + * ═══════════════════════════════════════════════════════════════ */ + + printf("--- Test 3: Multi-sentence batch ---\n"); + + reset_section_pool(queue, sht_keys, sht_values, sec_count, + sec_next_free, total_sections); + + uint32_t t3_words[] = {10, 20, 30, 40, 50, 60}; + uint32_t t3_sent_offsets[] = {0, 3}; + uint32_t t3_sent_lengths[] = {3, 3}; + uint32_t t3_edge_p1[] = {0, 1, 0, 0}; + uint32_t t3_edge_p2[] = {1, 2, 1, 2}; + uint32_t t3_edge_offsets[] = {0, 2}; + uint32_t t3_edge_counts[] = {2, 2}; + cl_uint t3_ns = 2, t3_tw = 6; + + clReleaseMemObject(d_flat_words); + clReleaseMemObject(d_sent_offsets); + clReleaseMemObject(d_sent_lengths); + clReleaseMemObject(d_edge_p1); + clReleaseMemObject(d_edge_p2); + clReleaseMemObject(d_edge_offsets); + clReleaseMemObject(d_edge_counts); + + d_flat_words = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * t3_tw, t3_words, &err); + d_sent_offsets = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * t3_ns, t3_sent_offsets, &err); + d_sent_lengths = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * t3_ns, t3_sent_lengths, &err); + d_edge_p1 = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * 4, t3_edge_p1, &err); + d_edge_p2 = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * 4, t3_edge_p2, &err); + d_edge_offsets = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * t3_ns, t3_edge_offsets, &err); + d_edge_counts = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * t3_ns, t3_edge_counts, &err); + + clSetKernelArg(k_extract, 0, sizeof(cl_mem), &d_flat_words); + clSetKernelArg(k_extract, 1, sizeof(cl_mem), &d_sent_offsets); + clSetKernelArg(k_extract, 2, sizeof(cl_mem), &d_sent_lengths); + clSetKernelArg(k_extract, 3, sizeof(cl_uint), &t3_ns); + clSetKernelArg(k_extract, 4, sizeof(cl_uint), &t3_tw); + clSetKernelArg(k_extract, 5, sizeof(cl_mem), &d_edge_p1); + clSetKernelArg(k_extract, 6, sizeof(cl_mem), &d_edge_p2); + clSetKernelArg(k_extract, 7, sizeof(cl_mem), &d_edge_offsets); + clSetKernelArg(k_extract, 8, sizeof(cl_mem), &d_edge_counts); + + t0 = now_ms(); + gs = ((t3_tw + local_size - 1) / local_size) * local_size; + err = clEnqueueNDRangeKernel(queue, k_extract, 1, NULL, + &gs, &local_size, 0, NULL, NULL); + CL_CHECK(err, "enqueue extract t3"); + clFinish(queue); + t1 = now_ms(); + + clEnqueueReadBuffer(queue, sec_next_free, CL_TRUE, 0, + sizeof(uint32_t), &h_num_sections, 0, NULL, NULL); + + /* Verify: sentence 1 chain gives 3 sections, sentence 2 star gives 3. + * But sentence 1's word 10 has disjunct "20+" and sentence 2's word 50 + * has disjunct "40-" — different disjuncts. All 6 words produce sections. + * Are any (word, disjunct) pairs the same? No — different words, different + * disjuncts. So 6 unique sections. */ + printf(" Sections created: %u (expected 6)\n", h_num_sections); + printf(" Time: %.2f ms\n", t1 - t0); + + int t3_pass = (h_num_sections == 6); + printf(" %s\n\n", t3_pass ? "PASS" : "FAIL"); + if (t3_pass) pass_count++; else fail_count++; + + /* ═══════════════════════════════════════════════════════════════ + * TEST 4: Duplicate sections — same parse seen twice + * + * Process the SAME sentence twice → section counts should be 2.0 + * + * Sentence: [10, 20, 30] edges: (0,1), (1,2) + * Run extract_sections TWICE without resetting. + * + * Expected: 3 sections, each with count = 2.0 + * ═══════════════════════════════════════════════════════════════ */ + + printf("--- Test 4: Duplicate sections (same parse twice) ---\n"); + + reset_section_pool(queue, sht_keys, sht_values, sec_count, + sec_next_free, total_sections); + + uint32_t t4_words[] = {10, 20, 30}; + uint32_t t4_sent_offsets[] = {0}; + uint32_t t4_sent_lengths[] = {3}; + uint32_t t4_edge_p1[] = {0, 1}; + uint32_t t4_edge_p2[] = {1, 2}; + uint32_t t4_edge_offsets[] = {0}; + uint32_t t4_edge_counts[] = {2}; + cl_uint t4_ns = 1, t4_tw = 3; + + clReleaseMemObject(d_flat_words); + clReleaseMemObject(d_sent_offsets); + clReleaseMemObject(d_sent_lengths); + clReleaseMemObject(d_edge_p1); + clReleaseMemObject(d_edge_p2); + clReleaseMemObject(d_edge_offsets); + clReleaseMemObject(d_edge_counts); + + d_flat_words = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * t4_tw, t4_words, &err); + d_sent_offsets = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * t4_ns, t4_sent_offsets, &err); + d_sent_lengths = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * t4_ns, t4_sent_lengths, &err); + d_edge_p1 = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * 2, t4_edge_p1, &err); + d_edge_p2 = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * 2, t4_edge_p2, &err); + d_edge_offsets = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * t4_ns, t4_edge_offsets, &err); + d_edge_counts = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * t4_ns, t4_edge_counts, &err); + + clSetKernelArg(k_extract, 0, sizeof(cl_mem), &d_flat_words); + clSetKernelArg(k_extract, 1, sizeof(cl_mem), &d_sent_offsets); + clSetKernelArg(k_extract, 2, sizeof(cl_mem), &d_sent_lengths); + clSetKernelArg(k_extract, 3, sizeof(cl_uint), &t4_ns); + clSetKernelArg(k_extract, 4, sizeof(cl_uint), &t4_tw); + clSetKernelArg(k_extract, 5, sizeof(cl_mem), &d_edge_p1); + clSetKernelArg(k_extract, 6, sizeof(cl_mem), &d_edge_p2); + clSetKernelArg(k_extract, 7, sizeof(cl_mem), &d_edge_offsets); + clSetKernelArg(k_extract, 8, sizeof(cl_mem), &d_edge_counts); + + /* Run TWICE */ + gs = ((t4_tw + local_size - 1) / local_size) * local_size; + err = clEnqueueNDRangeKernel(queue, k_extract, 1, NULL, + &gs, &local_size, 0, NULL, NULL); + CL_CHECK(err, "enqueue extract t4a"); + clFinish(queue); + + err = clEnqueueNDRangeKernel(queue, k_extract, 1, NULL, + &gs, &local_size, 0, NULL, NULL); + CL_CHECK(err, "enqueue extract t4b"); + clFinish(queue); + + clEnqueueReadBuffer(queue, sec_next_free, CL_TRUE, 0, + sizeof(uint32_t), &h_num_sections, 0, NULL, NULL); + + /* Read counts */ + double h4_counts[4]; + clEnqueueReadBuffer(queue, sec_count, CL_TRUE, 0, + sizeof(double) * h_num_sections, h4_counts, 0, NULL, NULL); + + printf(" Sections created: %u (expected 3 — dedup works)\n", h_num_sections); + + int all_count_2 = 1; + for (uint32_t i = 0; i < h_num_sections; i++) { + printf(" Section %u count: %.1f (expected 2.0)\n", i, h4_counts[i]); + if (fabs(h4_counts[i] - 2.0) > 0.01) all_count_2 = 0; + } + + int t4_pass = (h_num_sections == 3) && all_count_2; + printf(" %s\n\n", t4_pass ? "PASS" : "FAIL"); + if (t4_pass) pass_count++; else fail_count++; + + /* ═══════════════════════════════════════════════════════════════ + * TEST 5: Readback kernel + * + * Use read_sections to verify section pool data matches + * what extract_sections stored. (Reuse state from test 4.) + * ═══════════════════════════════════════════════════════════════ */ + + printf("--- Test 5: Readback kernel ---\n"); + + uint32_t n_secs = h_num_sections; + cl_mem d_out_word = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, + sizeof(uint32_t) * n_secs, NULL, &err); + cl_mem d_out_djh = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, + sizeof(uint64_t) * n_secs, NULL, &err); + cl_mem d_out_count = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, + sizeof(double) * n_secs, NULL, &err); + + clSetKernelArg(k_read, 0, sizeof(cl_mem), &sec_word); + clSetKernelArg(k_read, 1, sizeof(cl_mem), &sec_disjunct_hash); + clSetKernelArg(k_read, 2, sizeof(cl_mem), &sec_count); + clSetKernelArg(k_read, 3, sizeof(cl_mem), &d_out_word); + clSetKernelArg(k_read, 4, sizeof(cl_mem), &d_out_djh); + clSetKernelArg(k_read, 5, sizeof(cl_mem), &d_out_count); + clSetKernelArg(k_read, 6, sizeof(cl_uint), &n_secs); + + gs = ((n_secs + local_size - 1) / local_size) * local_size; + err = clEnqueueNDRangeKernel(queue, k_read, 1, NULL, + &gs, &local_size, 0, NULL, NULL); + CL_CHECK(err, "enqueue read_sections"); + clFinish(queue); + + uint32_t rb_words[4]; + uint64_t rb_djh[4]; + double rb_counts[4]; + clEnqueueReadBuffer(queue, d_out_word, CL_TRUE, 0, + sizeof(uint32_t) * n_secs, rb_words, 0, NULL, NULL); + clEnqueueReadBuffer(queue, d_out_djh, CL_TRUE, 0, + sizeof(uint64_t) * n_secs, rb_djh, 0, NULL, NULL); + clEnqueueReadBuffer(queue, d_out_count, CL_TRUE, 0, + sizeof(double) * n_secs, rb_counts, 0, NULL, NULL); + + /* Should match test 4's data */ + uint32_t h5_words[4]; + uint64_t h5_djh[4]; + double h5_counts[4]; + clEnqueueReadBuffer(queue, sec_word, CL_TRUE, 0, + sizeof(uint32_t) * n_secs, h5_words, 0, NULL, NULL); + clEnqueueReadBuffer(queue, sec_disjunct_hash, CL_TRUE, 0, + sizeof(uint64_t) * n_secs, h5_djh, 0, NULL, NULL); + clEnqueueReadBuffer(queue, sec_count, CL_TRUE, 0, + sizeof(double) * n_secs, h5_counts, 0, NULL, NULL); + + int readback_match = 1; + for (uint32_t i = 0; i < n_secs; i++) { + if (rb_words[i] != h5_words[i] || rb_djh[i] != h5_djh[i] || + fabs(rb_counts[i] - h5_counts[i]) > 0.01) + readback_match = 0; + } + printf(" Readback matches direct read: %s\n", readback_match ? "yes" : "NO"); + + int t5_pass = readback_match; + printf(" %s\n\n", t5_pass ? "PASS" : "FAIL"); + if (t5_pass) pass_count++; else fail_count++; + + clReleaseMemObject(d_out_word); + clReleaseMemObject(d_out_djh); + clReleaseMemObject(d_out_count); + + /* ═══════════════════════════════════════════════════════════════ + * TEST 6: Benchmark — 1000 sentences with random MST edges + * + * Each sentence: 10-20 words, 9-19 MST edges (chain parse) + * Total: ~15000 words, ~14000 edges + * + * Measures extract_sections throughput. + * ═══════════════════════════════════════════════════════════════ */ + + printf("--- Test 6: Benchmark (1000 sentences) ---\n"); + + reset_section_pool(queue, sht_keys, sht_values, sec_count, + sec_next_free, total_sections); + + srand(42); + uint32_t bench_ns = 1000; + uint32_t vocab_size = 500; /* word pool indices 0..499 */ + + /* Generate sentences */ + uint32_t max_words = bench_ns * 25; + uint32_t max_edges = bench_ns * 25; + uint32_t* b_words = malloc(sizeof(uint32_t) * max_words); + uint32_t* b_sent_offsets = malloc(sizeof(uint32_t) * bench_ns); + uint32_t* b_sent_lengths = malloc(sizeof(uint32_t) * bench_ns); + uint32_t* b_edge_p1 = malloc(sizeof(uint32_t) * max_edges); + uint32_t* b_edge_p2 = malloc(sizeof(uint32_t) * max_edges); + uint32_t* b_edge_offsets = malloc(sizeof(uint32_t) * bench_ns); + uint32_t* b_edge_counts = malloc(sizeof(uint32_t) * bench_ns); + + uint32_t word_pos = 0, edge_pos = 0; + for (uint32_t s = 0; s < bench_ns; s++) { + uint32_t slen = 10 + (rand() % 11); /* 10-20 words */ + b_sent_offsets[s] = word_pos; + b_sent_lengths[s] = slen; + b_edge_offsets[s] = edge_pos; + b_edge_counts[s] = slen - 1; /* chain parse */ + + for (uint32_t w = 0; w < slen; w++) { + b_words[word_pos++] = rand() % vocab_size; + } + /* Chain parse: pos 0-1, 1-2, ..., (slen-2)-(slen-1) */ + for (uint32_t e = 0; e < slen - 1; e++) { + b_edge_p1[edge_pos] = e; + b_edge_p2[edge_pos] = e + 1; + edge_pos++; + } + } + + printf(" Total words: %u\n", word_pos); + printf(" Total edges: %u\n", edge_pos); + + clReleaseMemObject(d_flat_words); + clReleaseMemObject(d_sent_offsets); + clReleaseMemObject(d_sent_lengths); + clReleaseMemObject(d_edge_p1); + clReleaseMemObject(d_edge_p2); + clReleaseMemObject(d_edge_offsets); + clReleaseMemObject(d_edge_counts); + + d_flat_words = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * word_pos, b_words, &err); + d_sent_offsets = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * bench_ns, b_sent_offsets, &err); + d_sent_lengths = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * bench_ns, b_sent_lengths, &err); + d_edge_p1 = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * edge_pos, b_edge_p1, &err); + d_edge_p2 = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * edge_pos, b_edge_p2, &err); + d_edge_offsets = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * bench_ns, b_edge_offsets, &err); + d_edge_counts = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(uint32_t) * bench_ns, b_edge_counts, &err); + + cl_uint tw_bench = word_pos; + clSetKernelArg(k_extract, 0, sizeof(cl_mem), &d_flat_words); + clSetKernelArg(k_extract, 1, sizeof(cl_mem), &d_sent_offsets); + clSetKernelArg(k_extract, 2, sizeof(cl_mem), &d_sent_lengths); + clSetKernelArg(k_extract, 3, sizeof(cl_uint), &bench_ns); + clSetKernelArg(k_extract, 4, sizeof(cl_uint), &tw_bench); + clSetKernelArg(k_extract, 5, sizeof(cl_mem), &d_edge_p1); + clSetKernelArg(k_extract, 6, sizeof(cl_mem), &d_edge_p2); + clSetKernelArg(k_extract, 7, sizeof(cl_mem), &d_edge_offsets); + clSetKernelArg(k_extract, 8, sizeof(cl_mem), &d_edge_counts); + + /* Warm up */ + gs = ((tw_bench + local_size - 1) / local_size) * local_size; + clEnqueueNDRangeKernel(queue, k_extract, 1, NULL, + &gs, &local_size, 0, NULL, NULL); + clFinish(queue); + + /* Reset for actual benchmark */ + reset_section_pool(queue, sht_keys, sht_values, sec_count, + sec_next_free, total_sections); + + t0 = now_ms(); + err = clEnqueueNDRangeKernel(queue, k_extract, 1, NULL, + &gs, &local_size, 0, NULL, NULL); + CL_CHECK(err, "enqueue benchmark"); + clFinish(queue); + t1 = now_ms(); + + clEnqueueReadBuffer(queue, sec_next_free, CL_TRUE, 0, + sizeof(uint32_t), &h_num_sections, 0, NULL, NULL); + + double elapsed = t1 - t0; + double secs_per_sec = bench_ns / (elapsed / 1000.0); + double words_per_sec = word_pos / (elapsed / 1000.0); + + printf(" Sections created: %u\n", h_num_sections); + printf(" Time: %.2f ms\n", elapsed); + printf(" Throughput: %.0f sentences/sec, %.0f words/sec\n", + secs_per_sec, words_per_sec); + printf(" Throughput: %.1fM sections/sec\n", + h_num_sections / (elapsed / 1000.0) / 1e6); + + /* Sanity: every word should produce a section in a chain parse, + * but some (word, disjunct) pairs may collide. So sections < words + * but > 0. */ + int t6_pass = (h_num_sections > 0) && (h_num_sections <= word_pos) && + (elapsed < 1000.0); + printf(" %s\n\n", t6_pass ? "PASS" : "FAIL"); + if (t6_pass) pass_count++; else fail_count++; + + /* ─── Summary ─── */ + + printf("=== Results: %d PASS, %d FAIL ===\n", pass_count, fail_count); + + /* Cleanup */ + free(b_words); free(b_sent_offsets); free(b_sent_lengths); + free(b_edge_p1); free(b_edge_p2); free(b_edge_offsets); free(b_edge_counts); + free(ht_src); free(as_src); free(sc_src); free(combined); + + clReleaseMemObject(d_flat_words); + clReleaseMemObject(d_sent_offsets); + clReleaseMemObject(d_sent_lengths); + clReleaseMemObject(d_edge_p1); + clReleaseMemObject(d_edge_p2); + clReleaseMemObject(d_edge_offsets); + clReleaseMemObject(d_edge_counts); + clReleaseMemObject(sht_keys); + clReleaseMemObject(sht_values); + clReleaseMemObject(sec_word); + clReleaseMemObject(sec_disjunct_hash); + clReleaseMemObject(sec_count); + clReleaseMemObject(sec_next_free); + clReleaseMemObject(total_sections); + clReleaseKernel(k_extract); + clReleaseKernel(k_read); + clReleaseProgram(program); + clReleaseCommandQueue(queue); + clReleaseContext(ctx); + + return fail_count > 0 ? 1 : 0; +}