diff --git a/opencog/opencl/atomspace/gpu-cosine.cl b/opencog/opencl/atomspace/gpu-cosine.cl
new file mode 100644
index 0000000..467d936
--- /dev/null
+++ b/opencog/opencl/atomspace/gpu-cosine.cl
@@ -0,0 +1,380 @@
+/*
+ * gpu-cosine.cl -- Cosine similarity and candidate generation on GPU
+ *
+ * Computes pairwise cosine similarity between words based on their
+ * section (disjunct) vectors, entirely on GPU-resident data.
+ *
+ * Pipeline:
+ *   1. compute_word_norms       — ||v||² for each word
+ *   2. build_disjunct_chains    — reverse index: djh → linked list of sections
+ *   3. accumulate_dot_products  — walk chains, accumulate dot(w1,w2)
+ *   4. compute_cosines          — cosine = dot / (||v1|| × ||v2||)
+ *   5. filter_candidates        — compact output above threshold
+ *
+ * No CPU↔GPU data transfer between stages.
+ *
+ * Requires capacity macros (set via -D flags):
+ *   DJH_HT_CAPACITY        — disjunct reverse index hash table size (power of 2)
+ *   CANDIDATE_CAPACITY      — max candidate pairs
+ *   CANDIDATE_HT_CAPACITY   — candidate pair hash table size (power of 2)
+ *
+ * Appended after gpu-hashtable.cl, gpu-atomspace.cl at load time.
+ * (gpu-sections.cl optional — only needed for full pipeline tests)
+ */
+
+/* ═══════════════════════════════════════════════════════════════
+ *  STEP 1: COMPUTE WORD NORMS
+ *
+ *  One thread per section. Atomically adds count² to the word's
+ *  norm accumulator. After all sections processed:
+ *    word_norm_sq[w] = Σ_d count(w,d)²
+ *
+ *  where d ranges over all disjuncts for word w.
+ * ═══════════════════════════════════════════════════════════════ */
+
+__kernel void compute_word_norms(
+    __global const uint*    sec_word,
+    __global const double*  sec_count,
+    __global volatile double* word_norm_sq,
+    const uint              num_sections)
+{
+    uint tid = get_global_id(0);
+    if (tid >= num_sections) return;
+
+    double count = sec_count[tid];
+    if (count < 0.5) return;
+
+    uint word = sec_word[tid];
+    atomic_add_double(&word_norm_sq[word], count * count);
+}
+
+/* ═══════════════════════════════════════════════════════════════
+ *  STEP 2: BUILD DISJUNCT REVERSE INDEX (linked list chains)
+ *
+ *  For each unique disjunct_hash, build a linked list of all
+ *  sections that share that disjunct. Uses lock-free atomic
+ *  prepend to a per-disjunct chain.
+ *
+ *  Data structures:
+ *    djh_ht_keys[DJH_HT_CAPACITY]    — disjunct hash values
+ *    djh_ht_values[DJH_HT_CAPACITY]  — chain head (section index)
+ *    sec_chain_next[SECTION_CAPACITY] — per-section next pointer
+ *
+ *  Chain traversal: start at djh_ht_values[slot], follow
+ *  sec_chain_next[] until HT_EMPTY_VALUE (end of chain).
+ * ═══════════════════════════════════════════════════════════════ */
+
+__kernel void build_disjunct_chains(
+    __global const ulong*    sec_disjunct_hash,
+    __global const double*   sec_count,
+    __global volatile ulong* djh_ht_keys,
+    __global volatile uint*  djh_ht_values,
+    __global uint*           sec_chain_next,
+    const uint               num_sections)
+{
+    uint tid = get_global_id(0);
+    if (tid >= num_sections) return;
+
+    /* Skip empty sections */
+    if (sec_count[tid] < 0.5) {
+        sec_chain_next[tid] = HT_EMPTY_VALUE;
+        return;
+    }
+
+    ulong djh = sec_disjunct_hash[tid];
+    if (djh == HT_EMPTY_KEY) {
+        sec_chain_next[tid] = HT_EMPTY_VALUE;
+        return;
+    }
+
+    ulong cap = DJH_HT_CAPACITY;
+    ulong mask = cap - 1;
+    ulong slot = ht_hash(djh) & mask;
+
+    for (uint probe = 0; probe < HT_MAX_PROBES; probe++)
+    {
+        ulong prev = atom_cmpxchg(&djh_ht_keys[slot], HT_EMPTY_KEY, djh);
+
+        if (prev == HT_EMPTY_KEY || prev == djh)
+        {
+            /* Atomically prepend this section to the chain.
+             * atomic_xchg returns the old head (initially HT_EMPTY_VALUE
+             * for a fresh slot, which serves as the end-of-chain sentinel).
+             * Lock-free: concurrent prepends produce a valid chain. */
+            uint old_head = atomic_xchg(&djh_ht_values[slot], tid);
+            sec_chain_next[tid] = old_head;
+            return;
+        }
+
+        slot = (slot + 1) & mask;
+    }
+
+    /* Hash table full — orphan this section */
+    sec_chain_next[tid] = HT_EMPTY_VALUE;
+}
+
+/* ═══════════════════════════════════════════════════════════════
+ *  STEP 3: ACCUMULATE DOT PRODUCTS
+ *
+ *  One thread per section. For each section (word_i, djh_i, count_i):
+ *    - Look up djh_i in the disjunct reverse index
+ *    - Pre-count chain length; skip if > MAX_CHAIN_LEN
+ *      (very common disjuncts are uninformative, like stopwords)
+ *    - Walk the chain of sections sharing this disjunct
+ *    - For each other section (word_j, djh_i, count_j) where word_j != word_i:
+ *      - Find-or-create candidate pair (word_i, word_j)
+ *      - Atomically add count_i × count_j to the candidate's dot product
+ *
+ *  Uses canonical ordering (word_i < word_j) to avoid double-counting:
+ *  each shared disjunct contributes exactly once to each pair's dot product.
+ *
+ *  Candidate pairs are stored in a hash table + pool (same pattern
+ *  as atom pools from gpu-atomspace.cl).
+ * ═══════════════════════════════════════════════════════════════ */
+
+#ifndef MAX_CHAIN_LEN
+#define MAX_CHAIN_LEN 200
+#endif
+
+__kernel void accumulate_dot_products(
+    /* Section data */
+    __global const uint*     sec_word,
+    __global const ulong*    sec_disjunct_hash,
+    __global const double*   sec_count,
+    /* Disjunct reverse index */
+    __global const ulong*    djh_ht_keys,
+    __global const uint*     djh_ht_values,
+    __global const uint*     sec_chain_next,
+    /* Candidate hash table */
+    __global volatile ulong* cand_ht_keys,
+    __global volatile uint*  cand_ht_values,
+    /* Candidate pool SoA */
+    __global uint*            cand_word_a,
+    __global uint*            cand_word_b,
+    __global volatile double* cand_dot,
+    __global volatile uint*   cand_next_free,
+    /* Size */
+    const uint               num_sections)
+{
+    uint tid = get_global_id(0);
+    if (tid >= num_sections) return;
+
+    uint my_word = sec_word[tid];
+    double my_count = sec_count[tid];
+    if (my_count < 0.5) return;
+
+    ulong my_djh = sec_disjunct_hash[tid];
+    if (my_djh == HT_EMPTY_KEY) return;
+
+    /* ── Look up disjunct in reverse index ── */
+
+    ulong cap = DJH_HT_CAPACITY;
+    ulong mask = cap - 1;
+    ulong slot = ht_hash(my_djh) & mask;
+    uint chain_head = HT_EMPTY_VALUE;
+
+    for (uint probe = 0; probe < HT_MAX_PROBES; probe++)
+    {
+        ulong k = djh_ht_keys[slot];
+        if (k == my_djh) {
+            chain_head = djh_ht_values[slot];
+            break;
+        }
+        if (k == HT_EMPTY_KEY) break;
+        slot = (slot + 1) & mask;
+    }
+
+    if (chain_head == HT_EMPTY_VALUE) return;
+
+    /* ── Pre-count chain length — skip overly common disjuncts ── */
+    /* Disjuncts appearing in > MAX_CHAIN_LEN sections are too common
+     * to be discriminative (like stopwords in information retrieval).
+     * Skipping them entirely avoids O(K²) explosion for popular chains
+     * and matches the CPU reverse-index filter. */
+
+    uint chain_len = 0;
+    uint pre = chain_head;
+    while (pre != HT_EMPTY_VALUE && chain_len <= MAX_CHAIN_LEN)
+    {
+        chain_len++;
+        pre = sec_chain_next[pre];
+    }
+    if (chain_len > MAX_CHAIN_LEN) return;
+
+    /* ── Walk chain, accumulate dot products ── */
+
+    uint cur = chain_head;
+
+    while (cur != HT_EMPTY_VALUE)
+    {
+        uint other_word = sec_word[cur];
+        double other_count = sec_count[cur];
+
+        /* Only accumulate once per pair (canonical: my_word < other_word) */
+        if (other_word != my_word && my_word < other_word && other_count >= 0.5)
+        {
+            /* ── Find or create candidate pair ── */
+
+            uint lo = my_word;    /* already < other_word */
+            uint hi = other_word;
+            ulong cand_key = ((ulong)lo << 32) | (ulong)hi;
+
+            ulong c_cap = CANDIDATE_HT_CAPACITY;
+            ulong c_mask = c_cap - 1;
+            ulong c_slot = ht_hash(cand_key) & c_mask;
+            uint cand_idx = HT_EMPTY_VALUE;
+
+            for (uint p = 0; p < HT_MAX_PROBES; p++)
+            {
+                ulong prev = atom_cmpxchg(
+                    &cand_ht_keys[c_slot], HT_EMPTY_KEY, cand_key);
+
+                if (prev == HT_EMPTY_KEY)
+                {
+                    /* New candidate — allocate from pool */
+                    uint idx = atomic_add(cand_next_free, 1U);
+                    if (idx < CANDIDATE_CAPACITY) {
+                        cand_word_a[idx] = lo;
+                        cand_word_b[idx] = hi;
+                        cand_dot[idx] = 0.0;
+                        mem_fence(CLK_GLOBAL_MEM_FENCE);
+                        cand_ht_values[c_slot] = idx;
+                        cand_idx = idx;
+                    }
+                    break;
+                }
+                if (prev == cand_key)
+                {
+                    /* Existing candidate — spin for value */
+                    uint val = cand_ht_values[c_slot];
+                    while (val == HT_EMPTY_VALUE) {
+                        val = cand_ht_values[c_slot];
+                    }
+                    cand_idx = val;
+                    break;
+                }
+
+                c_slot = (c_slot + 1) & c_mask;
+            }
+
+            if (cand_idx != HT_EMPTY_VALUE)
+            {
+                atomic_add_double(&cand_dot[cand_idx],
+                                  my_count * other_count);
+            }
+        }
+
+        cur = sec_chain_next[cur];
+    }
+}
+
+/* ═══════════════════════════════════════════════════════════════
+ *  STEP 4: COMPUTE COSINES
+ *
+ *  One thread per candidate. Converts accumulated dot products
+ *  to cosine similarity using pre-computed word norms.
+ *
+ *  cosine(w1, w2) = dot(w1, w2) / (||w1|| × ||w2||)
+ * ═══════════════════════════════════════════════════════════════ */
+
+/* MIN_NORM_SQ filters rare words: a word with N sections each count=1
+ * has norm_sq = N. Setting MIN_NORM_SQ = 5.0 requires >= 5 section types.
+ * This prevents hapax legomena from getting cosine=1.0 by chance. */
+#ifndef MIN_NORM_SQ
+#define MIN_NORM_SQ 50.0
+#endif
+
+__kernel void compute_cosines(
+    __global const uint*   cand_word_a,
+    __global const uint*   cand_word_b,
+    __global const double* cand_dot,
+    __global double*       cand_cosine,
+    __global const double* word_norm_sq,
+    const uint             num_candidates)
+{
+    uint tid = get_global_id(0);
+    if (tid >= num_candidates) return;
+
+    uint wa = cand_word_a[tid];
+    uint wb = cand_word_b[tid];
+    double dot = cand_dot[tid];
+
+    double na = word_norm_sq[wa];
+    double nb = word_norm_sq[wb];
+
+    /* Skip rare words — not enough data for meaningful cosine */
+    if (na < MIN_NORM_SQ || nb < MIN_NORM_SQ) {
+        cand_cosine[tid] = 0.0;
+        return;
+    }
+
+    double denom = sqrt(na) * sqrt(nb);
+    double cos_val = (denom > 1e-10) ? (dot / denom) : 0.0;
+
+    /* Clamp to valid range — prevents transient GPU state issues */
+    if (cos_val > 1.0) cos_val = 1.0;
+    if (cos_val < -1.0) cos_val = -1.0;
+
+    cand_cosine[tid] = cos_val;
+}
+
+/* ═══════════════════════════════════════════════════════════════
+ *  STEP 5: FILTER CANDIDATES
+ *
+ *  Compact candidates above a cosine threshold into contiguous
+ *  output arrays. Returns (word_a, word_b, cosine) for each
+ *  passing pair.
+ *
+ *  out_count must be initialized to 0 before kernel launch.
+ * ═══════════════════════════════════════════════════════════════ */
+
+__kernel void filter_candidates(
+    __global const uint*    cand_word_a,
+    __global const uint*    cand_word_b,
+    __global const double*  cand_cosine,
+    const uint              num_candidates,
+    const double            threshold,
+    __global uint*          out_word_a,
+    __global uint*          out_word_b,
+    __global double*        out_cosine,
+    __global volatile uint* out_count,
+    const uint              max_output)
+{
+    uint tid = get_global_id(0);
+    if (tid >= num_candidates) return;
+
+    double cos_val = cand_cosine[tid];
+    if (cos_val > threshold)
+    {
+        uint idx = atomic_add(out_count, 1U);
+        if (idx < max_output) {
+            out_word_a[idx]  = cand_word_a[tid];
+            out_word_b[idx]  = cand_word_b[tid];
+            out_cosine[idx]  = cos_val;
+        }
+    }
+}
+
+/* ═══════════════════════════════════════════════════════════════
+ *  READBACK: Dump candidate data for verification
+ * ═══════════════════════════════════════════════════════════════ */
+
+__kernel void read_candidates(
+    __global const uint*   cand_word_a,
+    __global const uint*   cand_word_b,
+    __global const double* cand_dot,
+    __global const double* cand_cosine,
+    __global uint*         out_word_a,
+    __global uint*         out_word_b,
+    __global double*       out_dot,
+    __global double*       out_cosine,
+    const uint             num_candidates)
+{
+    uint tid = get_global_id(0);
+    if (tid >= num_candidates) return;
+
+    out_word_a[tid]  = cand_word_a[tid];
+    out_word_b[tid]  = cand_word_b[tid];
+    out_dot[tid]     = cand_dot[tid];
+    out_cosine[tid]  = cand_cosine[tid];
+}
diff --git a/opencog/opencl/atomspace/gpu-sections.cl b/opencog/opencl/atomspace/gpu-sections.cl
new file mode 100644
index 0000000..83641b8
--- /dev/null
+++ b/opencog/opencl/atomspace/gpu-sections.cl
@@ -0,0 +1,317 @@
+/*
+ * gpu-sections.cl -- Section extraction from MST/PMFG edges on GPU
+ *
+ * Given MST edges for a batch of sentences (as position pairs within
+ * each sentence), extract sections (word + disjunct) and increment
+ * SectionPool counts. Disjunct hashing is done entirely on GPU —
+ * no strings involved.
+ *
+ * A section = (word, disjunct), where the disjunct is the set of
+ * connectors for that word in the parse. A connector = (partner_word,
+ * direction), where direction is '+' (right) or '-' (left).
+ *
+ * On GPU: connector = (word_pool_idx, direction_bit).
+ * Disjunct = FNV-1a hash of sorted connector sequence.
+ *
+ * Appended after gpu-hashtable.cl, gpu-atomspace.cl at load time.
+ */
+
+/* Maximum connectors per word position (degree limit).
+ * PMFG on a 20-word sentence has at most 54 edges; per word,
+ * planar graph average degree < 6. 32 is very safe. */
+#define MAX_CONNECTORS 32
+
+/* Connector direction bits */
+#define DIR_LEFT  0U   /* partner is to the LEFT of this word */
+#define DIR_RIGHT 1U   /* partner is to the RIGHT of this word */
+
+/* ═══════════════════════════════════════════════════════════════
+ *  FNV-1a HASH for disjunct encoding
+ *
+ *  Hash a sorted sequence of (word_pool_idx, direction) pairs
+ *  into a single 64-bit disjunct hash.
+ * ═══════════════════════════════════════════════════════════════ */
+
+inline ulong fnv1a_init(void)
+{
+    return 0xcbf29ce484222325UL;
+}
+
+inline ulong fnv1a_mix(ulong hash, ulong val)
+{
+    hash ^= val;
+    hash *= 0x100000001b3UL;
+    return hash;
+}
+
+inline ulong hash_disjunct(uint* conn_words, uint* conn_dirs, uint count)
+{
+    ulong h = fnv1a_init();
+    for (uint i = 0; i < count; i++) {
+        /* Encode: (word_pool_idx << 1) | direction_bit */
+        ulong encoded = ((ulong)conn_words[i] << 1) | (ulong)conn_dirs[i];
+        h = fnv1a_mix(h, encoded);
+    }
+    /* Ensure it's never the hash table sentinel */
+    if (h == HT_EMPTY_KEY) h = 0;
+    return h;
+}
+
+/* ═══════════════════════════════════════════════════════════════
+ *  INSERTION SORT for connectors
+ *
+ *  Sort by: direction first (LEFT=0 before RIGHT=1),
+ *           then by word_pool_idx (ascending).
+ *
+ *  This gives a deterministic disjunct representation:
+ *  all left connectors (sorted by partner) then all right
+ *  connectors (sorted by partner).
+ * ═══════════════════════════════════════════════════════════════ */
+
+inline void sort_connectors(uint* conn_words, uint* conn_dirs, uint count)
+{
+    /* Insertion sort — count is small (typically < 10) */
+    for (uint i = 1; i < count; i++) {
+        uint w = conn_words[i];
+        uint d = conn_dirs[i];
+        uint j = i;
+        while (j > 0) {
+            /* Compare: direction first, then word index */
+            int swap = 0;
+            if (conn_dirs[j-1] > d)
+                swap = 1;
+            else if (conn_dirs[j-1] == d && conn_words[j-1] > w)
+                swap = 1;
+
+            if (!swap) break;
+
+            conn_words[j] = conn_words[j-1];
+            conn_dirs[j]  = conn_dirs[j-1];
+            j--;
+        }
+        conn_words[j] = w;
+        conn_dirs[j]  = d;
+    }
+}
+
+/* ═══════════════════════════════════════════════════════════════
+ *  EXTRACT SECTIONS FROM MST EDGES
+ *
+ *  One thread per word position across all sentences.
+ *  Each thread:
+ *    1. Finds its sentence (binary search)
+ *    2. Scans MST edges for this sentence
+ *    3. Collects connectors (partner word + direction)
+ *    4. Sorts connectors deterministically
+ *    5. Hashes sorted sequence → disjunct_hash
+ *    6. Find-or-create section in SectionPool
+ *    7. Atomically increments section count
+ *
+ *  Words with no edges (isolated) produce no section.
+ *
+ *  Args:
+ *    -- Sentence data --
+ *    flat_words        - word pool indices [total_words]
+ *    sent_offsets      - start of each sentence in flat_words [num_sentences]
+ *    sent_lengths      - length of each sentence [num_sentences]
+ *    num_sentences     - number of sentences
+ *    total_words       - total words across all sentences
+ *
+ *    -- MST edge data --
+ *    edge_p1           - position 1 within sentence [total_edges]
+ *    edge_p2           - position 2 within sentence [total_edges]
+ *    edge_offsets      - start of each sentence's edges [num_sentences]
+ *    edge_counts       - number of edges per sentence [num_sentences]
+ *
+ *    -- Section pool + hash table --
+ *    sht_keys, sht_values  - section hash table
+ *    sec_word              - section pool: word indices
+ *    sec_disjunct_hash     - section pool: disjunct hashes
+ *    sec_count             - section pool: counts
+ *    sec_next_free         - section bump allocator
+ *
+ *    -- Stats --
+ *    total_sections_created - atomic counter for new sections
+ * ═══════════════════════════════════════════════════════════════ */
+
+__kernel void extract_sections(
+    /* sentence data */
+    __global const uint*     flat_words,
+    __global const uint*     sent_offsets,
+    __global const uint*     sent_lengths,
+    const uint               num_sentences,
+    const uint               total_words,
+    /* MST edge data */
+    __global const uint*     edge_p1,
+    __global const uint*     edge_p2,
+    __global const uint*     edge_offsets,
+    __global const uint*     edge_counts,
+    /* section hash table */
+    __global volatile ulong* sht_keys,
+    __global volatile uint*  sht_values,
+    /* section pool SoA */
+    __global uint*           sec_word,
+    __global ulong*          sec_disjunct_hash,
+    __global volatile double* sec_count,
+    __global volatile uint*  sec_next_free,
+    /* stats */
+    __global volatile uint*  total_sections_created)
+{
+    uint tid = get_global_id(0);
+    if (tid >= total_words) return;
+
+    /* ── Find which sentence this word belongs to (binary search) ── */
+
+    uint lo_s = 0, hi_s = num_sentences;
+    while (lo_s < hi_s) {
+        uint mid = (lo_s + hi_s) / 2;
+        if (sent_offsets[mid] + sent_lengths[mid] <= tid)
+            lo_s = mid + 1;
+        else
+            hi_s = mid;
+    }
+    uint sent_idx = lo_s;
+    if (sent_idx >= num_sentences) return;
+
+    uint sent_start = sent_offsets[sent_idx];
+    uint sent_len   = sent_lengths[sent_idx];
+    uint pos_in_sent = tid - sent_start;
+
+    /* Verify we're in this sentence */
+    if (pos_in_sent >= sent_len) return;
+
+    uint my_word = flat_words[tid];
+
+    /* ── Collect connectors from MST edges ── */
+
+    uint conn_words[MAX_CONNECTORS];
+    uint conn_dirs[MAX_CONNECTORS];
+    uint conn_count = 0;
+
+    uint e_start = edge_offsets[sent_idx];
+    uint e_count = edge_counts[sent_idx];
+
+    for (uint e = 0; e < e_count && conn_count < MAX_CONNECTORS; e++)
+    {
+        uint p1 = edge_p1[e_start + e];
+        uint p2 = edge_p2[e_start + e];
+
+        if (p1 == pos_in_sent)
+        {
+            /* Edge goes from me to p2 → p2 is to the right */
+            uint partner_word = flat_words[sent_start + p2];
+            conn_words[conn_count] = partner_word;
+            conn_dirs[conn_count]  = (p2 > pos_in_sent) ? DIR_RIGHT : DIR_LEFT;
+            conn_count++;
+        }
+        else if (p2 == pos_in_sent)
+        {
+            /* Edge goes from p1 to me → p1 is to the left */
+            uint partner_word = flat_words[sent_start + p1];
+            conn_words[conn_count] = partner_word;
+            conn_dirs[conn_count]  = (p1 < pos_in_sent) ? DIR_LEFT : DIR_RIGHT;
+            conn_count++;
+        }
+    }
+
+    /* No connectors = isolated word → no section */
+    if (conn_count == 0) return;
+
+    /* ── Sort connectors deterministically ── */
+
+    sort_connectors(conn_words, conn_dirs, conn_count);
+
+    /* ── Hash sorted connectors → disjunct_hash ── */
+
+    ulong djh = hash_disjunct(conn_words, conn_dirs, conn_count);
+
+    /* ── Find or create section in SectionPool ── */
+
+    ulong key = section_key(my_word, djh);
+
+    ulong ht_cap = SECTION_HT_CAPACITY;
+    ulong mask = ht_cap - 1;
+    ulong slot = ht_hash(key) & mask;
+    uint sec_idx = HT_EMPTY_VALUE;
+
+    for (uint probe = 0; probe < HT_MAX_PROBES; probe++)
+    {
+        ulong prev = atom_cmpxchg(&sht_keys[slot], HT_EMPTY_KEY, key);
+
+        if (prev == HT_EMPTY_KEY)
+        {
+            /* New section — allocate from pool */
+            uint idx = atomic_add(sec_next_free, 1U);
+            if (idx < SECTION_CAPACITY) {
+                sec_word[idx] = my_word;
+                sec_disjunct_hash[idx] = djh;
+                sec_count[idx] = 0.0;
+                mem_fence(CLK_GLOBAL_MEM_FENCE);
+                sht_values[slot] = idx;
+                sec_idx = idx;
+                atomic_add(total_sections_created, 1U);
+            }
+            break;
+        }
+        if (prev == key)
+        {
+            /* Existing section — spin for value */
+            uint val = sht_values[slot];
+            while (val == HT_EMPTY_VALUE) {
+                val = sht_values[slot];
+            }
+            sec_idx = val;
+            break;
+        }
+
+        slot = (slot + 1) & mask;
+    }
+
+    if (sec_idx == HT_EMPTY_VALUE) return;
+
+    /* ── Increment section count ── */
+
+    atomic_add_double(&sec_count[sec_idx], 1.0);
+}
+
+/* ═══════════════════════════════════════════════════════════════
+ *  READBACK: Dump section data for verification
+ *
+ *  Reads section pool entries into flat output arrays.
+ * ═══════════════════════════════════════════════════════════════ */
+
+__kernel void read_sections(
+    __global const uint*   sec_word,
+    __global const ulong*  sec_disjunct_hash,
+    __global const double* sec_count,
+    __global uint*         out_word,
+    __global ulong*        out_disjunct_hash,
+    __global double*       out_count,
+    const uint             num_sections)
+{
+    uint tid = get_global_id(0);
+    if (tid >= num_sections) return;
+
+    out_word[tid]           = sec_word[tid];
+    out_disjunct_hash[tid]  = sec_disjunct_hash[tid];
+    out_count[tid]          = sec_count[tid];
+}
+
+/* ═══════════════════════════════════════════════════════════════
+ *  EXTRACT SECTIONS + COUNT PAIRS (combined kernel)
+ *
+ *  Same as extract_sections but also counts section pairs
+ *  within a window (for Level 1 vocabulary learning).
+ *
+ *  Each word position produces one section. Adjacent sections
+ *  (within pair_window) form section pairs. Section pair counts
+ *  are stored via the pair pool (reusing pair_find_or_create
+ *  with section indices instead of word indices).
+ *
+ *  This is Phase 4's full pipeline: edges → sections → section pairs
+ *  all in one kernel launch.
+ * ═══════════════════════════════════════════════════════════════ */
+
+/* Note: Section pair counting is deferred to Phase 5, where the
+ * section pool is used for cosine similarity. The basic extract_sections
+ * kernel above is sufficient for Phase 4. */
diff --git a/opencog/opencl/atomspace/test-cosine.c b/opencog/opencl/atomspace/test-cosine.c
new file mode 100644
index 0000000..134d0f3
--- /dev/null
+++ b/opencog/opencl/atomspace/test-cosine.c
@@ -0,0 +1,954 @@
+/*
+ * test-cosine.c -- Test GPU cosine similarity and candidate generation
+ *
+ * Compile: gcc -O2 -o test-cosine test-cosine.c -lOpenCL -lm
+ * Run:     ./test-cosine
+ *
+ * Tests:
+ *   1. Known cosine (2 words, 4 sections, exact verification)
+ *   2. Three words — all pairwise cosines
+ *   3. Identical vectors → cosine = 1.0
+ *   4. No shared disjuncts → 0 candidates
+ *   5. Filter candidates above threshold
+ *   6. Benchmark: 1000 sentences → sections → cosines (full pipeline)
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+#include <time.h>
+#include <CL/cl.h>
+
+/* ─── Pool capacities ─── */
+
+#define WORD_CAPACITY          (128 * 1024)
+#define PAIR_CAPACITY          (4 * 1024 * 1024)
+#define SECTION_CAPACITY       (1024 * 1024)
+#define WORD_HT_CAPACITY       (256 * 1024)
+#define PAIR_HT_CAPACITY       (8 * 1024 * 1024)
+#define SECTION_HT_CAPACITY    (2 * 1024 * 1024)
+
+/* Phase 5 capacities */
+#define DJH_HT_CAPACITY        (2 * 1024 * 1024)
+#define CANDIDATE_CAPACITY     (512 * 1024)
+#define CANDIDATE_HT_CAPACITY  (1024 * 1024)
+
+#define HT_EMPTY_KEY    0xFFFFFFFFFFFFFFFFULL
+#define HT_EMPTY_VALUE  0xFFFFFFFFU
+
+/* ─── Error checking ─── */
+
+#define CL_CHECK(err, msg) do { \
+    if ((err) != CL_SUCCESS) { \
+        fprintf(stderr, "OpenCL error %d at %s:%d: %s\n", \
+                (err), __FILE__, __LINE__, (msg)); \
+        exit(1); \
+    } \
+} while(0)
+
+/* ─── Read file ─── */
+
+char* read_file(const char* path, size_t* len)
+{
+    FILE* f = fopen(path, "r");
+    if (!f) { fprintf(stderr, "Cannot open %s\n", path); exit(1); }
+    fseek(f, 0, SEEK_END);
+    *len = ftell(f);
+    fseek(f, 0, SEEK_SET);
+    char* buf = malloc(*len + 1);
+    size_t n = fread(buf, 1, *len, f);
+    buf[n] = '\0';
+    *len = n;
+    fclose(f);
+    return buf;
+}
+
+/* ─── Timing ─── */
+
+double now_ms(void)
+{
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return ts.tv_sec * 1000.0 + ts.tv_nsec / 1000000.0;
+}
+
+/* ─── Helper: Concatenate sources ─── */
+
+char* concat_sources(const char** srcs, size_t* lens, int n, size_t* total)
+{
+    *total = 0;
+    for (int i = 0; i < n; i++) *total += lens[i] + 1;
+    char* buf = malloc(*total + 1);
+    size_t pos = 0;
+    for (int i = 0; i < n; i++) {
+        memcpy(buf + pos, srcs[i], lens[i]);
+        pos += lens[i];
+        buf[pos++] = '\n';
+    }
+    buf[pos] = '\0';
+    *total = pos;
+    return buf;
+}
+
+/* ─── Helper: reset cosine pipeline buffers ─── */
+
+void reset_cosine_buffers(cl_command_queue queue,
+    cl_mem djh_ht_keys, cl_mem djh_ht_values,
+    cl_mem sec_chain_next, cl_mem word_norm_sq,
+    cl_mem cand_ht_keys, cl_mem cand_ht_values,
+    cl_mem cand_dot, cl_mem cand_cosine,
+    cl_mem cand_next_free)
+{
+    uint8_t pat_ff = 0xFF;
+    uint8_t pat_00 = 0x00;
+    uint32_t zero = 0;
+
+    /* Disjunct reverse index HT */
+    clEnqueueFillBuffer(queue, djh_ht_keys, &pat_ff, 1, 0,
+        sizeof(uint64_t) * DJH_HT_CAPACITY, 0, NULL, NULL);
+    clEnqueueFillBuffer(queue, djh_ht_values, &pat_ff, 1, 0,
+        sizeof(uint32_t) * DJH_HT_CAPACITY, 0, NULL, NULL);
+
+    /* Section chain pointers */
+    clEnqueueFillBuffer(queue, sec_chain_next, &pat_ff, 1, 0,
+        sizeof(uint32_t) * SECTION_CAPACITY, 0, NULL, NULL);
+
+    /* Word norms */
+    clEnqueueFillBuffer(queue, word_norm_sq, &pat_00, 1, 0,
+        sizeof(double) * WORD_CAPACITY, 0, NULL, NULL);
+
+    /* Candidate HT */
+    clEnqueueFillBuffer(queue, cand_ht_keys, &pat_ff, 1, 0,
+        sizeof(uint64_t) * CANDIDATE_HT_CAPACITY, 0, NULL, NULL);
+    clEnqueueFillBuffer(queue, cand_ht_values, &pat_ff, 1, 0,
+        sizeof(uint32_t) * CANDIDATE_HT_CAPACITY, 0, NULL, NULL);
+
+    /* Candidate pool */
+    clEnqueueFillBuffer(queue, cand_dot, &pat_00, 1, 0,
+        sizeof(double) * CANDIDATE_CAPACITY, 0, NULL, NULL);
+    clEnqueueFillBuffer(queue, cand_cosine, &pat_00, 1, 0,
+        sizeof(double) * CANDIDATE_CAPACITY, 0, NULL, NULL);
+    clEnqueueWriteBuffer(queue, cand_next_free, CL_FALSE, 0,
+        sizeof(uint32_t), &zero, 0, NULL, NULL);
+
+    clFinish(queue);
+}
+
+/* ─── Helper: reset section pool ─── */
+
+void reset_section_pool(cl_command_queue queue,
+    cl_mem sht_keys, cl_mem sht_values,
+    cl_mem sec_count, cl_mem sec_next_free)
+{
+    uint8_t pat_ff = 0xFF;
+    uint8_t pat_00 = 0x00;
+    uint32_t zero = 0;
+
+    clEnqueueFillBuffer(queue, sht_keys, &pat_ff, 1, 0,
+        sizeof(uint64_t) * SECTION_HT_CAPACITY, 0, NULL, NULL);
+    clEnqueueFillBuffer(queue, sht_values, &pat_ff, 1, 0,
+        sizeof(uint32_t) * SECTION_HT_CAPACITY, 0, NULL, NULL);
+    clEnqueueFillBuffer(queue, sec_count, &pat_00, 1, 0,
+        sizeof(double) * SECTION_CAPACITY, 0, NULL, NULL);
+    clEnqueueWriteBuffer(queue, sec_next_free, CL_FALSE, 0,
+        sizeof(uint32_t), &zero, 0, NULL, NULL);
+    clFinish(queue);
+}
+
+/* ─── Helper: manually populate sections ─── */
+
+void upload_sections(cl_command_queue queue,
+    cl_mem sec_word, cl_mem sec_disjunct_hash, cl_mem sec_count,
+    cl_mem sec_next_free,
+    uint32_t* words, uint64_t* djhs, double* counts, uint32_t n)
+{
+    clEnqueueWriteBuffer(queue, sec_word, CL_FALSE, 0,
+        sizeof(uint32_t) * n, words, 0, NULL, NULL);
+    clEnqueueWriteBuffer(queue, sec_disjunct_hash, CL_FALSE, 0,
+        sizeof(uint64_t) * n, djhs, 0, NULL, NULL);
+    clEnqueueWriteBuffer(queue, sec_count, CL_FALSE, 0,
+        sizeof(double) * n, counts, 0, NULL, NULL);
+    clEnqueueWriteBuffer(queue, sec_next_free, CL_FALSE, 0,
+        sizeof(uint32_t), &n, 0, NULL, NULL);
+    clFinish(queue);
+}
+
+/* ─── Helper: run cosine pipeline ─── */
+
+void run_cosine_pipeline(cl_command_queue queue,
+    cl_kernel k_norms, cl_kernel k_chains,
+    cl_kernel k_dots, cl_kernel k_cosines,
+    uint32_t num_sections,
+    cl_mem cand_next_free, uint32_t* out_num_candidates)
+{
+    size_t local = 256;
+    size_t gs;
+    cl_int err;
+
+    /* Step 1: Word norms */
+    gs = ((num_sections + local - 1) / local) * local;
+    err = clEnqueueNDRangeKernel(queue, k_norms, 1, NULL,
+        &gs, &local, 0, NULL, NULL);
+    CL_CHECK(err, "enqueue compute_word_norms");
+
+    /* Step 2: Disjunct chains */
+    err = clEnqueueNDRangeKernel(queue, k_chains, 1, NULL,
+        &gs, &local, 0, NULL, NULL);
+    CL_CHECK(err, "enqueue build_disjunct_chains");
+
+    /* Step 3: Dot products */
+    err = clEnqueueNDRangeKernel(queue, k_dots, 1, NULL,
+        &gs, &local, 0, NULL, NULL);
+    CL_CHECK(err, "enqueue accumulate_dot_products");
+
+    clFinish(queue);
+
+    /* Read candidate count */
+    clEnqueueReadBuffer(queue, cand_next_free, CL_TRUE, 0,
+        sizeof(uint32_t), out_num_candidates, 0, NULL, NULL);
+
+    if (*out_num_candidates == 0) return;
+
+    /* Step 4: Cosines */
+    gs = ((*out_num_candidates + local - 1) / local) * local;
+    cl_uint nc = *out_num_candidates;
+    clSetKernelArg(k_cosines, 5, sizeof(cl_uint), &nc);
+    err = clEnqueueNDRangeKernel(queue, k_cosines, 1, NULL,
+        &gs, &local, 0, NULL, NULL);
+    CL_CHECK(err, "enqueue compute_cosines");
+
+    clFinish(queue);
+}
+
+/* ─── Main ─── */
+
+int main(int argc, char** argv)
+{
+    cl_int err;
+    int pass_count = 0, fail_count = 0;
+
+    printf("=== GPU Cosine Similarity Test ===\n\n");
+
+    /* ─── OpenCL setup ─── */
+
+    cl_platform_id platform;
+    err = clGetPlatformIDs(1, &platform, NULL);
+    CL_CHECK(err, "platform");
+
+    cl_device_id device;
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    CL_CHECK(err, "device");
+
+    char dev_name[256];
+    clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(dev_name), dev_name, NULL);
+    printf("GPU: %s\n", dev_name);
+
+    cl_context ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
+    CL_CHECK(err, "context");
+
+    cl_command_queue queue = clCreateCommandQueue(ctx, device, 0, &err);
+    CL_CHECK(err, "queue");
+
+    /* ─── Load and concatenate kernel sources ─── */
+
+    size_t len_ht, len_as, len_sc, len_cos;
+    char* src_ht  = read_file("gpu-hashtable.cl", &len_ht);
+    char* src_as  = read_file("gpu-atomspace.cl", &len_as);
+    char* src_sc  = read_file("gpu-sections.cl", &len_sc);
+    char* src_cos = read_file("gpu-cosine.cl", &len_cos);
+
+    const char* srcs[] = {src_ht, src_as, src_sc, src_cos};
+    size_t lens[] = {len_ht, len_as, len_sc, len_cos};
+    size_t total_len;
+    char* combined = concat_sources(srcs, lens, 4, &total_len);
+
+    cl_program program = clCreateProgramWithSource(ctx, 1,
+        (const char**)&combined, &total_len, &err);
+    CL_CHECK(err, "create program");
+
+    char build_opts[1024];
+    snprintf(build_opts, sizeof(build_opts),
+        "-cl-std=CL1.2 "
+        "-DWORD_CAPACITY=%d "
+        "-DPAIR_CAPACITY=%d "
+        "-DSECTION_CAPACITY=%d "
+        "-DWORD_HT_CAPACITY=%d "
+        "-DPAIR_HT_CAPACITY=%d "
+        "-DSECTION_HT_CAPACITY=%d "
+        "-DDJH_HT_CAPACITY=%d "
+        "-DCANDIDATE_CAPACITY=%d "
+        "-DCANDIDATE_HT_CAPACITY=%d",
+        WORD_CAPACITY, PAIR_CAPACITY, SECTION_CAPACITY,
+        WORD_HT_CAPACITY, PAIR_HT_CAPACITY, SECTION_HT_CAPACITY,
+        DJH_HT_CAPACITY, CANDIDATE_CAPACITY, CANDIDATE_HT_CAPACITY);
+
+    err = clBuildProgram(program, 1, &device, build_opts, NULL, NULL);
+    if (err != CL_SUCCESS) {
+        char log[16384];
+        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
+                              sizeof(log), log, NULL);
+        fprintf(stderr, "Build error:\n%s\n", log);
+        return 1;
+    }
+    printf("Kernels compiled successfully\n\n");
+
+    /* ─── Create kernels ─── */
+
+    cl_kernel k_norms   = clCreateKernel(program, "compute_word_norms", &err);
+    CL_CHECK(err, "kernel compute_word_norms");
+    cl_kernel k_chains  = clCreateKernel(program, "build_disjunct_chains", &err);
+    CL_CHECK(err, "kernel build_disjunct_chains");
+    cl_kernel k_dots    = clCreateKernel(program, "accumulate_dot_products", &err);
+    CL_CHECK(err, "kernel accumulate_dot_products");
+    cl_kernel k_cosines = clCreateKernel(program, "compute_cosines", &err);
+    CL_CHECK(err, "kernel compute_cosines");
+    cl_kernel k_filter  = clCreateKernel(program, "filter_candidates", &err);
+    CL_CHECK(err, "kernel filter_candidates");
+    cl_kernel k_extract = clCreateKernel(program, "extract_sections", &err);
+    CL_CHECK(err, "kernel extract_sections");
+
+    /* ─── Allocate GPU buffers ─── */
+
+    printf("Allocating GPU buffers...\n");
+    uint32_t zero = 0;
+
+    /* Section pool */
+    cl_mem sec_word = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
+        sizeof(uint32_t) * SECTION_CAPACITY, NULL, &err);
+    cl_mem sec_disjunct_hash = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
+        sizeof(uint64_t) * SECTION_CAPACITY, NULL, &err);
+    cl_mem sec_count = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
+        sizeof(double) * SECTION_CAPACITY, NULL, &err);
+    cl_mem sec_next_free = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t), &zero, &err);
+
+    /* Section hash table (for extract_sections) */
+    cl_mem sht_keys = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
+        sizeof(uint64_t) * SECTION_HT_CAPACITY, NULL, &err);
+    cl_mem sht_values = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
+        sizeof(uint32_t) * SECTION_HT_CAPACITY, NULL, &err);
+
+    /* Total sections counter (for extract_sections) */
+    cl_mem total_sections_created = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t), &zero, &err);
+
+    /* Disjunct reverse index HT */
+    cl_mem djh_ht_keys = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
+        sizeof(uint64_t) * DJH_HT_CAPACITY, NULL, &err);
+    CL_CHECK(err, "djh_ht_keys");
+    cl_mem djh_ht_values = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
+        sizeof(uint32_t) * DJH_HT_CAPACITY, NULL, &err);
+    CL_CHECK(err, "djh_ht_values");
+
+    /* Section chain pointers */
+    cl_mem sec_chain_next = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
+        sizeof(uint32_t) * SECTION_CAPACITY, NULL, &err);
+    CL_CHECK(err, "sec_chain_next");
+
+    /* Word norms */
+    cl_mem word_norm_sq = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
+        sizeof(double) * WORD_CAPACITY, NULL, &err);
+    CL_CHECK(err, "word_norm_sq");
+
+    /* Candidate HT */
+    cl_mem cand_ht_keys = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
+        sizeof(uint64_t) * CANDIDATE_HT_CAPACITY, NULL, &err);
+    cl_mem cand_ht_values = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
+        sizeof(uint32_t) * CANDIDATE_HT_CAPACITY, NULL, &err);
+
+    /* Candidate pool */
+    cl_mem cand_word_a = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
+        sizeof(uint32_t) * CANDIDATE_CAPACITY, NULL, &err);
+    cl_mem cand_word_b = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
+        sizeof(uint32_t) * CANDIDATE_CAPACITY, NULL, &err);
+    cl_mem cand_dot = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
+        sizeof(double) * CANDIDATE_CAPACITY, NULL, &err);
+    cl_mem cand_cosine = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
+        sizeof(double) * CANDIDATE_CAPACITY, NULL, &err);
+    cl_mem cand_next_free = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t), &zero, &err);
+
+    /* ─── Set kernel args (persistent across tests) ─── */
+
+    /* compute_word_norms: (sec_word, sec_count, word_norm_sq, num_sections) */
+    clSetKernelArg(k_norms, 0, sizeof(cl_mem), &sec_word);
+    clSetKernelArg(k_norms, 1, sizeof(cl_mem), &sec_count);
+    clSetKernelArg(k_norms, 2, sizeof(cl_mem), &word_norm_sq);
+    /* arg 3 = num_sections — set per test */
+
+    /* build_disjunct_chains: (sec_djh, sec_count, djh_ht_keys, djh_ht_values,
+     *                         sec_chain_next, num_sections) */
+    clSetKernelArg(k_chains, 0, sizeof(cl_mem), &sec_disjunct_hash);
+    clSetKernelArg(k_chains, 1, sizeof(cl_mem), &sec_count);
+    clSetKernelArg(k_chains, 2, sizeof(cl_mem), &djh_ht_keys);
+    clSetKernelArg(k_chains, 3, sizeof(cl_mem), &djh_ht_values);
+    clSetKernelArg(k_chains, 4, sizeof(cl_mem), &sec_chain_next);
+    /* arg 5 = num_sections — set per test */
+
+    /* accumulate_dot_products: 14 args */
+    clSetKernelArg(k_dots, 0,  sizeof(cl_mem), &sec_word);
+    clSetKernelArg(k_dots, 1,  sizeof(cl_mem), &sec_disjunct_hash);
+    clSetKernelArg(k_dots, 2,  sizeof(cl_mem), &sec_count);
+    clSetKernelArg(k_dots, 3,  sizeof(cl_mem), &djh_ht_keys);
+    clSetKernelArg(k_dots, 4,  sizeof(cl_mem), &djh_ht_values);
+    clSetKernelArg(k_dots, 5,  sizeof(cl_mem), &sec_chain_next);
+    clSetKernelArg(k_dots, 6,  sizeof(cl_mem), &cand_ht_keys);
+    clSetKernelArg(k_dots, 7,  sizeof(cl_mem), &cand_ht_values);
+    clSetKernelArg(k_dots, 8,  sizeof(cl_mem), &cand_word_a);
+    clSetKernelArg(k_dots, 9,  sizeof(cl_mem), &cand_word_b);
+    clSetKernelArg(k_dots, 10, sizeof(cl_mem), &cand_dot);
+    clSetKernelArg(k_dots, 11, sizeof(cl_mem), &cand_next_free);
+    /* arg 12 = num_sections — set per test */
+
+    /* compute_cosines: (cand_word_a, cand_word_b, cand_dot, cand_cosine,
+     *                   word_norm_sq, num_candidates) */
+    clSetKernelArg(k_cosines, 0, sizeof(cl_mem), &cand_word_a);
+    clSetKernelArg(k_cosines, 1, sizeof(cl_mem), &cand_word_b);
+    clSetKernelArg(k_cosines, 2, sizeof(cl_mem), &cand_dot);
+    clSetKernelArg(k_cosines, 3, sizeof(cl_mem), &cand_cosine);
+    clSetKernelArg(k_cosines, 4, sizeof(cl_mem), &word_norm_sq);
+    /* arg 5 = num_candidates — set in pipeline */
+
+    /* filter_candidates */
+    clSetKernelArg(k_filter, 0, sizeof(cl_mem), &cand_word_a);
+    clSetKernelArg(k_filter, 1, sizeof(cl_mem), &cand_word_b);
+    clSetKernelArg(k_filter, 2, sizeof(cl_mem), &cand_cosine);
+    /* args 3-8 set per test */
+
+    printf("GPU buffers ready\n\n");
+
+    /* ═══════════════════════════════════════════════════════════════
+     *  TEST 1: Known cosine (2 words, 4 sections)
+     *
+     *  Section 0: word=0, djh=0x100, count=3.0
+     *  Section 1: word=0, djh=0x200, count=4.0
+     *  Section 2: word=1, djh=0x100, count=5.0
+     *  Section 3: word=1, djh=0x300, count=2.0
+     *
+     *  word 0: {0x100: 3, 0x200: 4} → norm² = 25, norm = 5
+     *  word 1: {0x100: 5, 0x300: 2} → norm² = 29, norm = √29
+     *  Shared: 0x100 → dot = 3×5 = 15
+     *  Cosine = 15 / (5 × √29) = 0.5571
+     * ═══════════════════════════════════════════════════════════════ */
+
+    printf("--- Test 1: Known cosine (2 words, 4 sections) ---\n");
+
+    reset_cosine_buffers(queue, djh_ht_keys, djh_ht_values,
+        sec_chain_next, word_norm_sq, cand_ht_keys, cand_ht_values,
+        cand_dot, cand_cosine, cand_next_free);
+
+    {
+        uint32_t sw[] = {0, 0, 1, 1};
+        uint64_t sd[] = {0x100, 0x200, 0x100, 0x300};
+        double   sc[] = {3.0, 4.0, 5.0, 2.0};
+        cl_uint  ns = 4;
+
+        upload_sections(queue, sec_word, sec_disjunct_hash, sec_count,
+                        sec_next_free, sw, sd, sc, ns);
+
+        /* Set num_sections for each kernel */
+        clSetKernelArg(k_norms,  3, sizeof(cl_uint), &ns);
+        clSetKernelArg(k_chains, 5, sizeof(cl_uint), &ns);
+        clSetKernelArg(k_dots,  12, sizeof(cl_uint), &ns);
+
+        uint32_t num_cands = 0;
+        double t0 = now_ms();
+        run_cosine_pipeline(queue, k_norms, k_chains, k_dots, k_cosines,
+                            ns, cand_next_free, &num_cands);
+        double t1 = now_ms();
+
+        /* Read results */
+        uint32_t h_wa[4], h_wb[4];
+        double h_dot[4], h_cos[4];
+        if (num_cands > 0) {
+            clEnqueueReadBuffer(queue, cand_word_a, CL_TRUE, 0,
+                sizeof(uint32_t) * num_cands, h_wa, 0, NULL, NULL);
+            clEnqueueReadBuffer(queue, cand_word_b, CL_TRUE, 0,
+                sizeof(uint32_t) * num_cands, h_wb, 0, NULL, NULL);
+            clEnqueueReadBuffer(queue, cand_dot, CL_TRUE, 0,
+                sizeof(double) * num_cands, h_dot, 0, NULL, NULL);
+            clEnqueueReadBuffer(queue, cand_cosine, CL_TRUE, 0,
+                sizeof(double) * num_cands, h_cos, 0, NULL, NULL);
+        }
+
+        /* Expected: 1 candidate pair (0,1) with dot=15.0, cosine≈0.5571 */
+        double expected_cos = 15.0 / (5.0 * sqrt(29.0));
+
+        printf("  Candidates: %u (expected 1)\n", num_cands);
+        if (num_cands > 0) {
+            printf("  Pair: (%u, %u)  dot=%.1f  cosine=%.4f\n",
+                   h_wa[0], h_wb[0], h_dot[0], h_cos[0]);
+            printf("  Expected: (0, 1)  dot=15.0  cosine=%.4f\n", expected_cos);
+        }
+        printf("  Time: %.2f ms\n", t1 - t0);
+
+        int pass = (num_cands == 1) &&
+                   (h_wa[0] == 0) && (h_wb[0] == 1) &&
+                   (fabs(h_dot[0] - 15.0) < 0.01) &&
+                   (fabs(h_cos[0] - expected_cos) < 0.001);
+        printf("  %s\n\n", pass ? "PASS" : "FAIL");
+        if (pass) pass_count++; else fail_count++;
+    }
+
+    /* ═══════════════════════════════════════════════════════════════
+     *  TEST 2: Three words — all pairwise cosines
+     *
+     *  word 0: {X=0x10: 1, Y=0x20: 2}   norm² = 5
+     *  word 1: {X=0x10: 3, Z=0x30: 1}   norm² = 10
+     *  word 2: {Y=0x20: 2, Z=0x30: 4}   norm² = 20
+     *
+     *  dot(0,1) = 1×3 = 3       cos = 3/√50  ≈ 0.4243
+     *  dot(0,2) = 2×2 = 4       cos = 4/√100 = 0.4000
+     *  dot(1,2) = 1×4 = 4       cos = 4/√200 ≈ 0.2828
+     * ═══════════════════════════════════════════════════════════════ */
+
+    printf("--- Test 2: Three words, all pairwise cosines ---\n");
+
+    reset_cosine_buffers(queue, djh_ht_keys, djh_ht_values,
+        sec_chain_next, word_norm_sq, cand_ht_keys, cand_ht_values,
+        cand_dot, cand_cosine, cand_next_free);
+
+    {
+        uint32_t sw[] = {0, 0, 1, 1, 2, 2};
+        uint64_t sd[] = {0x10, 0x20, 0x10, 0x30, 0x20, 0x30};
+        double   sc[] = {1.0, 2.0, 3.0, 1.0, 2.0, 4.0};
+        cl_uint  ns = 6;
+
+        upload_sections(queue, sec_word, sec_disjunct_hash, sec_count,
+                        sec_next_free, sw, sd, sc, ns);
+
+        clSetKernelArg(k_norms,  3, sizeof(cl_uint), &ns);
+        clSetKernelArg(k_chains, 5, sizeof(cl_uint), &ns);
+        clSetKernelArg(k_dots,  12, sizeof(cl_uint), &ns);
+
+        uint32_t num_cands = 0;
+        double t0 = now_ms();
+        run_cosine_pipeline(queue, k_norms, k_chains, k_dots, k_cosines,
+                            ns, cand_next_free, &num_cands);
+        double t1 = now_ms();
+
+        uint32_t h_wa[8], h_wb[8];
+        double h_dot[8], h_cos[8];
+        if (num_cands > 0) {
+            clEnqueueReadBuffer(queue, cand_word_a, CL_TRUE, 0,
+                sizeof(uint32_t) * num_cands, h_wa, 0, NULL, NULL);
+            clEnqueueReadBuffer(queue, cand_word_b, CL_TRUE, 0,
+                sizeof(uint32_t) * num_cands, h_wb, 0, NULL, NULL);
+            clEnqueueReadBuffer(queue, cand_dot, CL_TRUE, 0,
+                sizeof(double) * num_cands, h_dot, 0, NULL, NULL);
+            clEnqueueReadBuffer(queue, cand_cosine, CL_TRUE, 0,
+                sizeof(double) * num_cands, h_cos, 0, NULL, NULL);
+        }
+
+        double exp_01 = 3.0 / sqrt(50.0);
+        double exp_02 = 4.0 / sqrt(100.0);
+        double exp_12 = 4.0 / sqrt(200.0);
+
+        printf("  Candidates: %u (expected 3)\n", num_cands);
+
+        /* Find each pair in results */
+        double got_01 = -1, got_02 = -1, got_12 = -1;
+        double got_dot_01 = -1, got_dot_02 = -1, got_dot_12 = -1;
+        for (uint32_t i = 0; i < num_cands; i++) {
+            if (h_wa[i] == 0 && h_wb[i] == 1)
+                { got_01 = h_cos[i]; got_dot_01 = h_dot[i]; }
+            if (h_wa[i] == 0 && h_wb[i] == 2)
+                { got_02 = h_cos[i]; got_dot_02 = h_dot[i]; }
+            if (h_wa[i] == 1 && h_wb[i] == 2)
+                { got_12 = h_cos[i]; got_dot_12 = h_dot[i]; }
+        }
+
+        printf("  (0,1): dot=%.1f cos=%.4f (exp dot=3.0 cos=%.4f)\n",
+               got_dot_01, got_01, exp_01);
+        printf("  (0,2): dot=%.1f cos=%.4f (exp dot=4.0 cos=%.4f)\n",
+               got_dot_02, got_02, exp_02);
+        printf("  (1,2): dot=%.1f cos=%.4f (exp dot=4.0 cos=%.4f)\n",
+               got_dot_12, got_12, exp_12);
+        printf("  Time: %.2f ms\n", t1 - t0);
+
+        int pass = (num_cands == 3) &&
+                   (fabs(got_01 - exp_01) < 0.001) &&
+                   (fabs(got_02 - exp_02) < 0.001) &&
+                   (fabs(got_12 - exp_12) < 0.001);
+        printf("  %s\n\n", pass ? "PASS" : "FAIL");
+        if (pass) pass_count++; else fail_count++;
+    }
+
+    /* ═══════════════════════════════════════════════════════════════
+     *  TEST 3: Identical vectors → cosine = 1.0
+     *
+     *  word 0: {X=0x10: 3, Y=0x20: 4}
+     *  word 1: {X=0x10: 3, Y=0x20: 4}
+     *
+     *  dot = 9+16 = 25, norms = 5 each, cosine = 25/25 = 1.0
+     * ═══════════════════════════════════════════════════════════════ */
+
+    printf("--- Test 3: Identical vectors → cosine = 1.0 ---\n");
+
+    reset_cosine_buffers(queue, djh_ht_keys, djh_ht_values,
+        sec_chain_next, word_norm_sq, cand_ht_keys, cand_ht_values,
+        cand_dot, cand_cosine, cand_next_free);
+
+    {
+        uint32_t sw[] = {0, 0, 1, 1};
+        uint64_t sd[] = {0x10, 0x20, 0x10, 0x20};
+        double   sc[] = {3.0, 4.0, 3.0, 4.0};
+        cl_uint  ns = 4;
+
+        upload_sections(queue, sec_word, sec_disjunct_hash, sec_count,
+                        sec_next_free, sw, sd, sc, ns);
+
+        clSetKernelArg(k_norms,  3, sizeof(cl_uint), &ns);
+        clSetKernelArg(k_chains, 5, sizeof(cl_uint), &ns);
+        clSetKernelArg(k_dots,  12, sizeof(cl_uint), &ns);
+
+        uint32_t num_cands = 0;
+        run_cosine_pipeline(queue, k_norms, k_chains, k_dots, k_cosines,
+                            ns, cand_next_free, &num_cands);
+
+        double h_cos = 0;
+        if (num_cands > 0) {
+            clEnqueueReadBuffer(queue, cand_cosine, CL_TRUE, 0,
+                sizeof(double), &h_cos, 0, NULL, NULL);
+        }
+
+        printf("  Candidates: %u (expected 1)\n", num_cands);
+        printf("  Cosine: %.4f (expected 1.0000)\n", h_cos);
+
+        int pass = (num_cands == 1) && (fabs(h_cos - 1.0) < 0.001);
+        printf("  %s\n\n", pass ? "PASS" : "FAIL");
+        if (pass) pass_count++; else fail_count++;
+    }
+
+    /* ═══════════════════════════════════════════════════════════════
+     *  TEST 4: No shared disjuncts → 0 candidates
+     *
+     *  word 0: {X=0x10: 1}
+     *  word 1: {Y=0x20: 1}
+     *
+     *  No shared disjuncts → no chain overlap → 0 candidates
+     * ═══════════════════════════════════════════════════════════════ */
+
+    printf("--- Test 4: No shared disjuncts → 0 candidates ---\n");
+
+    reset_cosine_buffers(queue, djh_ht_keys, djh_ht_values,
+        sec_chain_next, word_norm_sq, cand_ht_keys, cand_ht_values,
+        cand_dot, cand_cosine, cand_next_free);
+
+    {
+        uint32_t sw[] = {0, 1};
+        uint64_t sd[] = {0x10, 0x20};
+        double   sc[] = {1.0, 1.0};
+        cl_uint  ns = 2;
+
+        upload_sections(queue, sec_word, sec_disjunct_hash, sec_count,
+                        sec_next_free, sw, sd, sc, ns);
+
+        clSetKernelArg(k_norms,  3, sizeof(cl_uint), &ns);
+        clSetKernelArg(k_chains, 5, sizeof(cl_uint), &ns);
+        clSetKernelArg(k_dots,  12, sizeof(cl_uint), &ns);
+
+        uint32_t num_cands = 0;
+        run_cosine_pipeline(queue, k_norms, k_chains, k_dots, k_cosines,
+                            ns, cand_next_free, &num_cands);
+
+        printf("  Candidates: %u (expected 0)\n", num_cands);
+
+        int pass = (num_cands == 0);
+        printf("  %s\n\n", pass ? "PASS" : "FAIL");
+        if (pass) pass_count++; else fail_count++;
+    }
+
+    /* ═══════════════════════════════════════════════════════════════
+     *  TEST 5: Filter candidates above threshold
+     *
+     *  Reuse test 2's scenario (3 words):
+     *    cos(0,1) ≈ 0.4243
+     *    cos(0,2) = 0.4000
+     *    cos(1,2) ≈ 0.2828
+     *
+     *  Filter at 0.35 → should get 2 candidates (0,1) and (0,2)
+     * ═══════════════════════════════════════════════════════════════ */
+
+    printf("--- Test 5: Filter candidates above threshold ---\n");
+
+    reset_cosine_buffers(queue, djh_ht_keys, djh_ht_values,
+        sec_chain_next, word_norm_sq, cand_ht_keys, cand_ht_values,
+        cand_dot, cand_cosine, cand_next_free);
+
+    {
+        uint32_t sw[] = {0, 0, 1, 1, 2, 2};
+        uint64_t sd[] = {0x10, 0x20, 0x10, 0x30, 0x20, 0x30};
+        double   sc[] = {1.0, 2.0, 3.0, 1.0, 2.0, 4.0};
+        cl_uint  ns = 6;
+
+        upload_sections(queue, sec_word, sec_disjunct_hash, sec_count,
+                        sec_next_free, sw, sd, sc, ns);
+
+        clSetKernelArg(k_norms,  3, sizeof(cl_uint), &ns);
+        clSetKernelArg(k_chains, 5, sizeof(cl_uint), &ns);
+        clSetKernelArg(k_dots,  12, sizeof(cl_uint), &ns);
+
+        uint32_t num_cands = 0;
+        run_cosine_pipeline(queue, k_norms, k_chains, k_dots, k_cosines,
+                            ns, cand_next_free, &num_cands);
+
+        /* Now filter */
+        cl_double threshold = 0.35;
+        cl_uint max_output = 64;
+        cl_mem out_wa = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY,
+            sizeof(uint32_t) * max_output, NULL, &err);
+        cl_mem out_wb = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY,
+            sizeof(uint32_t) * max_output, NULL, &err);
+        cl_mem out_cos = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY,
+            sizeof(double) * max_output, NULL, &err);
+        cl_mem out_count = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+            sizeof(uint32_t), &zero, &err);
+
+        clSetKernelArg(k_filter, 3, sizeof(cl_uint), &num_cands);
+        clSetKernelArg(k_filter, 4, sizeof(cl_double), &threshold);
+        clSetKernelArg(k_filter, 5, sizeof(cl_mem), &out_wa);
+        clSetKernelArg(k_filter, 6, sizeof(cl_mem), &out_wb);
+        clSetKernelArg(k_filter, 7, sizeof(cl_mem), &out_cos);
+        clSetKernelArg(k_filter, 8, sizeof(cl_mem), &out_count);
+        clSetKernelArg(k_filter, 9, sizeof(cl_uint), &max_output);
+
+        size_t local = 256;
+        size_t gs = ((num_cands + local - 1) / local) * local;
+        err = clEnqueueNDRangeKernel(queue, k_filter, 1, NULL,
+            &gs, &local, 0, NULL, NULL);
+        CL_CHECK(err, "enqueue filter");
+        clFinish(queue);
+
+        uint32_t n_filtered;
+        clEnqueueReadBuffer(queue, out_count, CL_TRUE, 0,
+            sizeof(uint32_t), &n_filtered, 0, NULL, NULL);
+
+        uint32_t fwa[8], fwb[8];
+        double fcos[8];
+        if (n_filtered > 0) {
+            clEnqueueReadBuffer(queue, out_wa, CL_TRUE, 0,
+                sizeof(uint32_t) * n_filtered, fwa, 0, NULL, NULL);
+            clEnqueueReadBuffer(queue, out_wb, CL_TRUE, 0,
+                sizeof(uint32_t) * n_filtered, fwb, 0, NULL, NULL);
+            clEnqueueReadBuffer(queue, out_cos, CL_TRUE, 0,
+                sizeof(double) * n_filtered, fcos, 0, NULL, NULL);
+        }
+
+        printf("  Total candidates: %u, filtered (>0.35): %u (expected 2)\n",
+               num_cands, n_filtered);
+        for (uint32_t i = 0; i < n_filtered; i++) {
+            printf("    (%u, %u) cos=%.4f\n", fwa[i], fwb[i], fcos[i]);
+        }
+
+        /* cos(1,2) ≈ 0.2828 should be filtered out */
+        int pass = (n_filtered == 2);
+        printf("  %s\n\n", pass ? "PASS" : "FAIL");
+        if (pass) pass_count++; else fail_count++;
+
+        clReleaseMemObject(out_wa);
+        clReleaseMemObject(out_wb);
+        clReleaseMemObject(out_cos);
+        clReleaseMemObject(out_count);
+    }
+
+    /* ═══════════════════════════════════════════════════════════════
+     *  TEST 6: Benchmark — 1000 sentences → sections → cosines
+     *
+     *  Full pipeline:
+     *    extract_sections (Phase 4) → cosine pipeline (Phase 5)
+     *
+     *  1000 sentences, 10-20 words each, chain MST parse.
+     *  500 word vocabulary for realistic disjunct sharing.
+     * ═══════════════════════════════════════════════════════════════ */
+
+    printf("--- Test 6: Benchmark (1000 sentences, full pipeline) ---\n");
+
+    /* Reset everything */
+    reset_section_pool(queue, sht_keys, sht_values, sec_count, sec_next_free);
+    reset_cosine_buffers(queue, djh_ht_keys, djh_ht_values,
+        sec_chain_next, word_norm_sq, cand_ht_keys, cand_ht_values,
+        cand_dot, cand_cosine, cand_next_free);
+    {
+        uint32_t tsec_zero = 0;
+        clEnqueueWriteBuffer(queue, total_sections_created, CL_TRUE, 0,
+            sizeof(uint32_t), &tsec_zero, 0, NULL, NULL);
+    }
+
+    srand(42);
+    uint32_t bench_ns = 1000;
+    uint32_t vocab_size = 500;
+
+    uint32_t max_words = bench_ns * 25;
+    uint32_t max_edges = bench_ns * 25;
+    uint32_t* b_words = malloc(sizeof(uint32_t) * max_words);
+    uint32_t* b_sent_offsets = malloc(sizeof(uint32_t) * bench_ns);
+    uint32_t* b_sent_lengths = malloc(sizeof(uint32_t) * bench_ns);
+    uint32_t* b_edge_p1 = malloc(sizeof(uint32_t) * max_edges);
+    uint32_t* b_edge_p2 = malloc(sizeof(uint32_t) * max_edges);
+    uint32_t* b_edge_offsets = malloc(sizeof(uint32_t) * bench_ns);
+    uint32_t* b_edge_counts = malloc(sizeof(uint32_t) * bench_ns);
+
+    uint32_t word_pos = 0, edge_pos = 0;
+    for (uint32_t s = 0; s < bench_ns; s++) {
+        uint32_t slen = 10 + (rand() % 11);
+        b_sent_offsets[s] = word_pos;
+        b_sent_lengths[s] = slen;
+        b_edge_offsets[s] = edge_pos;
+        b_edge_counts[s] = slen - 1;
+
+        for (uint32_t w = 0; w < slen; w++)
+            b_words[word_pos++] = rand() % vocab_size;
+        for (uint32_t e = 0; e < slen - 1; e++) {
+            b_edge_p1[edge_pos] = e;
+            b_edge_p2[edge_pos] = e + 1;
+            edge_pos++;
+        }
+    }
+
+    printf("  Sentences: %u, words: %u, edges: %u\n", bench_ns, word_pos, edge_pos);
+
+    cl_mem d_flat_words = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * word_pos, b_words, &err);
+    cl_mem d_sent_offsets = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * bench_ns, b_sent_offsets, &err);
+    cl_mem d_sent_lengths = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * bench_ns, b_sent_lengths, &err);
+    cl_mem d_edge_p1 = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * edge_pos, b_edge_p1, &err);
+    cl_mem d_edge_p2 = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * edge_pos, b_edge_p2, &err);
+    cl_mem d_edge_offsets = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * bench_ns, b_edge_offsets, &err);
+    cl_mem d_edge_counts = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * bench_ns, b_edge_counts, &err);
+
+    /* Set extract_sections args */
+    cl_uint tw = word_pos;
+    clSetKernelArg(k_extract, 0,  sizeof(cl_mem), &d_flat_words);
+    clSetKernelArg(k_extract, 1,  sizeof(cl_mem), &d_sent_offsets);
+    clSetKernelArg(k_extract, 2,  sizeof(cl_mem), &d_sent_lengths);
+    clSetKernelArg(k_extract, 3,  sizeof(cl_uint), &bench_ns);
+    clSetKernelArg(k_extract, 4,  sizeof(cl_uint), &tw);
+    clSetKernelArg(k_extract, 5,  sizeof(cl_mem), &d_edge_p1);
+    clSetKernelArg(k_extract, 6,  sizeof(cl_mem), &d_edge_p2);
+    clSetKernelArg(k_extract, 7,  sizeof(cl_mem), &d_edge_offsets);
+    clSetKernelArg(k_extract, 8,  sizeof(cl_mem), &d_edge_counts);
+    clSetKernelArg(k_extract, 9,  sizeof(cl_mem), &sht_keys);
+    clSetKernelArg(k_extract, 10, sizeof(cl_mem), &sht_values);
+    clSetKernelArg(k_extract, 11, sizeof(cl_mem), &sec_word);
+    clSetKernelArg(k_extract, 12, sizeof(cl_mem), &sec_disjunct_hash);
+    clSetKernelArg(k_extract, 13, sizeof(cl_mem), &sec_count);
+    clSetKernelArg(k_extract, 14, sizeof(cl_mem), &sec_next_free);
+    clSetKernelArg(k_extract, 15, sizeof(cl_mem), &total_sections_created);
+
+    size_t local = 256;
+    size_t gs;
+
+    /* Phase 4: Extract sections */
+    double t_start = now_ms();
+
+    gs = ((tw + local - 1) / local) * local;
+    err = clEnqueueNDRangeKernel(queue, k_extract, 1, NULL,
+        &gs, &local, 0, NULL, NULL);
+    CL_CHECK(err, "enqueue extract");
+    clFinish(queue);
+
+    double t_extract = now_ms();
+
+    /* Read section count */
+    uint32_t h_num_sections;
+    clEnqueueReadBuffer(queue, sec_next_free, CL_TRUE, 0,
+        sizeof(uint32_t), &h_num_sections, 0, NULL, NULL);
+
+    printf("  Sections extracted: %u (%.2f ms)\n",
+           h_num_sections, t_extract - t_start);
+
+    /* Phase 5: Cosine pipeline */
+    clSetKernelArg(k_norms,  3, sizeof(cl_uint), &h_num_sections);
+    clSetKernelArg(k_chains, 5, sizeof(cl_uint), &h_num_sections);
+    clSetKernelArg(k_dots,  12, sizeof(cl_uint), &h_num_sections);
+
+    double t_cos_start = now_ms();
+
+    uint32_t num_cands = 0;
+    run_cosine_pipeline(queue, k_norms, k_chains, k_dots, k_cosines,
+                        h_num_sections, cand_next_free, &num_cands);
+
+    double t_cos_end = now_ms();
+
+    double total_time = t_cos_end - t_start;
+    double cos_time = t_cos_end - t_cos_start;
+
+    printf("  Candidate pairs: %u\n", num_cands);
+    printf("  Cosine pipeline: %.2f ms\n", cos_time);
+    printf("  Full pipeline (extract + cosine): %.2f ms\n", total_time);
+    printf("  Throughput: %.0f sentences/sec\n",
+           bench_ns / (total_time / 1000.0));
+
+    if (num_cands > 0) {
+        /* Read a few top cosines to verify */
+        uint32_t peek = (num_cands < 8) ? num_cands : 8;
+        double h_cos[8];
+        clEnqueueReadBuffer(queue, cand_cosine, CL_TRUE, 0,
+            sizeof(double) * peek, h_cos, 0, NULL, NULL);
+
+        /* Find max cosine */
+        double max_cos = 0;
+        for (uint32_t i = 0; i < peek; i++)
+            if (h_cos[i] > max_cos) max_cos = h_cos[i];
+        printf("  Max cosine (first %u): %.4f\n", peek, max_cos);
+    }
+
+    int t6_pass = (h_num_sections > 0) && (num_cands > 0) && (total_time < 5000.0);
+    printf("  %s\n\n", t6_pass ? "PASS" : "FAIL");
+    if (t6_pass) pass_count++; else fail_count++;
+
+    /* ─── Summary ─── */
+
+    printf("=== Results: %d PASS, %d FAIL ===\n", pass_count, fail_count);
+
+    /* Cleanup */
+    free(b_words); free(b_sent_offsets); free(b_sent_lengths);
+    free(b_edge_p1); free(b_edge_p2); free(b_edge_offsets); free(b_edge_counts);
+    free(src_ht); free(src_as); free(src_sc); free(src_cos); free(combined);
+
+    clReleaseMemObject(d_flat_words);
+    clReleaseMemObject(d_sent_offsets);
+    clReleaseMemObject(d_sent_lengths);
+    clReleaseMemObject(d_edge_p1);
+    clReleaseMemObject(d_edge_p2);
+    clReleaseMemObject(d_edge_offsets);
+    clReleaseMemObject(d_edge_counts);
+    clReleaseMemObject(sec_word);
+    clReleaseMemObject(sec_disjunct_hash);
+    clReleaseMemObject(sec_count);
+    clReleaseMemObject(sec_next_free);
+    clReleaseMemObject(sht_keys);
+    clReleaseMemObject(sht_values);
+    clReleaseMemObject(total_sections_created);
+    clReleaseMemObject(djh_ht_keys);
+    clReleaseMemObject(djh_ht_values);
+    clReleaseMemObject(sec_chain_next);
+    clReleaseMemObject(word_norm_sq);
+    clReleaseMemObject(cand_ht_keys);
+    clReleaseMemObject(cand_ht_values);
+    clReleaseMemObject(cand_word_a);
+    clReleaseMemObject(cand_word_b);
+    clReleaseMemObject(cand_dot);
+    clReleaseMemObject(cand_cosine);
+    clReleaseMemObject(cand_next_free);
+    clReleaseKernel(k_norms);
+    clReleaseKernel(k_chains);
+    clReleaseKernel(k_dots);
+    clReleaseKernel(k_cosines);
+    clReleaseKernel(k_filter);
+    clReleaseKernel(k_extract);
+    clReleaseProgram(program);
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return fail_count > 0 ? 1 : 0;
+}
diff --git a/opencog/opencl/atomspace/test-sections.c b/opencog/opencl/atomspace/test-sections.c
new file mode 100644
index 0000000..06472fb
--- /dev/null
+++ b/opencog/opencl/atomspace/test-sections.c
@@ -0,0 +1,883 @@
+/*
+ * test-sections.c -- Test GPU section extraction kernel
+ *
+ * Compile: gcc -O2 -o test-sections test-sections.c -lOpenCL -lm
+ * Run:     ./test-sections
+ *
+ * Tests:
+ *   1. Simple MST: 3-word sentence with 2 edges → 3 sections
+ *   2. Star parse: all edges from one root → verify disjuncts
+ *   3. Multi-sentence batch (no cross-boundary sections)
+ *   4. Duplicate disjuncts: same parse seen twice → counts accumulate
+ *   5. Readback kernel verification
+ *   6. Benchmark: 1000 sentences with random MST edges
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+#include <time.h>
+#include <CL/cl.h>
+
+/* ─── Pool capacities ─── */
+
+#define WORD_CAPACITY        (128 * 1024)
+#define PAIR_CAPACITY        (4 * 1024 * 1024)
+#define SECTION_CAPACITY     (1024 * 1024)
+#define WORD_HT_CAPACITY     (256 * 1024)
+#define PAIR_HT_CAPACITY     (8 * 1024 * 1024)
+#define SECTION_HT_CAPACITY  (2 * 1024 * 1024)
+
+#define HT_EMPTY_KEY    0xFFFFFFFFFFFFFFFFULL
+#define HT_EMPTY_VALUE  0xFFFFFFFFU
+
+/* ─── Error checking ─── */
+
+#define CL_CHECK(err, msg) do { \
+    if ((err) != CL_SUCCESS) { \
+        fprintf(stderr, "OpenCL error %d at %s:%d: %s\n", \
+                (err), __FILE__, __LINE__, (msg)); \
+        exit(1); \
+    } \
+} while(0)
+
+/* ─── Read file ─── */
+
+char* read_file(const char* path, size_t* len)
+{
+    FILE* f = fopen(path, "r");
+    if (!f) { fprintf(stderr, "Cannot open %s\n", path); exit(1); }
+    fseek(f, 0, SEEK_END);
+    *len = ftell(f);
+    fseek(f, 0, SEEK_SET);
+    char* buf = malloc(*len + 1);
+    size_t n = fread(buf, 1, *len, f);
+    buf[n] = '\0';
+    *len = n;
+    fclose(f);
+    return buf;
+}
+
+/* ─── Timing ─── */
+
+double now_ms(void)
+{
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return ts.tv_sec * 1000.0 + ts.tv_nsec / 1000000.0;
+}
+
+/* ─── CPU-side disjunct hash (must match GPU) ─── */
+
+static uint64_t cpu_fnv1a_init(void) { return 0xcbf29ce484222325ULL; }
+
+static uint64_t cpu_fnv1a_mix(uint64_t hash, uint64_t val)
+{
+    hash ^= val;
+    hash *= 0x100000001b3ULL;
+    return hash;
+}
+
+static uint64_t cpu_hash_disjunct(uint32_t* words, uint32_t* dirs, uint32_t count)
+{
+    uint64_t h = cpu_fnv1a_init();
+    for (uint32_t i = 0; i < count; i++) {
+        uint64_t encoded = ((uint64_t)words[i] << 1) | (uint64_t)dirs[i];
+        h = cpu_fnv1a_mix(h, encoded);
+    }
+    if (h == HT_EMPTY_KEY) h = 0;
+    return h;
+}
+
+/* CPU-side section_key (must match GPU) */
+static uint64_t cpu_section_key(uint32_t word_idx, uint64_t disjunct_hash)
+{
+    uint64_t key = disjunct_hash ^ ((uint64_t)word_idx * 0x9E3779B97F4A7C15ULL);
+    if (key == HT_EMPTY_KEY) key = 0;
+    return key;
+}
+
+/* ─── Helper: reset section pool and hash table ─── */
+
+void reset_section_pool(cl_command_queue queue,
+                        cl_mem sht_keys, cl_mem sht_values,
+                        cl_mem sec_count,
+                        cl_mem sec_next_free, cl_mem total_sections)
+{
+    uint8_t pat_ff = 0xFF;
+    uint8_t pat_00 = 0x00;
+    uint32_t zero = 0;
+
+    clEnqueueFillBuffer(queue, sht_keys, &pat_ff, 1, 0,
+        sizeof(uint64_t) * SECTION_HT_CAPACITY, 0, NULL, NULL);
+    clEnqueueFillBuffer(queue, sht_values, &pat_ff, 1, 0,
+        sizeof(uint32_t) * SECTION_HT_CAPACITY, 0, NULL, NULL);
+    clEnqueueFillBuffer(queue, sec_count, &pat_00, 1, 0,
+        sizeof(double) * SECTION_CAPACITY, 0, NULL, NULL);
+    clEnqueueWriteBuffer(queue, sec_next_free, CL_FALSE, 0,
+        sizeof(uint32_t), &zero, 0, NULL, NULL);
+    clEnqueueWriteBuffer(queue, total_sections, CL_FALSE, 0,
+        sizeof(uint32_t), &zero, 0, NULL, NULL);
+    clFinish(queue);
+}
+
+/* ─── Main ─── */
+
+int main(int argc, char** argv)
+{
+    cl_int err;
+    int pass_count = 0, fail_count = 0;
+
+    printf("=== GPU Section Extraction Test ===\n\n");
+
+    /* ─── OpenCL setup ─── */
+
+    cl_platform_id platform;
+    err = clGetPlatformIDs(1, &platform, NULL);
+    CL_CHECK(err, "platform");
+
+    cl_device_id device;
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    CL_CHECK(err, "device");
+
+    char dev_name[256];
+    clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(dev_name), dev_name, NULL);
+    printf("GPU: %s\n", dev_name);
+
+    cl_context ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
+    CL_CHECK(err, "context");
+
+    cl_command_queue queue = clCreateCommandQueue(ctx, device, 0, &err);
+    CL_CHECK(err, "queue");
+
+    /* ─── Load and concatenate kernel sources ─── */
+
+    size_t ht_len, as_len, sc_len;
+    char* ht_src = read_file("gpu-hashtable.cl", &ht_len);
+    char* as_src = read_file("gpu-atomspace.cl", &as_len);
+    char* sc_src = read_file("gpu-sections.cl", &sc_len);
+
+    size_t total_len = ht_len + 1 + as_len + 1 + sc_len;
+    char* combined = malloc(total_len + 1);
+    memcpy(combined, ht_src, ht_len);
+    combined[ht_len] = '\n';
+    memcpy(combined + ht_len + 1, as_src, as_len);
+    combined[ht_len + 1 + as_len] = '\n';
+    memcpy(combined + ht_len + 1 + as_len + 1, sc_src, sc_len);
+    combined[total_len] = '\0';
+
+    cl_program program = clCreateProgramWithSource(ctx, 1,
+        (const char**)&combined, &total_len, &err);
+    CL_CHECK(err, "create program");
+
+    char build_opts[512];
+    snprintf(build_opts, sizeof(build_opts),
+        "-cl-std=CL1.2 "
+        "-DWORD_CAPACITY=%d "
+        "-DPAIR_CAPACITY=%d "
+        "-DSECTION_CAPACITY=%d "
+        "-DWORD_HT_CAPACITY=%d "
+        "-DPAIR_HT_CAPACITY=%d "
+        "-DSECTION_HT_CAPACITY=%d",
+        WORD_CAPACITY, PAIR_CAPACITY, SECTION_CAPACITY,
+        WORD_HT_CAPACITY, PAIR_HT_CAPACITY, SECTION_HT_CAPACITY);
+
+    err = clBuildProgram(program, 1, &device, build_opts, NULL, NULL);
+    if (err != CL_SUCCESS) {
+        char log[16384];
+        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
+                              sizeof(log), log, NULL);
+        fprintf(stderr, "Build error:\n%s\n", log);
+        return 1;
+    }
+    printf("Kernels compiled successfully\n\n");
+
+    /* ─── Create kernels ─── */
+
+    cl_kernel k_extract = clCreateKernel(program, "extract_sections", &err);
+    CL_CHECK(err, "kernel extract_sections");
+    cl_kernel k_read = clCreateKernel(program, "read_sections", &err);
+    CL_CHECK(err, "kernel read_sections");
+
+    size_t local_size = 256;
+
+    /* ─── Allocate GPU buffers ─── */
+
+    printf("Allocating GPU buffers...\n");
+    uint32_t zero = 0;
+
+    /* Section hash table */
+    cl_mem sht_keys = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
+        sizeof(uint64_t) * SECTION_HT_CAPACITY, NULL, &err);
+    CL_CHECK(err, "sht_keys");
+    cl_mem sht_values = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
+        sizeof(uint32_t) * SECTION_HT_CAPACITY, NULL, &err);
+    CL_CHECK(err, "sht_values");
+
+    /* Section pool SoA */
+    cl_mem sec_word = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
+        sizeof(uint32_t) * SECTION_CAPACITY, NULL, &err);
+    cl_mem sec_disjunct_hash = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
+        sizeof(uint64_t) * SECTION_CAPACITY, NULL, &err);
+    cl_mem sec_count = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
+        sizeof(double) * SECTION_CAPACITY, NULL, &err);
+
+    /* Section bump allocator */
+    cl_mem sec_next_free = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t), &zero, &err);
+
+    /* Total sections created (stats counter) */
+    cl_mem total_sections = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t), &zero, &err);
+
+    /* Initial reset */
+    reset_section_pool(queue, sht_keys, sht_values, sec_count,
+                       sec_next_free, total_sections);
+
+    printf("GPU buffers ready\n\n");
+
+    /* ═══════════════════════════════════════════════════════════════
+     *  TEST 1: Simple MST — 3-word sentence with chain parse
+     *
+     *  Sentence: words [10, 20, 30] (word pool indices)
+     *  MST edges: (0,1), (1,2)  — chain: 10—20—30
+     *
+     *  Expected sections:
+     *    Word 10 (pos 0): connectors = [(20, RIGHT)]
+     *      disjunct = "20+"
+     *    Word 20 (pos 1): connectors = [(10, LEFT), (30, RIGHT)]
+     *      disjunct = "10- 30+"
+     *    Word 30 (pos 2): connectors = [(20, LEFT)]
+     *      disjunct = "20-"
+     *
+     *  = 3 unique sections
+     * ═══════════════════════════════════════════════════════════════ */
+
+    printf("--- Test 1: Simple chain parse (3 words, 2 edges) ---\n");
+
+    uint32_t t1_words[] = {10, 20, 30};
+    uint32_t t1_sent_offsets[] = {0};
+    uint32_t t1_sent_lengths[] = {3};
+    uint32_t t1_edge_p1[] = {0, 1};
+    uint32_t t1_edge_p2[] = {1, 2};
+    uint32_t t1_edge_offsets[] = {0};
+    uint32_t t1_edge_counts[] = {2};
+    cl_uint t1_ns = 1, t1_tw = 3;
+
+    cl_mem d_flat_words = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * t1_tw, t1_words, &err);
+    cl_mem d_sent_offsets = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * t1_ns, t1_sent_offsets, &err);
+    cl_mem d_sent_lengths = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * t1_ns, t1_sent_lengths, &err);
+    cl_mem d_edge_p1 = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * 2, t1_edge_p1, &err);
+    cl_mem d_edge_p2 = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * 2, t1_edge_p2, &err);
+    cl_mem d_edge_offsets = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * t1_ns, t1_edge_offsets, &err);
+    cl_mem d_edge_counts = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * t1_ns, t1_edge_counts, &err);
+
+    /* Set kernel args */
+    clSetKernelArg(k_extract, 0,  sizeof(cl_mem), &d_flat_words);
+    clSetKernelArg(k_extract, 1,  sizeof(cl_mem), &d_sent_offsets);
+    clSetKernelArg(k_extract, 2,  sizeof(cl_mem), &d_sent_lengths);
+    clSetKernelArg(k_extract, 3,  sizeof(cl_uint), &t1_ns);
+    clSetKernelArg(k_extract, 4,  sizeof(cl_uint), &t1_tw);
+    clSetKernelArg(k_extract, 5,  sizeof(cl_mem), &d_edge_p1);
+    clSetKernelArg(k_extract, 6,  sizeof(cl_mem), &d_edge_p2);
+    clSetKernelArg(k_extract, 7,  sizeof(cl_mem), &d_edge_offsets);
+    clSetKernelArg(k_extract, 8,  sizeof(cl_mem), &d_edge_counts);
+    clSetKernelArg(k_extract, 9,  sizeof(cl_mem), &sht_keys);
+    clSetKernelArg(k_extract, 10, sizeof(cl_mem), &sht_values);
+    clSetKernelArg(k_extract, 11, sizeof(cl_mem), &sec_word);
+    clSetKernelArg(k_extract, 12, sizeof(cl_mem), &sec_disjunct_hash);
+    clSetKernelArg(k_extract, 13, sizeof(cl_mem), &sec_count);
+    clSetKernelArg(k_extract, 14, sizeof(cl_mem), &sec_next_free);
+    clSetKernelArg(k_extract, 15, sizeof(cl_mem), &total_sections);
+
+    double t0 = now_ms();
+    size_t gs = ((t1_tw + local_size - 1) / local_size) * local_size;
+    err = clEnqueueNDRangeKernel(queue, k_extract, 1, NULL,
+        &gs, &local_size, 0, NULL, NULL);
+    CL_CHECK(err, "enqueue extract");
+    clFinish(queue);
+    double t1 = now_ms();
+
+    /* Read back results */
+    uint32_t h_num_sections, h_total_created;
+    clEnqueueReadBuffer(queue, sec_next_free, CL_TRUE, 0,
+        sizeof(uint32_t), &h_num_sections, 0, NULL, NULL);
+    clEnqueueReadBuffer(queue, total_sections, CL_TRUE, 0,
+        sizeof(uint32_t), &h_total_created, 0, NULL, NULL);
+
+    /* Read section data */
+    uint32_t h_sec_words[8];
+    uint64_t h_sec_djh[8];
+    double   h_sec_counts[8];
+    clEnqueueReadBuffer(queue, sec_word, CL_TRUE, 0,
+        sizeof(uint32_t) * h_num_sections, h_sec_words, 0, NULL, NULL);
+    clEnqueueReadBuffer(queue, sec_disjunct_hash, CL_TRUE, 0,
+        sizeof(uint64_t) * h_num_sections, h_sec_djh, 0, NULL, NULL);
+    clEnqueueReadBuffer(queue, sec_count, CL_TRUE, 0,
+        sizeof(double) * h_num_sections, h_sec_counts, 0, NULL, NULL);
+
+    /* Compute expected disjunct hashes on CPU */
+    /* Word 10 (pos 0): [(20, RIGHT=1)] */
+    uint32_t cw0[] = {20}; uint32_t cd0[] = {1};
+    uint64_t exp_djh_0 = cpu_hash_disjunct(cw0, cd0, 1);
+    /* Word 20 (pos 1): [(10, LEFT=0), (30, RIGHT=1)] — already sorted */
+    uint32_t cw1[] = {10, 30}; uint32_t cd1[] = {0, 1};
+    uint64_t exp_djh_1 = cpu_hash_disjunct(cw1, cd1, 2);
+    /* Word 30 (pos 2): [(20, LEFT=0)] */
+    uint32_t cw2[] = {20}; uint32_t cd2[] = {0};
+    uint64_t exp_djh_2 = cpu_hash_disjunct(cw2, cd2, 1);
+
+    printf("  Sections created: %u (expected 3)\n", h_num_sections);
+    printf("  Stats counter:    %u (expected 3)\n", h_total_created);
+
+    /* Verify each section exists with correct data */
+    int found_10 = 0, found_20 = 0, found_30 = 0;
+    for (uint32_t i = 0; i < h_num_sections; i++) {
+        if (h_sec_words[i] == 10 && h_sec_djh[i] == exp_djh_0 &&
+            fabs(h_sec_counts[i] - 1.0) < 0.01) found_10 = 1;
+        if (h_sec_words[i] == 20 && h_sec_djh[i] == exp_djh_1 &&
+            fabs(h_sec_counts[i] - 1.0) < 0.01) found_20 = 1;
+        if (h_sec_words[i] == 30 && h_sec_djh[i] == exp_djh_2 &&
+            fabs(h_sec_counts[i] - 1.0) < 0.01) found_30 = 1;
+    }
+
+    printf("  Section (word=10, djh=0x%016llx): %s\n",
+           (unsigned long long)exp_djh_0, found_10 ? "found" : "MISSING");
+    printf("  Section (word=20, djh=0x%016llx): %s\n",
+           (unsigned long long)exp_djh_1, found_20 ? "found" : "MISSING");
+    printf("  Section (word=30, djh=0x%016llx): %s\n",
+           (unsigned long long)exp_djh_2, found_30 ? "found" : "MISSING");
+    printf("  Time: %.2f ms\n", t1 - t0);
+
+    int t1_pass = (h_num_sections == 3) && (h_total_created == 3) &&
+                  found_10 && found_20 && found_30;
+    printf("  %s\n\n", t1_pass ? "PASS" : "FAIL");
+    if (t1_pass) pass_count++; else fail_count++;
+
+    /* ═══════════════════════════════════════════════════════════════
+     *  TEST 2: Star parse — root word connected to all others
+     *
+     *  Sentence: words [100, 101, 102, 103, 104] (5 words)
+     *  MST edges: (2,0), (2,1), (2,3), (2,4) — word 102 is root
+     *
+     *  Expected sections:
+     *    Word 100 (pos 0): [(102, RIGHT)]   — 1 connector
+     *    Word 101 (pos 1): [(102, RIGHT)]   — 1 connector
+     *    Word 102 (pos 2): [(100, LEFT), (101, LEFT), (103, RIGHT), (104, RIGHT)]
+     *    Word 103 (pos 3): [(102, LEFT)]    — 1 connector
+     *    Word 104 (pos 4): [(102, LEFT)]    — 1 connector
+     *
+     *  = 5 sections (4 unique disjuncts: leaf-left, leaf-right, root-4conn)
+     *  But words 100 and 101 have same disjunct hash ONLY if their
+     *  partner word pool index is the same (both connect to 102) AND
+     *  direction is the same (both RIGHT). So disjunct hash matches!
+     *  → 100 and 101 have same disjunct but different words → 2 sections
+     *  Similarly 103 and 104 connect LEFT to 102 → same disjunct → 2 sections
+     *
+     *  Total unique sections (word, disjunct) pairs: 5
+     * ═══════════════════════════════════════════════════════════════ */
+
+    printf("--- Test 2: Star parse (5 words, root at center) ---\n");
+
+    reset_section_pool(queue, sht_keys, sht_values, sec_count,
+                       sec_next_free, total_sections);
+
+    uint32_t t2_words[] = {100, 101, 102, 103, 104};
+    uint32_t t2_sent_offsets[] = {0};
+    uint32_t t2_sent_lengths[] = {5};
+    uint32_t t2_edge_p1[] = {2, 2, 2, 2};
+    uint32_t t2_edge_p2[] = {0, 1, 3, 4};
+    uint32_t t2_edge_offsets[] = {0};
+    uint32_t t2_edge_counts[] = {4};
+    cl_uint t2_ns = 1, t2_tw = 5;
+
+    clReleaseMemObject(d_flat_words);
+    clReleaseMemObject(d_sent_offsets);
+    clReleaseMemObject(d_sent_lengths);
+    clReleaseMemObject(d_edge_p1);
+    clReleaseMemObject(d_edge_p2);
+    clReleaseMemObject(d_edge_offsets);
+    clReleaseMemObject(d_edge_counts);
+
+    d_flat_words = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * t2_tw, t2_words, &err);
+    d_sent_offsets = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * t2_ns, t2_sent_offsets, &err);
+    d_sent_lengths = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * t2_ns, t2_sent_lengths, &err);
+    d_edge_p1 = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * 4, t2_edge_p1, &err);
+    d_edge_p2 = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * 4, t2_edge_p2, &err);
+    d_edge_offsets = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * t2_ns, t2_edge_offsets, &err);
+    d_edge_counts = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * t2_ns, t2_edge_counts, &err);
+
+    /* Set kernel args */
+    clSetKernelArg(k_extract, 0, sizeof(cl_mem), &d_flat_words);
+    clSetKernelArg(k_extract, 1, sizeof(cl_mem), &d_sent_offsets);
+    clSetKernelArg(k_extract, 2, sizeof(cl_mem), &d_sent_lengths);
+    clSetKernelArg(k_extract, 3, sizeof(cl_uint), &t2_ns);
+    clSetKernelArg(k_extract, 4, sizeof(cl_uint), &t2_tw);
+    clSetKernelArg(k_extract, 5, sizeof(cl_mem), &d_edge_p1);
+    clSetKernelArg(k_extract, 6, sizeof(cl_mem), &d_edge_p2);
+    clSetKernelArg(k_extract, 7, sizeof(cl_mem), &d_edge_offsets);
+    clSetKernelArg(k_extract, 8, sizeof(cl_mem), &d_edge_counts);
+
+    t0 = now_ms();
+    gs = ((t2_tw + local_size - 1) / local_size) * local_size;
+    err = clEnqueueNDRangeKernel(queue, k_extract, 1, NULL,
+        &gs, &local_size, 0, NULL, NULL);
+    CL_CHECK(err, "enqueue extract t2");
+    clFinish(queue);
+    t1 = now_ms();
+
+    clEnqueueReadBuffer(queue, sec_next_free, CL_TRUE, 0,
+        sizeof(uint32_t), &h_num_sections, 0, NULL, NULL);
+    clEnqueueReadBuffer(queue, total_sections, CL_TRUE, 0,
+        sizeof(uint32_t), &h_total_created, 0, NULL, NULL);
+
+    /* Compute expected disjunct for root word 102 (pos 2):
+     * Connectors: (100, LEFT=0), (101, LEFT=0), (103, RIGHT=1), (104, RIGHT=1)
+     * Sorted: dir 0 first sorted by word → (100,0), (101,0), (103,1), (104,1) */
+    uint32_t root_cw[] = {100, 101, 103, 104};
+    uint32_t root_cd[] = {0, 0, 1, 1};
+    uint64_t exp_root_djh = cpu_hash_disjunct(root_cw, root_cd, 4);
+
+    /* Read back all sections */
+    uint32_t h2_sec_words[8];
+    uint64_t h2_sec_djh[8];
+    double   h2_sec_counts[8];
+    uint32_t n_read = (h_num_sections < 8) ? h_num_sections : 8;
+    clEnqueueReadBuffer(queue, sec_word, CL_TRUE, 0,
+        sizeof(uint32_t) * n_read, h2_sec_words, 0, NULL, NULL);
+    clEnqueueReadBuffer(queue, sec_disjunct_hash, CL_TRUE, 0,
+        sizeof(uint64_t) * n_read, h2_sec_djh, 0, NULL, NULL);
+    clEnqueueReadBuffer(queue, sec_count, CL_TRUE, 0,
+        sizeof(double) * n_read, h2_sec_counts, 0, NULL, NULL);
+
+    printf("  Sections created: %u (expected 5)\n", h_num_sections);
+
+    /* Find root section */
+    int found_root = 0;
+    for (uint32_t i = 0; i < n_read; i++) {
+        if (h2_sec_words[i] == 102 && h2_sec_djh[i] == exp_root_djh &&
+            fabs(h2_sec_counts[i] - 1.0) < 0.01) {
+            found_root = 1;
+        }
+    }
+    printf("  Root section (word=102, 4 connectors): %s\n",
+           found_root ? "found" : "MISSING");
+    printf("  Time: %.2f ms\n", t1 - t0);
+
+    int t2_pass = (h_num_sections == 5) && found_root;
+    printf("  %s\n\n", t2_pass ? "PASS" : "FAIL");
+    if (t2_pass) pass_count++; else fail_count++;
+
+    /* ═══════════════════════════════════════════════════════════════
+     *  TEST 3: Multi-sentence batch (no cross-boundary sections)
+     *
+     *  Sentence 1: [10, 20, 30]   edges: (0,1), (1,2) — chain
+     *  Sentence 2: [40, 50, 60]   edges: (0,1), (0,2) — star from 40
+     *
+     *  flat_words  = [10, 20, 30, 40, 50, 60]
+     *  flat_edges  = [(0,1), (1,2), (0,1), (0,2)]
+     *  edge_offsets = [0, 2]
+     *  edge_counts  = [2, 2]
+     *
+     *  Expected: 6 sections (3 per sentence), none spanning boundary
+     * ═══════════════════════════════════════════════════════════════ */
+
+    printf("--- Test 3: Multi-sentence batch ---\n");
+
+    reset_section_pool(queue, sht_keys, sht_values, sec_count,
+                       sec_next_free, total_sections);
+
+    uint32_t t3_words[] = {10, 20, 30, 40, 50, 60};
+    uint32_t t3_sent_offsets[] = {0, 3};
+    uint32_t t3_sent_lengths[] = {3, 3};
+    uint32_t t3_edge_p1[] = {0, 1, 0, 0};
+    uint32_t t3_edge_p2[] = {1, 2, 1, 2};
+    uint32_t t3_edge_offsets[] = {0, 2};
+    uint32_t t3_edge_counts[] = {2, 2};
+    cl_uint t3_ns = 2, t3_tw = 6;
+
+    clReleaseMemObject(d_flat_words);
+    clReleaseMemObject(d_sent_offsets);
+    clReleaseMemObject(d_sent_lengths);
+    clReleaseMemObject(d_edge_p1);
+    clReleaseMemObject(d_edge_p2);
+    clReleaseMemObject(d_edge_offsets);
+    clReleaseMemObject(d_edge_counts);
+
+    d_flat_words = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * t3_tw, t3_words, &err);
+    d_sent_offsets = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * t3_ns, t3_sent_offsets, &err);
+    d_sent_lengths = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * t3_ns, t3_sent_lengths, &err);
+    d_edge_p1 = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * 4, t3_edge_p1, &err);
+    d_edge_p2 = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * 4, t3_edge_p2, &err);
+    d_edge_offsets = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * t3_ns, t3_edge_offsets, &err);
+    d_edge_counts = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * t3_ns, t3_edge_counts, &err);
+
+    clSetKernelArg(k_extract, 0, sizeof(cl_mem), &d_flat_words);
+    clSetKernelArg(k_extract, 1, sizeof(cl_mem), &d_sent_offsets);
+    clSetKernelArg(k_extract, 2, sizeof(cl_mem), &d_sent_lengths);
+    clSetKernelArg(k_extract, 3, sizeof(cl_uint), &t3_ns);
+    clSetKernelArg(k_extract, 4, sizeof(cl_uint), &t3_tw);
+    clSetKernelArg(k_extract, 5, sizeof(cl_mem), &d_edge_p1);
+    clSetKernelArg(k_extract, 6, sizeof(cl_mem), &d_edge_p2);
+    clSetKernelArg(k_extract, 7, sizeof(cl_mem), &d_edge_offsets);
+    clSetKernelArg(k_extract, 8, sizeof(cl_mem), &d_edge_counts);
+
+    t0 = now_ms();
+    gs = ((t3_tw + local_size - 1) / local_size) * local_size;
+    err = clEnqueueNDRangeKernel(queue, k_extract, 1, NULL,
+        &gs, &local_size, 0, NULL, NULL);
+    CL_CHECK(err, "enqueue extract t3");
+    clFinish(queue);
+    t1 = now_ms();
+
+    clEnqueueReadBuffer(queue, sec_next_free, CL_TRUE, 0,
+        sizeof(uint32_t), &h_num_sections, 0, NULL, NULL);
+
+    /* Verify: sentence 1 chain gives 3 sections, sentence 2 star gives 3.
+     * But sentence 1's word 10 has disjunct "20+" and sentence 2's word 50
+     * has disjunct "40-" — different disjuncts. All 6 words produce sections.
+     * Are any (word, disjunct) pairs the same? No — different words, different
+     * disjuncts. So 6 unique sections. */
+    printf("  Sections created: %u (expected 6)\n", h_num_sections);
+    printf("  Time: %.2f ms\n", t1 - t0);
+
+    int t3_pass = (h_num_sections == 6);
+    printf("  %s\n\n", t3_pass ? "PASS" : "FAIL");
+    if (t3_pass) pass_count++; else fail_count++;
+
+    /* ═══════════════════════════════════════════════════════════════
+     *  TEST 4: Duplicate sections — same parse seen twice
+     *
+     *  Process the SAME sentence twice → section counts should be 2.0
+     *
+     *  Sentence: [10, 20, 30]  edges: (0,1), (1,2)
+     *  Run extract_sections TWICE without resetting.
+     *
+     *  Expected: 3 sections, each with count = 2.0
+     * ═══════════════════════════════════════════════════════════════ */
+
+    printf("--- Test 4: Duplicate sections (same parse twice) ---\n");
+
+    reset_section_pool(queue, sht_keys, sht_values, sec_count,
+                       sec_next_free, total_sections);
+
+    uint32_t t4_words[] = {10, 20, 30};
+    uint32_t t4_sent_offsets[] = {0};
+    uint32_t t4_sent_lengths[] = {3};
+    uint32_t t4_edge_p1[] = {0, 1};
+    uint32_t t4_edge_p2[] = {1, 2};
+    uint32_t t4_edge_offsets[] = {0};
+    uint32_t t4_edge_counts[] = {2};
+    cl_uint t4_ns = 1, t4_tw = 3;
+
+    clReleaseMemObject(d_flat_words);
+    clReleaseMemObject(d_sent_offsets);
+    clReleaseMemObject(d_sent_lengths);
+    clReleaseMemObject(d_edge_p1);
+    clReleaseMemObject(d_edge_p2);
+    clReleaseMemObject(d_edge_offsets);
+    clReleaseMemObject(d_edge_counts);
+
+    d_flat_words = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * t4_tw, t4_words, &err);
+    d_sent_offsets = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * t4_ns, t4_sent_offsets, &err);
+    d_sent_lengths = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * t4_ns, t4_sent_lengths, &err);
+    d_edge_p1 = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * 2, t4_edge_p1, &err);
+    d_edge_p2 = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * 2, t4_edge_p2, &err);
+    d_edge_offsets = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * t4_ns, t4_edge_offsets, &err);
+    d_edge_counts = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * t4_ns, t4_edge_counts, &err);
+
+    clSetKernelArg(k_extract, 0, sizeof(cl_mem), &d_flat_words);
+    clSetKernelArg(k_extract, 1, sizeof(cl_mem), &d_sent_offsets);
+    clSetKernelArg(k_extract, 2, sizeof(cl_mem), &d_sent_lengths);
+    clSetKernelArg(k_extract, 3, sizeof(cl_uint), &t4_ns);
+    clSetKernelArg(k_extract, 4, sizeof(cl_uint), &t4_tw);
+    clSetKernelArg(k_extract, 5, sizeof(cl_mem), &d_edge_p1);
+    clSetKernelArg(k_extract, 6, sizeof(cl_mem), &d_edge_p2);
+    clSetKernelArg(k_extract, 7, sizeof(cl_mem), &d_edge_offsets);
+    clSetKernelArg(k_extract, 8, sizeof(cl_mem), &d_edge_counts);
+
+    /* Run TWICE */
+    gs = ((t4_tw + local_size - 1) / local_size) * local_size;
+    err = clEnqueueNDRangeKernel(queue, k_extract, 1, NULL,
+        &gs, &local_size, 0, NULL, NULL);
+    CL_CHECK(err, "enqueue extract t4a");
+    clFinish(queue);
+
+    err = clEnqueueNDRangeKernel(queue, k_extract, 1, NULL,
+        &gs, &local_size, 0, NULL, NULL);
+    CL_CHECK(err, "enqueue extract t4b");
+    clFinish(queue);
+
+    clEnqueueReadBuffer(queue, sec_next_free, CL_TRUE, 0,
+        sizeof(uint32_t), &h_num_sections, 0, NULL, NULL);
+
+    /* Read counts */
+    double h4_counts[4];
+    clEnqueueReadBuffer(queue, sec_count, CL_TRUE, 0,
+        sizeof(double) * h_num_sections, h4_counts, 0, NULL, NULL);
+
+    printf("  Sections created: %u (expected 3 — dedup works)\n", h_num_sections);
+
+    int all_count_2 = 1;
+    for (uint32_t i = 0; i < h_num_sections; i++) {
+        printf("  Section %u count: %.1f (expected 2.0)\n", i, h4_counts[i]);
+        if (fabs(h4_counts[i] - 2.0) > 0.01) all_count_2 = 0;
+    }
+
+    int t4_pass = (h_num_sections == 3) && all_count_2;
+    printf("  %s\n\n", t4_pass ? "PASS" : "FAIL");
+    if (t4_pass) pass_count++; else fail_count++;
+
+    /* ═══════════════════════════════════════════════════════════════
+     *  TEST 5: Readback kernel
+     *
+     *  Use read_sections to verify section pool data matches
+     *  what extract_sections stored. (Reuse state from test 4.)
+     * ═══════════════════════════════════════════════════════════════ */
+
+    printf("--- Test 5: Readback kernel ---\n");
+
+    uint32_t n_secs = h_num_sections;
+    cl_mem d_out_word = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY,
+        sizeof(uint32_t) * n_secs, NULL, &err);
+    cl_mem d_out_djh = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY,
+        sizeof(uint64_t) * n_secs, NULL, &err);
+    cl_mem d_out_count = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY,
+        sizeof(double) * n_secs, NULL, &err);
+
+    clSetKernelArg(k_read, 0, sizeof(cl_mem), &sec_word);
+    clSetKernelArg(k_read, 1, sizeof(cl_mem), &sec_disjunct_hash);
+    clSetKernelArg(k_read, 2, sizeof(cl_mem), &sec_count);
+    clSetKernelArg(k_read, 3, sizeof(cl_mem), &d_out_word);
+    clSetKernelArg(k_read, 4, sizeof(cl_mem), &d_out_djh);
+    clSetKernelArg(k_read, 5, sizeof(cl_mem), &d_out_count);
+    clSetKernelArg(k_read, 6, sizeof(cl_uint), &n_secs);
+
+    gs = ((n_secs + local_size - 1) / local_size) * local_size;
+    err = clEnqueueNDRangeKernel(queue, k_read, 1, NULL,
+        &gs, &local_size, 0, NULL, NULL);
+    CL_CHECK(err, "enqueue read_sections");
+    clFinish(queue);
+
+    uint32_t rb_words[4];
+    uint64_t rb_djh[4];
+    double   rb_counts[4];
+    clEnqueueReadBuffer(queue, d_out_word, CL_TRUE, 0,
+        sizeof(uint32_t) * n_secs, rb_words, 0, NULL, NULL);
+    clEnqueueReadBuffer(queue, d_out_djh, CL_TRUE, 0,
+        sizeof(uint64_t) * n_secs, rb_djh, 0, NULL, NULL);
+    clEnqueueReadBuffer(queue, d_out_count, CL_TRUE, 0,
+        sizeof(double) * n_secs, rb_counts, 0, NULL, NULL);
+
+    /* Should match test 4's data */
+    uint32_t h5_words[4];
+    uint64_t h5_djh[4];
+    double   h5_counts[4];
+    clEnqueueReadBuffer(queue, sec_word, CL_TRUE, 0,
+        sizeof(uint32_t) * n_secs, h5_words, 0, NULL, NULL);
+    clEnqueueReadBuffer(queue, sec_disjunct_hash, CL_TRUE, 0,
+        sizeof(uint64_t) * n_secs, h5_djh, 0, NULL, NULL);
+    clEnqueueReadBuffer(queue, sec_count, CL_TRUE, 0,
+        sizeof(double) * n_secs, h5_counts, 0, NULL, NULL);
+
+    int readback_match = 1;
+    for (uint32_t i = 0; i < n_secs; i++) {
+        if (rb_words[i] != h5_words[i] || rb_djh[i] != h5_djh[i] ||
+            fabs(rb_counts[i] - h5_counts[i]) > 0.01)
+            readback_match = 0;
+    }
+    printf("  Readback matches direct read: %s\n", readback_match ? "yes" : "NO");
+
+    int t5_pass = readback_match;
+    printf("  %s\n\n", t5_pass ? "PASS" : "FAIL");
+    if (t5_pass) pass_count++; else fail_count++;
+
+    clReleaseMemObject(d_out_word);
+    clReleaseMemObject(d_out_djh);
+    clReleaseMemObject(d_out_count);
+
+    /* ═══════════════════════════════════════════════════════════════
+     *  TEST 6: Benchmark — 1000 sentences with random MST edges
+     *
+     *  Each sentence: 10-20 words, 9-19 MST edges (chain parse)
+     *  Total: ~15000 words, ~14000 edges
+     *
+     *  Measures extract_sections throughput.
+     * ═══════════════════════════════════════════════════════════════ */
+
+    printf("--- Test 6: Benchmark (1000 sentences) ---\n");
+
+    reset_section_pool(queue, sht_keys, sht_values, sec_count,
+                       sec_next_free, total_sections);
+
+    srand(42);
+    uint32_t bench_ns = 1000;
+    uint32_t vocab_size = 500;  /* word pool indices 0..499 */
+
+    /* Generate sentences */
+    uint32_t max_words = bench_ns * 25;
+    uint32_t max_edges = bench_ns * 25;
+    uint32_t* b_words = malloc(sizeof(uint32_t) * max_words);
+    uint32_t* b_sent_offsets = malloc(sizeof(uint32_t) * bench_ns);
+    uint32_t* b_sent_lengths = malloc(sizeof(uint32_t) * bench_ns);
+    uint32_t* b_edge_p1 = malloc(sizeof(uint32_t) * max_edges);
+    uint32_t* b_edge_p2 = malloc(sizeof(uint32_t) * max_edges);
+    uint32_t* b_edge_offsets = malloc(sizeof(uint32_t) * bench_ns);
+    uint32_t* b_edge_counts = malloc(sizeof(uint32_t) * bench_ns);
+
+    uint32_t word_pos = 0, edge_pos = 0;
+    for (uint32_t s = 0; s < bench_ns; s++) {
+        uint32_t slen = 10 + (rand() % 11);  /* 10-20 words */
+        b_sent_offsets[s] = word_pos;
+        b_sent_lengths[s] = slen;
+        b_edge_offsets[s] = edge_pos;
+        b_edge_counts[s] = slen - 1;  /* chain parse */
+
+        for (uint32_t w = 0; w < slen; w++) {
+            b_words[word_pos++] = rand() % vocab_size;
+        }
+        /* Chain parse: pos 0-1, 1-2, ..., (slen-2)-(slen-1) */
+        for (uint32_t e = 0; e < slen - 1; e++) {
+            b_edge_p1[edge_pos] = e;
+            b_edge_p2[edge_pos] = e + 1;
+            edge_pos++;
+        }
+    }
+
+    printf("  Total words: %u\n", word_pos);
+    printf("  Total edges: %u\n", edge_pos);
+
+    clReleaseMemObject(d_flat_words);
+    clReleaseMemObject(d_sent_offsets);
+    clReleaseMemObject(d_sent_lengths);
+    clReleaseMemObject(d_edge_p1);
+    clReleaseMemObject(d_edge_p2);
+    clReleaseMemObject(d_edge_offsets);
+    clReleaseMemObject(d_edge_counts);
+
+    d_flat_words = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * word_pos, b_words, &err);
+    d_sent_offsets = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * bench_ns, b_sent_offsets, &err);
+    d_sent_lengths = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * bench_ns, b_sent_lengths, &err);
+    d_edge_p1 = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * edge_pos, b_edge_p1, &err);
+    d_edge_p2 = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * edge_pos, b_edge_p2, &err);
+    d_edge_offsets = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * bench_ns, b_edge_offsets, &err);
+    d_edge_counts = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        sizeof(uint32_t) * bench_ns, b_edge_counts, &err);
+
+    cl_uint tw_bench = word_pos;
+    clSetKernelArg(k_extract, 0, sizeof(cl_mem), &d_flat_words);
+    clSetKernelArg(k_extract, 1, sizeof(cl_mem), &d_sent_offsets);
+    clSetKernelArg(k_extract, 2, sizeof(cl_mem), &d_sent_lengths);
+    clSetKernelArg(k_extract, 3, sizeof(cl_uint), &bench_ns);
+    clSetKernelArg(k_extract, 4, sizeof(cl_uint), &tw_bench);
+    clSetKernelArg(k_extract, 5, sizeof(cl_mem), &d_edge_p1);
+    clSetKernelArg(k_extract, 6, sizeof(cl_mem), &d_edge_p2);
+    clSetKernelArg(k_extract, 7, sizeof(cl_mem), &d_edge_offsets);
+    clSetKernelArg(k_extract, 8, sizeof(cl_mem), &d_edge_counts);
+
+    /* Warm up */
+    gs = ((tw_bench + local_size - 1) / local_size) * local_size;
+    clEnqueueNDRangeKernel(queue, k_extract, 1, NULL,
+        &gs, &local_size, 0, NULL, NULL);
+    clFinish(queue);
+
+    /* Reset for actual benchmark */
+    reset_section_pool(queue, sht_keys, sht_values, sec_count,
+                       sec_next_free, total_sections);
+
+    t0 = now_ms();
+    err = clEnqueueNDRangeKernel(queue, k_extract, 1, NULL,
+        &gs, &local_size, 0, NULL, NULL);
+    CL_CHECK(err, "enqueue benchmark");
+    clFinish(queue);
+    t1 = now_ms();
+
+    clEnqueueReadBuffer(queue, sec_next_free, CL_TRUE, 0,
+        sizeof(uint32_t), &h_num_sections, 0, NULL, NULL);
+
+    double elapsed = t1 - t0;
+    double secs_per_sec = bench_ns / (elapsed / 1000.0);
+    double words_per_sec = word_pos / (elapsed / 1000.0);
+
+    printf("  Sections created: %u\n", h_num_sections);
+    printf("  Time: %.2f ms\n", elapsed);
+    printf("  Throughput: %.0f sentences/sec, %.0f words/sec\n",
+           secs_per_sec, words_per_sec);
+    printf("  Throughput: %.1fM sections/sec\n",
+           h_num_sections / (elapsed / 1000.0) / 1e6);
+
+    /* Sanity: every word should produce a section in a chain parse,
+     * but some (word, disjunct) pairs may collide. So sections < words
+     * but > 0. */
+    int t6_pass = (h_num_sections > 0) && (h_num_sections <= word_pos) &&
+                  (elapsed < 1000.0);
+    printf("  %s\n\n", t6_pass ? "PASS" : "FAIL");
+    if (t6_pass) pass_count++; else fail_count++;
+
+    /* ─── Summary ─── */
+
+    printf("=== Results: %d PASS, %d FAIL ===\n", pass_count, fail_count);
+
+    /* Cleanup */
+    free(b_words); free(b_sent_offsets); free(b_sent_lengths);
+    free(b_edge_p1); free(b_edge_p2); free(b_edge_offsets); free(b_edge_counts);
+    free(ht_src); free(as_src); free(sc_src); free(combined);
+
+    clReleaseMemObject(d_flat_words);
+    clReleaseMemObject(d_sent_offsets);
+    clReleaseMemObject(d_sent_lengths);
+    clReleaseMemObject(d_edge_p1);
+    clReleaseMemObject(d_edge_p2);
+    clReleaseMemObject(d_edge_offsets);
+    clReleaseMemObject(d_edge_counts);
+    clReleaseMemObject(sht_keys);
+    clReleaseMemObject(sht_values);
+    clReleaseMemObject(sec_word);
+    clReleaseMemObject(sec_disjunct_hash);
+    clReleaseMemObject(sec_count);
+    clReleaseMemObject(sec_next_free);
+    clReleaseMemObject(total_sections);
+    clReleaseKernel(k_extract);
+    clReleaseKernel(k_read);
+    clReleaseProgram(program);
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return fail_count > 0 ? 1 : 0;
+}