From 844ee469b1646c32c9ee3bb4f2d0af3cd3a1e593 Mon Sep 17 00:00:00 2001
From: Robert Sayre <sayrer@gmail.com>
Date: Sat, 28 Feb 2026 12:40:32 -0800
Subject: [PATCH 1/7] Optimize intern() to eliminate per-call allocations

Replace the dedup map with a generation counter (reusing
faState.closureSetGen), and add reusable scratch buffers for the
sorted-uniques slice and key bytes. On cache hits the compiler's
string(bytes) map-lookup optimization avoids the key allocation
entirely. Only cache misses allocate (key string + stored slice).

Reduces nfa2Dfa allocations by ~35-40% and wall time by ~12%.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 state_lists.go | 49 +++++++++++++++++++++++++++----------------------
 1 file changed, 27 insertions(+), 22 deletions(-)

diff --git a/state_lists.go b/state_lists.go
index d7e5f88..2859c26 100644
--- a/state_lists.go
+++ b/state_lists.go
@@ -12,6 +12,9 @@ import (
 type stateLists struct {
 	lists     map[string][]*faState
 	dfaStates map[string]*faState
+	// Scratch space reused across intern() calls
+	sortBuf []*faState // reusable sorted buffer
+	keyBuf  []byte     // reusable key bytes buffer
 }
 
 func newStateLists() *stateLists {
@@ -27,40 +30,42 @@ func newStateLists() *stateLists {
 // which either has already been computed for the set or is created and empty, and
 // a boolean indicating whether the DFA state has already been computed or not.
 func (sl *stateLists) intern(list []*faState) ([]*faState, *faState, bool) {
-	// dedupe the collection
-	uniquemap := make(map[*faState]bool)
+	// dedupe using generation counter instead of a map
+	closureGeneration++
+	gen := closureGeneration
+	sl.sortBuf = sl.sortBuf[:0]
 	for _, state := range list {
-		uniquemap[state] = true
-	}
-	uniques := make([]*faState, 0, len(uniquemap))
-	for unique := range uniquemap {
-		uniques = append(uniques, unique)
+		if state.closureSetGen != gen {
+			state.closureSetGen = gen
+			sl.sortBuf = append(sl.sortBuf, state)
+		}
 	}
 
-	// compute a key representing the set. Disclosure: My first use of an AI to help
-	// code. I had done this by Sprintf("%p")-ing the addresses and sorting/concatenating
-	// the strings. Which works fine but grabbing the raw bytes and pretending they're
-	// a string is going to produce keys that are exactly half the size
-	keyBytes := make([]byte, 0, len(uniques)*8)
-	slices.SortFunc(uniques, func(a, b *faState) int {
+	// compute a key representing the set
+	slices.SortFunc(sl.sortBuf, func(a, b *faState) int {
 		return cmp.Compare(uintptr(unsafe.Pointer(a)), uintptr(unsafe.Pointer(b)))
 	})
 
-	for _, state := range uniques {
+	sl.keyBuf = sl.keyBuf[:0]
+	for _, state := range sl.sortBuf {
 		addr := uintptr(unsafe.Pointer(state))
 		for i := 0; i < 8; i++ {
-			keyBytes = append(keyBytes, byte(addr>>(i*8)))
+			sl.keyBuf = append(sl.keyBuf, byte(addr>>(i*8)))
 		}
 	}
-	key := string(keyBytes)
 
-	// either we have already seen this or not
-	list, exists := sl.lists[key]
-	if exists {
-		return list, sl.dfaStates[key], true
+	// string(sl.keyBuf) in a map lookup is optimized by the compiler to avoid allocation
+	if list, exists := sl.lists[string(sl.keyBuf)]; exists {
+		return list, sl.dfaStates[string(sl.keyBuf)], true
 	}
+
+	// cache miss: allocate owned copies for the map
+	key := string(sl.keyBuf)
+	stored := make([]*faState, len(sl.sortBuf))
+	copy(stored, sl.sortBuf)
+
 	dfaState := &faState{table: newSmallTable()}
-	sl.lists[key] = uniques
+	sl.lists[key] = stored
 	sl.dfaStates[key] = dfaState
-	return uniques, dfaState, false
+	return stored, dfaState, false
 }

From dccf1baee6b1c3ea893941dfe9f07b46b94e288d Mon Sep 17 00:00:00 2001
From: Robert Sayre <sayrer@gmail.com>
Date: Sat, 28 Feb 2026 12:47:47 -0800
Subject: [PATCH 2/7] Batch unpack/pack in n2dNode for 5-9x nfa2Dfa speedup

Instead of calling addByteStep (unpack, set one byte, repack) for
each of up to 256 byte values, unpack the DFA table once, set all
transitions into the unpacked table, then pack once at the end.

Also adds BenchmarkNfa2Dfa to measure the nfa2Dfa conversion cost
across patterns with varying wildcard counts.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 nfa.go                    |  6 ++++--
 state_lists_bench_test.go | 37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 2 deletions(-)
 create mode 100644 state_lists_bench_test.go

diff --git a/nfa.go b/nfa.go
index b6ed6fa..ae84c26 100644
--- a/nfa.go
+++ b/nfa.go
@@ -193,7 +193,8 @@ func n2dNode(rawNStates []*faState, sList *stateLists) *faState {
 		nUnpacked[i] = unpackTable(nState.table)
 	}
 
-	// for each byte value
+	// unpack the DFA table once, set all byte transitions, then pack once
+	dfaUnpacked := unpackTable(dfaState.table)
 	for utf8byte := 0; utf8byte < byteCeiling; utf8byte++ {
 		var rawStates []*faState
 
@@ -208,9 +209,10 @@ func n2dNode(rawNStates []*faState, sList *stateLists) *faState {
 		// if there were any transitions on this byte value
 		if len(rawStates) > 0 {
 			// recurse, get the DFA state for the transitions and plug it into this state
-			dfaState.table.addByteStep(byte(utf8byte), n2dNode(rawStates, sList))
+			dfaUnpacked[utf8byte] = n2dNode(rawStates, sList)
 		}
 	}
+	dfaState.table.pack(dfaUnpacked)
 
 	// load up transitions (build-time, allocation is fine)
 	seen := make(map[*fieldMatcher]bool)
diff --git a/state_lists_bench_test.go b/state_lists_bench_test.go
new file mode 100644
index 0000000..6d15b35
--- /dev/null
+++ b/state_lists_bench_test.go
@@ -0,0 +1,37 @@
+//go:build go1.24
+
+package quamina
+
+import (
+	"fmt"
+	"testing"
+)
+
+// BenchmarkNfa2Dfa measures the cost of nfa2Dfa conversion, where intern()
+// in state_lists.go typically dominates. Patterns with more wildcards produce
+// larger epsilon closures and more intern() calls.
+func BenchmarkNfa2Dfa(b *testing.B) {
+	patterns := []struct {
+		name    string
+		pattern string
+	}{
+		{"single_star", "*foo*"},
+		{"two_stars", "*foo*bar*"},
+		{"three_stars", "*a*b*c*"},
+		{"five_stars", "*a*b*c*d*e*"},
+		{"eight_stars", "*a*b*c*d*e*f*g*h*"},
+	}
+
+	pp := newPrettyPrinter(12345)
+	for _, tc := range patterns {
+		b.Run(tc.name, func(b *testing.B) {
+			nfa, _ := makeShellStyleFA([]byte(fmt.Sprintf(`"%s"`, tc.pattern)), pp)
+			epsilonClosure(nfa)
+			b.ResetTimer()
+			b.ReportAllocs()
+			for b.Loop() {
+				nfa2Dfa(nfa)
+			}
+		})
+	}
+}

From 288402dd9abbf83b46a6658a291b97807c22e76d Mon Sep 17 00:00:00 2001
From: Robert Sayre <sayrer@gmail.com>
Date: Sat, 28 Feb 2026 12:55:23 -0800
Subject: [PATCH 3/7] Use PutUint64 for intern() key encoding

Replace the byte-by-byte append loop (8 appends per state, each
with bounds checks) with a pre-sized buffer and a single
binary.LittleEndian.PutUint64 per state. ~20% faster in nfa2Dfa.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 state_lists.go | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/state_lists.go b/state_lists.go
index 2859c26..5491c0e 100644
--- a/state_lists.go
+++ b/state_lists.go
@@ -2,6 +2,7 @@ package quamina
 
 import (
 	"cmp"
+	"encoding/binary"
 	"slices"
 	"unsafe"
 )
@@ -46,12 +47,14 @@ func (sl *stateLists) intern(list []*faState) ([]*faState, *faState, bool) {
 		return cmp.Compare(uintptr(unsafe.Pointer(a)), uintptr(unsafe.Pointer(b)))
 	})
 
-	sl.keyBuf = sl.keyBuf[:0]
-	for _, state := range sl.sortBuf {
-		addr := uintptr(unsafe.Pointer(state))
-		for i := 0; i < 8; i++ {
-			sl.keyBuf = append(sl.keyBuf, byte(addr>>(i*8)))
-		}
+	needed := len(sl.sortBuf) * 8
+	if cap(sl.keyBuf) < needed {
+		sl.keyBuf = make([]byte, needed)
+	} else {
+		sl.keyBuf = sl.keyBuf[:needed]
+	}
+	for i, state := range sl.sortBuf {
+		binary.LittleEndian.PutUint64(sl.keyBuf[i*8:], uint64(uintptr(unsafe.Pointer(state))))
 	}
 
 	// string(sl.keyBuf) in a map lookup is optimized by the compiler to avoid allocation

From b04a2f5260a39f1a5c53ab4bd8b6494112b2e73c Mon Sep 17 00:00:00 2001
From: Robert Sayre <sayrer@gmail.com>
Date: Sat, 28 Feb 2026 13:00:05 -0800
Subject: [PATCH 4/7] Merge intern() maps into single entries map

Replace separate lists and dfaStates maps with a single map of
internEntry structs, eliminating the second map lookup on cache
hits. ~9-18% faster in nfa2Dfa.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 state_lists.go | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/state_lists.go b/state_lists.go
index 5491c0e..aaf9e76 100644
--- a/state_lists.go
+++ b/state_lists.go
@@ -7,12 +7,16 @@ import (
 	"unsafe"
 )
 
+type internEntry struct {
+	states   []*faState
+	dfaState *faState
+}
+
 // The idea is that in we are going to be computing the epsilon closures of NFA states, which
 // will be slices of states. There will be duplicate slices and we want to deduplicate. There's
 // probably a more idiomatic and efficient way to do this.
 type stateLists struct {
-	lists     map[string][]*faState
-	dfaStates map[string]*faState
+	entries map[string]internEntry
 	// Scratch space reused across intern() calls
 	sortBuf []*faState // reusable sorted buffer
 	keyBuf  []byte     // reusable key bytes buffer
@@ -20,8 +24,7 @@ type stateLists struct {
 
 func newStateLists() *stateLists {
 	return &stateLists{
-		lists:     make(map[string][]*faState),
-		dfaStates: make(map[string]*faState),
+		entries: make(map[string]internEntry),
 	}
 }
 
@@ -58,8 +61,8 @@ func (sl *stateLists) intern(list []*faState) ([]*faState, *faState, bool) {
 	}
 
 	// string(sl.keyBuf) in a map lookup is optimized by the compiler to avoid allocation
-	if list, exists := sl.lists[string(sl.keyBuf)]; exists {
-		return list, sl.dfaStates[string(sl.keyBuf)], true
+	if entry, exists := sl.entries[string(sl.keyBuf)]; exists {
+		return entry.states, entry.dfaState, true
 	}
 
 	// cache miss: allocate owned copies for the map
@@ -68,7 +71,6 @@ func (sl *stateLists) intern(list []*faState) ([]*faState, *faState, bool) {
 	copy(stored, sl.sortBuf)
 
 	dfaState := &faState{table: newSmallTable()}
-	sl.lists[key] = stored
-	sl.dfaStates[key] = dfaState
+	sl.entries[key] = internEntry{states: stored, dfaState: dfaState}
 	return stored, dfaState, false
 }

From ad771da5a8e083e6bd676113c5839313157039bb Mon Sep 17 00:00:00 2001
From: Robert Sayre <sayrer@gmail.com>
Date: Sat, 28 Feb 2026 13:04:40 -0800
Subject: [PATCH 5/7] Reuse rawStates slice across byte loop in n2dNode

Hoist the rawStates slice above the per-byte loop in n2dNode (one
iteration per value below byteCeiling) and reset it with [:0] each
iteration instead of starting from a fresh nil slice that append has
to grow. Eliminates ~95% of nfa2Dfa allocations, ~35-48% faster.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 nfa.go | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nfa.go b/nfa.go
index ae84c26..9e7fffd 100644
--- a/nfa.go
+++ b/nfa.go
@@ -195,8 +195,9 @@ func n2dNode(rawNStates []*faState, sList *stateLists) *faState {
 
 	// unpack the DFA table once, set all byte transitions, then pack once
 	dfaUnpacked := unpackTable(dfaState.table)
+	rawStates := make([]*faState, 0, len(ingredients))
 	for utf8byte := 0; utf8byte < byteCeiling; utf8byte++ {
-		var rawStates []*faState
+		rawStates = rawStates[:0]
 
 		// for each of the unique states
 		for _, unpackedNState := range nUnpacked {

From 137fe99616e1fb764cc9732378b2b16dcb1f51bc Mon Sep 17 00:00:00 2001
From: Robert Sayre <sayrer@gmail.com>
Date: Sat, 28 Feb 2026 13:06:40 -0800
Subject: [PATCH 6/7] Add multi-star test cases to TestNfa2Dfa

Cover the same patterns used in BenchmarkNfa2Dfa to verify
correctness of the optimized intern() and n2dNode paths with
larger epsilon closures and heavier dedup.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 nfa_test.go | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/nfa_test.go b/nfa_test.go
index 32e4259..88d92f4 100644
--- a/nfa_test.go
+++ b/nfa_test.go
@@ -111,6 +111,32 @@ func TestNfa2Dfa(t *testing.T) {
 			shoulds: []string{"abc", "abcfoo"},
 			nopes:   []string{"xabc", "abxbar"},
 		},
+		// multi-star patterns exercise intern() dedup more heavily
+		{
+			pattern: "*foo*",
+			shoulds: []string{"foo", "xfoo", "foox", "xfoox", "foofoofoo"},
+			nopes:   []string{"bar", "fo", "ffo"},
+		},
+		{
+			pattern: "*foo*bar*",
+			shoulds: []string{"foobar", "xfooybar", "foobarbaz", "xxfooxxbarxx"},
+			nopes:   []string{"barfoo", "foo", "bar", "fobar"},
+		},
+		{
+			pattern: "*a*b*c*",
+			shoulds: []string{"abc", "xaxbxcx", "abc123", "123abc", "aabbcc"},
+			nopes:   []string{"ab", "ac", "bc", "cba"},
+		},
+		{
+			pattern: "*a*b*c*d*e*",
+			shoulds: []string{"abcde", "xaxbxcxdxex", "aabbccddee"},
+			nopes:   []string{"abcd", "edcba", "abce"},
+		},
+		{
+			pattern: "*a*b*c*d*e*f*g*h*",
+			shoulds: []string{"abcdefgh", "xaxbxcxdxexfxgxhx"},
+			nopes:   []string{"abcdefg", "hgfedcba"},
+		},
 	}
 	pp := newPrettyPrinter(4567)
 	transitions := []*fieldMatcher{}

From c2f9c6194cc67934af5cdcab4628cc1f2fe980b0 Mon Sep 17 00:00:00 2001
From: Robert Sayre <sayrer@gmail.com>
Date: Sat, 28 Feb 2026 13:19:35 -0800
Subject: [PATCH 7/7] Add inline comments explaining each optimization

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 nfa.go         | 5 ++++-
 state_lists.go | 8 +++++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/nfa.go b/nfa.go
index 9e7fffd..7fa5800 100644
--- a/nfa.go
+++ b/nfa.go
@@ -193,7 +193,10 @@ func n2dNode(rawNStates []*faState, sList *stateLists) *faState {
 		nUnpacked[i] = unpackTable(nState.table)
 	}
 
-	// unpack the DFA table once, set all byte transitions, then pack once
+	// Unpack the DFA table once, set all byte transitions, then pack once —
+	// the old code called addByteStep per byte which unpacked and repacked
+	// for each of up to 256 values. rawStates is allocated once and reset
+	// with [:0] each iteration to avoid per-byte-value slice allocation.
 	dfaUnpacked := unpackTable(dfaState.table)
 	rawStates := make([]*faState, 0, len(ingredients))
 	for utf8byte := 0; utf8byte < byteCeiling; utf8byte++ {
diff --git a/state_lists.go b/state_lists.go
index aaf9e76..54c664e 100644
--- a/state_lists.go
+++ b/state_lists.go
@@ -7,6 +7,8 @@ import (
 	"unsafe"
 )
 
+// internEntry bundles the list and DFA state into one map value so that
+// cache hits require a single map lookup instead of two.
 type internEntry struct {
 	states   []*faState
 	dfaState *faState
@@ -34,7 +36,9 @@ func newStateLists() *stateLists {
 // which either has already been computed for the set or is created and empty, and
 // a boolean indicating whether the DFA state has already been computed or not.
 func (sl *stateLists) intern(list []*faState) ([]*faState, *faState, bool) {
-	// dedupe using generation counter instead of a map
+	// Dedupe using the global generation counter and faState.closureSetGen
+	// instead of allocating a map per call. Safe to reuse closureSetGen
+	// because nfa2Dfa runs after epsilon closure computation is complete.
 	closureGeneration++
 	gen := closureGeneration
 	sl.sortBuf = sl.sortBuf[:0]
@@ -50,6 +54,8 @@ func (sl *stateLists) intern(list []*faState) ([]*faState, *faState, bool) {
 		return cmp.Compare(uintptr(unsafe.Pointer(a)), uintptr(unsafe.Pointer(b)))
 	})
 
+	// Pre-size the key buffer and write pointers with PutUint64 instead of
+	// appending byte-by-byte, avoiding 8 append calls and bounds checks per state.
 	needed := len(sl.sortBuf) * 8
 	if cap(sl.keyBuf) < needed {
 		sl.keyBuf = make([]byte, needed)