Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions nfa.go
Original file line number Diff line number Diff line change
Expand Up @@ -193,9 +193,14 @@ func n2dNode(rawNStates []*faState, sList *stateLists) *faState {
nUnpacked[i] = unpackTable(nState.table)
}

// for each byte value
// Unpack the DFA table once, set all byte transitions, then pack once —
// the old code called addByteStep per byte which unpacked and repacked
// for each of up to 256 values. rawStates is allocated once and reset
// with [:0] each iteration to avoid per-byte-value slice allocation.
dfaUnpacked := unpackTable(dfaState.table)
rawStates := make([]*faState, 0, len(ingredients))
for utf8byte := 0; utf8byte < byteCeiling; utf8byte++ {
var rawStates []*faState
rawStates = rawStates[:0]

// for each of the unique states
for _, unpackedNState := range nUnpacked {
Expand All @@ -208,9 +213,10 @@ func n2dNode(rawNStates []*faState, sList *stateLists) *faState {
// if there were any transitions on this byte value
if len(rawStates) > 0 {
// recurse, get the DFA state for the transitions and plug it into this state
dfaState.table.addByteStep(byte(utf8byte), n2dNode(rawStates, sList))
dfaUnpacked[utf8byte] = n2dNode(rawStates, sList)
}
}
dfaState.table.pack(dfaUnpacked)

// load up transitions (build-time, allocation is fine)
seen := make(map[*fieldMatcher]bool)
Expand Down
26 changes: 26 additions & 0 deletions nfa_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,32 @@ func TestNfa2Dfa(t *testing.T) {
shoulds: []string{"abc", "abcfoo"},
nopes: []string{"xabc", "abxbar"},
},
// multi-star patterns exercise intern() dedup more heavily
{
pattern: "*foo*",
shoulds: []string{"foo", "xfoo", "foox", "xfoox", "foofoofoo"},
nopes: []string{"bar", "fo", "ffo"},
},
{
pattern: "*foo*bar*",
shoulds: []string{"foobar", "xfooybar", "foobarbaz", "xxfooxxbarxx"},
nopes: []string{"barfoo", "foo", "bar", "fobar"},
},
{
pattern: "*a*b*c*",
shoulds: []string{"abc", "xaxbxcx", "abc123", "123abc", "aabbcc"},
nopes: []string{"ab", "ac", "bc", "cba"},
},
{
pattern: "*a*b*c*d*e*",
shoulds: []string{"abcde", "xaxbxcxdxex", "aabbccddee"},
nopes: []string{"abcd", "edcba", "abce"},
},
{
pattern: "*a*b*c*d*e*f*g*h*",
shoulds: []string{"abcdefgh", "xaxbxcxdxexfxgxhx"},
nopes: []string{"abcdefg", "hgfedcba"},
},
}
pp := newPrettyPrinter(4567)
transitions := []*fieldMatcher{}
Expand Down
76 changes: 46 additions & 30 deletions state_lists.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,31 @@ package quamina

import (
"cmp"
"encoding/binary"
"slices"
"unsafe"
)

// internEntry bundles the list and DFA state into one map value so that
// cache hits require a single map lookup instead of two.
type internEntry struct {
states []*faState
dfaState *faState
}

// The idea is that in we are going to be computing the epsilon closures of NFA states, which
// will be slices of states. There will be duplicate slices and we want to deduplicate. There's
// probably a more idiomatic and efficient way to do this.
type stateLists struct {
lists map[string][]*faState
dfaStates map[string]*faState
entries map[string]internEntry
// Scratch space reused across intern() calls
sortBuf []*faState // reusable sorted buffer
keyBuf []byte // reusable key bytes buffer
}

func newStateLists() *stateLists {
return &stateLists{
lists: make(map[string][]*faState),
dfaStates: make(map[string]*faState),
entries: make(map[string]internEntry),
}
}

Expand All @@ -27,40 +36,47 @@ func newStateLists() *stateLists {
// which either has already been computed for the set or is created and empty, and
// a boolean indicating whether the DFA state has already been computed or not.
func (sl *stateLists) intern(list []*faState) ([]*faState, *faState, bool) {
// dedupe the collection
uniquemap := make(map[*faState]bool)
// Dedupe using the global generation counter and faState.closureSetGen
// instead of allocating a map per call. Safe to reuse closureSetGen
// because nfa2Dfa runs after epsilon closure computation is complete.
closureGeneration++
gen := closureGeneration
sl.sortBuf = sl.sortBuf[:0]
for _, state := range list {
uniquemap[state] = true
}
uniques := make([]*faState, 0, len(uniquemap))
for unique := range uniquemap {
uniques = append(uniques, unique)
if state.closureSetGen != gen {
state.closureSetGen = gen
sl.sortBuf = append(sl.sortBuf, state)
}
}

// compute a key representing the set. Disclosure: My first use of an AI to help
// code. I had done this by Sprintf("%p")-ing the addresses and sorting/concatenating
// the strings. Which works fine but grabbing the raw bytes and pretending they're
// a string is going to produce keys that are exactly half the size
keyBytes := make([]byte, 0, len(uniques)*8)
slices.SortFunc(uniques, func(a, b *faState) int {
// compute a key representing the set
slices.SortFunc(sl.sortBuf, func(a, b *faState) int {
return cmp.Compare(uintptr(unsafe.Pointer(a)), uintptr(unsafe.Pointer(b)))
})

for _, state := range uniques {
addr := uintptr(unsafe.Pointer(state))
for i := 0; i < 8; i++ {
keyBytes = append(keyBytes, byte(addr>>(i*8)))
}
// Pre-size the key buffer and write pointers with PutUint64 instead of
// appending byte-by-byte, avoiding 8 append calls and bounds checks per state.
needed := len(sl.sortBuf) * 8
if cap(sl.keyBuf) < needed {
sl.keyBuf = make([]byte, needed)
} else {
sl.keyBuf = sl.keyBuf[:needed]
}
for i, state := range sl.sortBuf {
binary.LittleEndian.PutUint64(sl.keyBuf[i*8:], uint64(uintptr(unsafe.Pointer(state))))
}
key := string(keyBytes)

// either we have already seen this or not
list, exists := sl.lists[key]
if exists {
return list, sl.dfaStates[key], true
// string(sl.keyBuf) in a map lookup is optimized by the compiler to avoid allocation
if entry, exists := sl.entries[string(sl.keyBuf)]; exists {
return entry.states, entry.dfaState, true
}

// cache miss: allocate owned copies for the map
key := string(sl.keyBuf)
stored := make([]*faState, len(sl.sortBuf))
copy(stored, sl.sortBuf)

dfaState := &faState{table: newSmallTable()}
sl.lists[key] = uniques
sl.dfaStates[key] = dfaState
return uniques, dfaState, false
sl.entries[key] = internEntry{states: stored, dfaState: dfaState}
return stored, dfaState, false
}
37 changes: 37 additions & 0 deletions state_lists_bench_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
//go:build go1.24

package quamina

import (
"fmt"
"testing"
)

// BenchmarkNfa2Dfa measures the cost of nfa2Dfa conversion, where intern()
// in state_lists.go typically dominates. Patterns with more wildcards produce
// larger epsilon closures and more intern() calls.
func BenchmarkNfa2Dfa(b *testing.B) {
	patterns := []struct {
		name    string
		pattern string
	}{
		{"single_star", "*foo*"},
		{"two_stars", "*foo*bar*"},
		{"three_stars", "*a*b*c*"},
		{"five_stars", "*a*b*c*d*e*"},
		{"eight_stars", "*a*b*c*d*e*f*g*h*"},
	}

	pp := newPrettyPrinter(12345)
	for _, tc := range patterns {
		b.Run(tc.name, func(b *testing.B) {
			// Build the NFA once per sub-benchmark; fail loudly rather than
			// benchmarking a nil automaton if pattern construction breaks.
			nfa, err := makeShellStyleFA([]byte(fmt.Sprintf(`"%s"`, tc.pattern)), pp)
			if err != nil {
				b.Fatalf("makeShellStyleFA(%q): %v", tc.pattern, err)
			}
			epsilonClosure(nfa)
			b.ReportAllocs()
			// b.Loop (Go 1.24+) excludes the setup above from the timed
			// region automatically, so no explicit b.ResetTimer is needed.
			for b.Loop() {
				nfa2Dfa(nfa)
			}
		})
	}
}
Loading