From 844ee469b1646c32c9ee3bb4f2d0af3cd3a1e593 Mon Sep 17 00:00:00 2001 From: Robert Sayre Date: Sat, 28 Feb 2026 12:40:32 -0800 Subject: [PATCH 1/7] Optimize intern() to eliminate per-call allocations Replace the dedup map with a generation counter (reusing faState.closureSetGen), and add reusable scratch buffers for the sorted-uniques slice and key bytes. On cache hits the compiler's string(bytes) map-lookup optimization avoids the key allocation entirely. Only cache misses allocate (key string + stored slice). Reduces nfa2Dfa allocations by ~35-40% and wall time by ~12%. Co-Authored-By: Claude Opus 4.6 --- state_lists.go | 49 +++++++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/state_lists.go b/state_lists.go index d7e5f88..2859c26 100644 --- a/state_lists.go +++ b/state_lists.go @@ -12,6 +12,9 @@ import ( type stateLists struct { lists map[string][]*faState dfaStates map[string]*faState + // Scratch space reused across intern() calls + sortBuf []*faState // reusable sorted buffer + keyBuf []byte // reusable key bytes buffer } func newStateLists() *stateLists { @@ -27,40 +30,42 @@ func newStateLists() *stateLists { // which either has already been computed for the set or is created and empty, and // a boolean indicating whether the DFA state has already been computed or not. func (sl *stateLists) intern(list []*faState) ([]*faState, *faState, bool) { - // dedupe the collection - uniquemap := make(map[*faState]bool) + // dedupe using generation counter instead of a map + closureGeneration++ + gen := closureGeneration + sl.sortBuf = sl.sortBuf[:0] for _, state := range list { - uniquemap[state] = true - } - uniques := make([]*faState, 0, len(uniquemap)) - for unique := range uniquemap { - uniques = append(uniques, unique) + if state.closureSetGen != gen { + state.closureSetGen = gen + sl.sortBuf = append(sl.sortBuf, state) + } } - // compute a key representing the set. Disclosure: My first use of an AI to help - // code. I had done this by Sprintf("%p")-ing the addresses and sorting/concatenating - // the strings. Which works fine but grabbing the raw bytes and pretending they're - // a string is going to produce keys that are exactly half the size - keyBytes := make([]byte, 0, len(uniques)*8) - slices.SortFunc(uniques, func(a, b *faState) int { + // compute a key representing the set + slices.SortFunc(sl.sortBuf, func(a, b *faState) int { return cmp.Compare(uintptr(unsafe.Pointer(a)), uintptr(unsafe.Pointer(b))) }) - for _, state := range uniques { + sl.keyBuf = sl.keyBuf[:0] + for _, state := range sl.sortBuf { addr := uintptr(unsafe.Pointer(state)) for i := 0; i < 8; i++ { - keyBytes = append(keyBytes, byte(addr>>(i*8))) + sl.keyBuf = append(sl.keyBuf, byte(addr>>(i*8))) } } - key := string(keyBytes) - // either we have already seen this or not - list, exists := sl.lists[key] - if exists { - return list, sl.dfaStates[key], true + // string(sl.keyBuf) in a map lookup is optimized by the compiler to avoid allocation + if list, exists := sl.lists[string(sl.keyBuf)]; exists { + return list, sl.dfaStates[string(sl.keyBuf)], true } + + // cache miss: allocate owned copies for the map + key := string(sl.keyBuf) + stored := make([]*faState, len(sl.sortBuf)) + copy(stored, sl.sortBuf) + dfaState := &faState{table: newSmallTable()} - sl.lists[key] = uniques + sl.lists[key] = stored sl.dfaStates[key] = dfaState - return uniques, dfaState, false + return stored, dfaState, false } From dccf1baee6b1c3ea893941dfe9f07b46b94e288d Mon Sep 17 00:00:00 2001 From: Robert Sayre Date: Sat, 28 Feb 2026 12:47:47 -0800 Subject: [PATCH 2/7] Batch unpack/pack in n2dNode for 5-9x nfa2Dfa speedup Instead of calling addByteStep (unpack, set one byte, repack) for each of up to 256 byte values, unpack the DFA table once, set all transitions into the unpacked table, then pack once at the end. Also adds BenchmarkNfa2Dfa to measure the nfa2Dfa conversion cost across patterns with varying wildcard counts. Co-Authored-By: Claude Opus 4.6 --- nfa.go | 6 ++++-- state_lists_bench_test.go | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) create mode 100644 state_lists_bench_test.go diff --git a/nfa.go b/nfa.go index b6ed6fa..ae84c26 100644 --- a/nfa.go +++ b/nfa.go @@ -193,7 +193,8 @@ func n2dNode(rawNStates []*faState, sList *stateLists) *faState { nUnpacked[i] = unpackTable(nState.table) } - // for each byte value + // unpack the DFA table once, set all byte transitions, then pack once + dfaUnpacked := unpackTable(dfaState.table) for utf8byte := 0; utf8byte < byteCeiling; utf8byte++ { var rawStates []*faState @@ -208,9 +209,10 @@ func n2dNode(rawNStates []*faState, sList *stateLists) *faState { // if there were any transitions on this byte value if len(rawStates) > 0 { // recurse, get the DFA state for the transitions and plug it into this state - dfaState.table.addByteStep(byte(utf8byte), n2dNode(rawStates, sList)) + dfaUnpacked[utf8byte] = n2dNode(rawStates, sList) } } + dfaState.table.pack(dfaUnpacked) // load up transitions (build-time, allocation is fine) seen := make(map[*fieldMatcher]bool) diff --git a/state_lists_bench_test.go b/state_lists_bench_test.go new file mode 100644 index 0000000..6d15b35 --- /dev/null +++ b/state_lists_bench_test.go @@ -0,0 +1,37 @@ +//go:build go1.24 + +package quamina + +import ( + "fmt" + "testing" +) + +// BenchmarkNfa2Dfa measures the cost of nfa2Dfa conversion, where intern() +// in state_lists.go typically dominates. Patterns with more wildcards produce +// larger epsilon closures and more intern() calls. +func BenchmarkNfa2Dfa(b *testing.B) { + patterns := []struct { + name string + pattern string + }{ + {"single_star", "*foo*"}, + {"two_stars", "*foo*bar*"}, + {"three_stars", "*a*b*c*"}, + {"five_stars", "*a*b*c*d*e*"}, + {"eight_stars", "*a*b*c*d*e*f*g*h*"}, + } + + pp := newPrettyPrinter(12345) + for _, tc := range patterns { + b.Run(tc.name, func(b *testing.B) { + nfa, _ := makeShellStyleFA([]byte(fmt.Sprintf(`"%s"`, tc.pattern)), pp) + epsilonClosure(nfa) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + nfa2Dfa(nfa) + } + }) + } +} From 288402dd9abbf83b46a6658a291b97807c22e76d Mon Sep 17 00:00:00 2001 From: Robert Sayre Date: Sat, 28 Feb 2026 12:55:23 -0800 Subject: [PATCH 3/7] Use PutUint64 for intern() key encoding Replace the byte-by-byte append loop (8 appends per state, each with bounds checks) with a pre-sized buffer and a single binary.LittleEndian.PutUint64 per state. ~20% faster in nfa2Dfa. Co-Authored-By: Claude Opus 4.6 --- state_lists.go | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/state_lists.go b/state_lists.go index 2859c26..5491c0e 100644 --- a/state_lists.go +++ b/state_lists.go @@ -2,6 +2,7 @@ package quamina import ( "cmp" + "encoding/binary" "slices" "unsafe" ) @@ -46,12 +47,14 @@ func (sl *stateLists) intern(list []*faState) ([]*faState, *faState, bool) { return cmp.Compare(uintptr(unsafe.Pointer(a)), uintptr(unsafe.Pointer(b))) }) - sl.keyBuf = sl.keyBuf[:0] - for _, state := range sl.sortBuf { - addr := uintptr(unsafe.Pointer(state)) - for i := 0; i < 8; i++ { - sl.keyBuf = append(sl.keyBuf, byte(addr>>(i*8))) - } + needed := len(sl.sortBuf) * 8 + if cap(sl.keyBuf) < needed { + sl.keyBuf = make([]byte, needed) + } else { + sl.keyBuf = sl.keyBuf[:needed] + } + for i, state := range sl.sortBuf { + binary.LittleEndian.PutUint64(sl.keyBuf[i*8:], uint64(uintptr(unsafe.Pointer(state)))) } // string(sl.keyBuf) in a map lookup is optimized by the compiler to avoid allocation From b04a2f5260a39f1a5c53ab4bd8b6494112b2e73c Mon Sep 17 00:00:00 2001 From: Robert Sayre Date: Sat, 28 Feb 2026 13:00:05 -0800 Subject: [PATCH 4/7] Merge intern() maps into single entries map Replace separate lists and dfaStates maps with a single map of internEntry structs, eliminating the second map lookup on cache hits. ~9-18% faster in nfa2Dfa. Co-Authored-By: Claude Opus 4.6 --- state_lists.go | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/state_lists.go b/state_lists.go index 5491c0e..aaf9e76 100644 --- a/state_lists.go +++ b/state_lists.go @@ -7,12 +7,16 @@ import ( "unsafe" ) +type internEntry struct { + states []*faState + dfaState *faState +} + // The idea is that in we are going to be computing the epsilon closures of NFA states, which // will be slices of states. There will be duplicate slices and we want to deduplicate. There's // probably a more idiomatic and efficient way to do this. type stateLists struct { - lists map[string][]*faState - dfaStates map[string]*faState + entries map[string]internEntry // Scratch space reused across intern() calls sortBuf []*faState // reusable sorted buffer keyBuf []byte // reusable key bytes buffer @@ -20,8 +24,7 @@ type stateLists struct { func newStateLists() *stateLists { return &stateLists{ - lists: make(map[string][]*faState), - dfaStates: make(map[string]*faState), + entries: make(map[string]internEntry), } } @@ -58,8 +61,8 @@ func (sl *stateLists) intern(list []*faState) ([]*faState, *faState, bool) { } // string(sl.keyBuf) in a map lookup is optimized by the compiler to avoid allocation - if list, exists := sl.lists[string(sl.keyBuf)]; exists { - return list, sl.dfaStates[string(sl.keyBuf)], true + if entry, exists := sl.entries[string(sl.keyBuf)]; exists { + return entry.states, entry.dfaState, true } // cache miss: allocate owned copies for the map @@ -68,7 +71,6 @@ func (sl *stateLists) intern(list []*faState) ([]*faState, *faState, bool) { copy(stored, sl.sortBuf) dfaState := &faState{table: newSmallTable()} - sl.lists[key] = stored - sl.dfaStates[key] = dfaState + sl.entries[key] = internEntry{states: stored, dfaState: dfaState} return stored, dfaState, false } From ad771da5a8e083e6bd676113c5839313157039bb Mon Sep 17 00:00:00 2001 From: Robert Sayre Date: Sat, 28 Feb 2026 13:04:40 -0800 Subject: [PATCH 5/7] Reuse rawStates slice across byte loop in n2dNode Hoist the rawStates slice above the 256-iteration byte loop and reset with [:0] each iteration instead of allocating a new slice. Eliminates ~95% of nfa2Dfa allocations, ~35-48% faster. Co-Authored-By: Claude Opus 4.6 --- nfa.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nfa.go b/nfa.go index ae84c26..9e7fffd 100644 --- a/nfa.go +++ b/nfa.go @@ -195,8 +195,9 @@ func n2dNode(rawNStates []*faState, sList *stateLists) *faState { // unpack the DFA table once, set all byte transitions, then pack once dfaUnpacked := unpackTable(dfaState.table) + rawStates := make([]*faState, 0, len(ingredients)) for utf8byte := 0; utf8byte < byteCeiling; utf8byte++ { - var rawStates []*faState + rawStates = rawStates[:0] // for each of the unique states for _, unpackedNState := range nUnpacked { From 137fe99616e1fb764cc9732378b2b16dcb1f51bc Mon Sep 17 00:00:00 2001 From: Robert Sayre Date: Sat, 28 Feb 2026 13:06:40 -0800 Subject: [PATCH 6/7] Add multi-star test cases to TestNfa2Dfa Cover the same patterns used in BenchmarkNfa2Dfa to verify correctness of the optimized intern() and n2dNode paths with larger epsilon closures and heavier dedup. Co-Authored-By: Claude Opus 4.6 --- nfa_test.go | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/nfa_test.go b/nfa_test.go index 32e4259..88d92f4 100644 --- a/nfa_test.go +++ b/nfa_test.go @@ -111,6 +111,32 @@ func TestNfa2Dfa(t *testing.T) { shoulds: []string{"abc", "abcfoo"}, nopes: []string{"xabc", "abxbar"}, }, + // multi-star patterns exercise intern() dedup more heavily + { + pattern: "*foo*", + shoulds: []string{"foo", "xfoo", "foox", "xfoox", "foofoofoo"}, + nopes: []string{"bar", "fo", "ffo"}, + }, + { + pattern: "*foo*bar*", + shoulds: []string{"foobar", "xfooybar", "foobarbaz", "xxfooxxbarxx"}, + nopes: []string{"barfoo", "foo", "bar", "fobar"}, + }, + { + pattern: "*a*b*c*", + shoulds: []string{"abc", "xaxbxcx", "abc123", "123abc", "aabbcc"}, + nopes: []string{"ab", "ac", "bc", "cba"}, + }, + { + pattern: "*a*b*c*d*e*", + shoulds: []string{"abcde", "xaxbxcxdxex", "aabbccddee"}, + nopes: []string{"abcd", "edcba", "abce"}, + }, + { + pattern: "*a*b*c*d*e*f*g*h*", + shoulds: []string{"abcdefgh", "xaxbxcxdxexfxgxhx"}, + nopes: []string{"abcdefg", "hgfedcba"}, + }, } pp := newPrettyPrinter(4567) transitions := []*fieldMatcher{} From c2f9c6194cc67934af5cdcab4628cc1f2fe980b0 Mon Sep 17 00:00:00 2001 From: Robert Sayre Date: Sat, 28 Feb 2026 13:19:35 -0800 Subject: [PATCH 7/7] Add inline comments explaining each optimization Co-Authored-By: Claude Opus 4.6 --- nfa.go | 5 ++++- state_lists.go | 8 +++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/nfa.go b/nfa.go index 9e7fffd..7fa5800 100644 --- a/nfa.go +++ b/nfa.go @@ -193,7 +193,10 @@ func n2dNode(rawNStates []*faState, sList *stateLists) *faState { nUnpacked[i] = unpackTable(nState.table) } - // unpack the DFA table once, set all byte transitions, then pack once + // Unpack the DFA table once, set all byte transitions, then pack once — + // the old code called addByteStep per byte which unpacked and repacked + // for each of up to 256 values. rawStates is allocated once and reset + // with [:0] each iteration to avoid per-byte-value slice allocation. dfaUnpacked := unpackTable(dfaState.table) rawStates := make([]*faState, 0, len(ingredients)) for utf8byte := 0; utf8byte < byteCeiling; utf8byte++ { diff --git a/state_lists.go b/state_lists.go index aaf9e76..54c664e 100644 --- a/state_lists.go +++ b/state_lists.go @@ -7,6 +7,8 @@ import ( "unsafe" ) +// internEntry bundles the list and DFA state into one map value so that +// cache hits require a single map lookup instead of two. type internEntry struct { states []*faState dfaState *faState @@ -34,7 +36,9 @@ func newStateLists() *stateLists { // which either has already been computed for the set or is created and empty, and // a boolean indicating whether the DFA state has already been computed or not. func (sl *stateLists) intern(list []*faState) ([]*faState, *faState, bool) { - // dedupe using generation counter instead of a map + // Dedupe using the global generation counter and faState.closureSetGen + // instead of allocating a map per call. Safe to reuse closureSetGen + // because nfa2Dfa runs after epsilon closure computation is complete. closureGeneration++ gen := closureGeneration sl.sortBuf = sl.sortBuf[:0] @@ -50,6 +54,8 @@ func (sl *stateLists) intern(list []*faState) ([]*faState, *faState, bool) { return cmp.Compare(uintptr(unsafe.Pointer(a)), uintptr(unsafe.Pointer(b))) }) + // Pre-size the key buffer and write pointers with PutUint64 instead of + // appending byte-by-byte, avoiding 8 append calls and bounds checks per state. needed := len(sl.sortBuf) * 8 if cap(sl.keyBuf) < needed { sl.keyBuf = make([]byte, needed)