diff --git a/nfa.go b/nfa.go index b6ed6fa..7fa5800 100644 --- a/nfa.go +++ b/nfa.go @@ -193,9 +193,14 @@ func n2dNode(rawNStates []*faState, sList *stateLists) *faState { nUnpacked[i] = unpackTable(nState.table) } - // for each byte value + // Unpack the DFA table once, set all byte transitions, then pack once — + // the old code called addByteStep per byte which unpacked and repacked + // for each of up to 256 values. rawStates is allocated once and reset + // with [:0] each iteration to avoid per-byte-value slice allocation. + dfaUnpacked := unpackTable(dfaState.table) + rawStates := make([]*faState, 0, len(ingredients)) for utf8byte := 0; utf8byte < byteCeiling; utf8byte++ { - var rawStates []*faState + rawStates = rawStates[:0] // for each of the unique states for _, unpackedNState := range nUnpacked { @@ -208,9 +213,10 @@ func n2dNode(rawNStates []*faState, sList *stateLists) *faState { // if there were any transitions on this byte value if len(rawStates) > 0 { // recurse, get the DFA state for the transitions and plug it into this state - dfaState.table.addByteStep(byte(utf8byte), n2dNode(rawStates, sList)) + dfaUnpacked[utf8byte] = n2dNode(rawStates, sList) } } + dfaState.table.pack(dfaUnpacked) // load up transitions (build-time, allocation is fine) seen := make(map[*fieldMatcher]bool) diff --git a/nfa_test.go b/nfa_test.go index 32e4259..88d92f4 100644 --- a/nfa_test.go +++ b/nfa_test.go @@ -111,6 +111,32 @@ func TestNfa2Dfa(t *testing.T) { shoulds: []string{"abc", "abcfoo"}, nopes: []string{"xabc", "abxbar"}, }, + // multi-star patterns exercise intern() dedup more heavily + { + pattern: "*foo*", + shoulds: []string{"foo", "xfoo", "foox", "xfoox", "foofoofoo"}, + nopes: []string{"bar", "fo", "ffo"}, + }, + { + pattern: "*foo*bar*", + shoulds: []string{"foobar", "xfooybar", "foobarbaz", "xxfooxxbarxx"}, + nopes: []string{"barfoo", "foo", "bar", "fobar"}, + }, + { + pattern: "*a*b*c*", + shoulds: []string{"abc", "xaxbxcx", "abc123", "123abc", "aabbcc"}, + nopes: []string{"ab", "ac", "bc", "cba"}, + }, + { + pattern: "*a*b*c*d*e*", + shoulds: []string{"abcde", "xaxbxcxdxex", "aabbccddee"}, + nopes: []string{"abcd", "edcba", "abce"}, + }, + { + pattern: "*a*b*c*d*e*f*g*h*", + shoulds: []string{"abcdefgh", "xaxbxcxdxexfxgxhx"}, + nopes: []string{"abcdefg", "hgfedcba"}, + }, } pp := newPrettyPrinter(4567) transitions := []*fieldMatcher{} diff --git a/state_lists.go b/state_lists.go index d7e5f88..54c664e 100644 --- a/state_lists.go +++ b/state_lists.go @@ -2,22 +2,31 @@ package quamina import ( "cmp" + "encoding/binary" "slices" "unsafe" ) +// internEntry bundles the list and DFA state into one map value so that +// cache hits require a single map lookup instead of two. +type internEntry struct { + states []*faState + dfaState *faState +} + // The idea is that in we are going to be computing the epsilon closures of NFA states, which // will be slices of states. There will be duplicate slices and we want to deduplicate. There's // probably a more idiomatic and efficient way to do this. type stateLists struct { - lists map[string][]*faState - dfaStates map[string]*faState + entries map[string]internEntry + // Scratch space reused across intern() calls + sortBuf []*faState // reusable sorted buffer + keyBuf []byte // reusable key bytes buffer } func newStateLists() *stateLists { return &stateLists{ - lists: make(map[string][]*faState), - dfaStates: make(map[string]*faState), + entries: make(map[string]internEntry), } } @@ -27,40 +36,47 @@ func newStateLists() *stateLists { // which either has already been computed for the set or is created and empty, and // a boolean indicating whether the DFA state has already been computed or not. func (sl *stateLists) intern(list []*faState) ([]*faState, *faState, bool) { - // dedupe the collection - uniquemap := make(map[*faState]bool) + // Dedupe using the global generation counter and faState.closureSetGen + // instead of allocating a map per call. Safe to reuse closureSetGen + // because nfa2Dfa runs after epsilon closure computation is complete. + closureGeneration++ + gen := closureGeneration + sl.sortBuf = sl.sortBuf[:0] for _, state := range list { - uniquemap[state] = true - } - uniques := make([]*faState, 0, len(uniquemap)) - for unique := range uniquemap { - uniques = append(uniques, unique) + if state.closureSetGen != gen { + state.closureSetGen = gen + sl.sortBuf = append(sl.sortBuf, state) + } } - // compute a key representing the set. Disclosure: My first use of an AI to help - // code. I had done this by Sprintf("%p")-ing the addresses and sorting/concatenating - // the strings. Which works fine but grabbing the raw bytes and pretending they're - // a string is going to produce keys that are exactly half the size - keyBytes := make([]byte, 0, len(uniques)*8) - slices.SortFunc(uniques, func(a, b *faState) int { + // compute a key representing the set + slices.SortFunc(sl.sortBuf, func(a, b *faState) int { return cmp.Compare(uintptr(unsafe.Pointer(a)), uintptr(unsafe.Pointer(b))) }) - for _, state := range uniques { - addr := uintptr(unsafe.Pointer(state)) - for i := 0; i < 8; i++ { - keyBytes = append(keyBytes, byte(addr>>(i*8))) - } + // Pre-size the key buffer and write pointers with PutUint64 instead of + // appending byte-by-byte, avoiding 8 append calls and bounds checks per state. + needed := len(sl.sortBuf) * 8 + if cap(sl.keyBuf) < needed { + sl.keyBuf = make([]byte, needed) + } else { + sl.keyBuf = sl.keyBuf[:needed] + } + for i, state := range sl.sortBuf { + binary.LittleEndian.PutUint64(sl.keyBuf[i*8:], uint64(uintptr(unsafe.Pointer(state)))) } - key := string(keyBytes) - // either we have already seen this or not - list, exists := sl.lists[key] - if exists { - return list, sl.dfaStates[key], true + // string(sl.keyBuf) in a map lookup is optimized by the compiler to avoid allocation + if entry, exists := sl.entries[string(sl.keyBuf)]; exists { + return entry.states, entry.dfaState, true } + + // cache miss: allocate owned copies for the map + key := string(sl.keyBuf) + stored := make([]*faState, len(sl.sortBuf)) + copy(stored, sl.sortBuf) + dfaState := &faState{table: newSmallTable()} - sl.lists[key] = uniques - sl.dfaStates[key] = dfaState - return uniques, dfaState, false + sl.entries[key] = internEntry{states: stored, dfaState: dfaState} + return stored, dfaState, false } diff --git a/state_lists_bench_test.go b/state_lists_bench_test.go new file mode 100644 index 0000000..6d15b35 --- /dev/null +++ b/state_lists_bench_test.go @@ -0,0 +1,37 @@ +//go:build go1.24 + +package quamina + +import ( + "fmt" + "testing" +) + +// BenchmarkNfa2Dfa measures the cost of nfa2Dfa conversion, where intern() +// in state_lists.go typically dominates. Patterns with more wildcards produce +// larger epsilon closures and more intern() calls. +func BenchmarkNfa2Dfa(b *testing.B) { + patterns := []struct { + name string + pattern string + }{ + {"single_star", "*foo*"}, + {"two_stars", "*foo*bar*"}, + {"three_stars", "*a*b*c*"}, + {"five_stars", "*a*b*c*d*e*"}, + {"eight_stars", "*a*b*c*d*e*f*g*h*"}, + } + + pp := newPrettyPrinter(12345) + for _, tc := range patterns { + b.Run(tc.name, func(b *testing.B) { + nfa, _ := makeShellStyleFA([]byte(fmt.Sprintf(`"%s"`, tc.pattern)), pp) + epsilonClosure(nfa) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + nfa2Dfa(nfa) + } + }) + } +}