@@ -49,8 +49,8 @@ func calculateRatio(matches, length int) float64 {
4949
// listifyString returns a slice holding one single-byte slice per byte of
// str, in order. An empty or nil str yields an empty result.
//
// The elements are views into str, not copies, which avoids one allocation
// per byte. Each view is capacity-limited to its own byte (full slice
// expression), so appending to an element reallocates instead of silently
// overwriting the bytes that follow it in str. Writing through an element
// (lst[i][0] = x) still modifies str.
func listifyString(str []byte) (lst [][]byte) {
	lst = make([][]byte, len(str))
	for i := range str {
		lst[i] = str[i : i+1 : i+1]
	}
	return lst
}
@@ -84,62 +84,79 @@ type B2J struct {
8484 b [][]byte
8585}
8686
87- func newB2J (b [][]byte ) * B2J {
88- b2j := B2J {store : map [lineHash ] [][]int {}, b : b }
89- for lineno , line := range b {
90- h := _hash (line )
87+ type lineType int8
88+ const (
89+ lineNONE lineType = 0
90+ lineNORMAL lineType = 1
91+ lineJUNK lineType = - 1
92+ linePOPULAR lineType = - 2
93+ )
94+
95+ func (b2j * B2J ) _find (line * []byte ) (h lineHash , slotIndex int ,
96+ slot []int , lt lineType ) {
97+ h = _hash (* line )
98+ for slotIndex , slot = range b2j .store [h ] {
9199 // Thanks to the qualities of sha1, the probability of having more than
92100 // one line content with the same hash is very low. Nevertheless, store
93101 // each of them in a different slot, that we can differentiate by
94102 // looking at the line contents in the b slice.
95- for slotIndex , slot := range b2j .store [h ] {
96- if bytes .Equal (line , b [slot [0 ]]) {
97- // The content already has a slot in its hash bucket. Just
98- // append the newly seen index to the slice in that slot
99- b2j.store [h ][slotIndex ] = append (slot , lineno )
100- goto cont
103+ // In place of all the line numbers where the line appears, a slot can
104+ // also contain [lineno, -1] if b[lineno] is junk.
105+ if bytes .Equal (* line , b2j .b [slot [0 ]]) {
106+ // The content already has a slot in its hash bucket.
107+ if len (slot ) == 2 && slot [1 ] < 0 {
108+ lt = lineType (slot [1 ])
109+ } else {
110+ lt = lineNORMAL
101111 }
112+ return // every return variable has the correct value
102113 }
103- // The line content still has no slot. Create one with a single value.
104- b2j .store [h ] = append (b2j .store [h ], []int {lineno })
105- cont:
106114 }
107- return & b2j
115+ // The line content still has no slot.
116+ slotIndex = - 1
117+ slot = nil
118+ lt = lineNONE
119+ return
108120}
109121
110- func (b2j * B2J ) get (line []byte ) []int {
111- // Thanks to the qualities of sha1, there should be very few (zero or one)
112- // slots, so the following loop is fast.
113- for _ , slot := range b2j .store [_hash (line )] {
114- if bytes .Equal (line , b2j .b [slot [0 ]]) {
115- return slot
116- }
122+ func newB2J (b [][]byte , isJunk func ([]byte ) bool , autoJunk bool ) * B2J {
123+ b2j := B2J {store : map [lineHash ] [][]int {}, b : b }
124+ ntest := len (b )
125+ if autoJunk && ntest >= 200 {
126+ ntest = ntest / 100 + 1
117127 }
118- return []int {}
119- }
120-
121- func (b2j * B2J ) delete (line []byte ) {
122- h := _hash (line )
123- slots := b2j .store [h ]
124- for slotIndex , slot := range slots {
125- if bytes .Equal (line , b2j .b [slot [0 ]]) {
126- // Remove the whole slot from the list of slots
127- b2j .store [h ] = append (slots [:slotIndex ], slots [slotIndex + 1 :]... )
128- return
128+ for lineno , line := range b {
129+ h , slotIndex , slot , lt := b2j ._find (& line )
130+ switch lt {
131+ case lineNORMAL :
132+ if len (slot ) >= ntest {
133+ b2j.store [h ][slotIndex ] = []int {slot [0 ], int (linePOPULAR )}
134+ } else {
135+ b2j.store [h ][slotIndex ] = append (slot , lineno )
136+ }
137+ case lineNONE :
138+ if isJunk != nil && isJunk (line ) {
139+ b2j .store [h ] = append (b2j .store [h ], []int {lineno , int (lineJUNK )})
140+ } else {
141+ b2j .store [h ] = append (b2j .store [h ], []int {lineno })
142+ }
143+ default :
129144 }
130145 }
146+ return & b2j
131147}
132148
133- func (b2j * B2J ) deleteHash (h lineHash ) {
134- delete (b2j .store , h )
149+ func (b2j * B2J ) get (line []byte ) []int {
150+ _ , _ , slot , lt := b2j ._find (& line )
151+ if lt == lineNORMAL {
152+ return slot
153+ }
154+ return []int {}
135155}
136156
137- func (b2j * B2J ) iter (hook func ([]byte , []int )) {
138- for _ , slots := range b2j .store {
139- for _ , slot := range slots {
140- hook (b2j .b [slot [0 ]], slot )
141- }
142- }
157+ func (b2j * B2J ) isBJunk (line []byte ) bool {
158+ _ , _ , _ , lt := b2j ._find (& line )
159+ return lt == lineJUNK
143160}
144161
145162// SequenceMatcher compares sequence of strings. The basic
@@ -174,10 +191,8 @@ type SequenceMatcher struct {
174191 b2j B2J
175192 IsJunk func ([]byte ) bool
176193 autoJunk bool
177- bJunk map [lineHash ]struct {}
178194 matchingBlocks []Match
179195 fullBCount map [lineHash ]int
180- bPopular []int
181196 opCodes []OpCode
182197}
183198
@@ -234,45 +249,10 @@ func (m *SequenceMatcher) SetSeq2(b [][]byte) {
234249
235250func (m * SequenceMatcher ) chainB () {
236251 // Populate line -> index mapping
237- b2j := * newB2J (m .b )
238-
239- // Purge junk elements
240- m .bJunk = map [lineHash ]struct {}{}
241- if m .IsJunk != nil {
242- junk := m .bJunk
243- b2j .iter (func (s []byte , _ []int ){
244- if m .IsJunk (s ) {
245- junk [_hash (s )] = struct {}{}
246- }
247- })
248- for h , _ := range junk {
249- b2j .deleteHash (h )
250- }
251- }
252-
253- // Purge remaining popular elements
254- popular := []int {}
255- n := len (m .b )
256- if m .autoJunk && n >= 200 {
257- ntest := n / 100 + 1
258- b2j .iter (func (s []byte , indices []int ){
259- if len (indices ) > ntest {
260- popular = append (popular , indices [0 ])
261- }
262- })
263- for _ , i := range popular {
264- b2j .delete (m .b [i ])
265- }
266- }
267- m .bPopular = popular
252+ b2j := * newB2J (m .b , m .IsJunk , m .autoJunk )
268253 m .b2j = b2j
269254}
270255
271- func (m * SequenceMatcher ) isBJunk (s []byte ) bool {
272- _ , ok := m .bJunk [_hash (s )]
273- return ok
274- }
275-
276256// Find longest matching block in a[alo:ahi] and b[blo:bhi].
277257//
278258// If IsJunk is not defined:
@@ -340,12 +320,12 @@ func (m *SequenceMatcher) findLongestMatch(alo, ahi, blo, bhi int) Match {
340320 // "popular" non-junk elements aren't in b2j, which greatly speeds
341321 // the inner loop above, but also means "the best" match so far
342322 // doesn't contain any junk *or* popular non-junk elements.
343- for besti > alo && bestj > blo && ! m .isBJunk (m .b [bestj - 1 ]) &&
323+ for besti > alo && bestj > blo && ! m .b2j . isBJunk (m .b [bestj - 1 ]) &&
344324 bytes .Equal (m .a [besti - 1 ], m .b [bestj - 1 ]) {
345325 besti , bestj , bestsize = besti - 1 , bestj - 1 , bestsize + 1
346326 }
347327 for besti + bestsize < ahi && bestj + bestsize < bhi &&
348- ! m .isBJunk (m .b [bestj + bestsize ]) &&
328+ ! m .b2j . isBJunk (m .b [bestj + bestsize ]) &&
349329 bytes .Equal (m .a [besti + bestsize ], m .b [bestj + bestsize ]) {
350330 bestsize += 1
351331 }
@@ -357,12 +337,12 @@ func (m *SequenceMatcher) findLongestMatch(alo, ahi, blo, bhi int) Match {
357337 // figuring out what to do with it. In the case of an empty
358338 // interesting match, this is clearly the right thing to do,
359339 // because no other kind of match is possible in the regions.
360- for besti > alo && bestj > blo && m .isBJunk (m .b [bestj - 1 ]) &&
340+ for besti > alo && bestj > blo && m .b2j . isBJunk (m .b [bestj - 1 ]) &&
361341 bytes .Equal (m .a [besti - 1 ], m .b [bestj - 1 ]) {
362342 besti , bestj , bestsize = besti - 1 , bestj - 1 , bestsize + 1
363343 }
364344 for besti + bestsize < ahi && bestj + bestsize < bhi &&
365- m .isBJunk (m .b [bestj + bestsize ]) &&
345+ m .b2j . isBJunk (m .b [bestj + bestsize ]) &&
366346 bytes .Equal (m .a [besti + bestsize ], m .b [bestj + bestsize ]) {
367347 bestsize += 1
368348 }
0 commit comments