github.com/pgavlin/text@v0.0.0-20240419000839-8438d0a47805/search.go (about)

     1  // Copyright 2012 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package text
     6  
     7  // stringFinder efficiently finds strings in a source text. It's implemented
     8  // using the Boyer-Moore string search algorithm:
     9  // https://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
    10  // https://www.cs.utexas.edu/~moore/publications/fstrpos.pdf (note: this aged
    11  // document uses 1-based indexing)
    12  type stringFinder[S String] struct {
    13  	// pattern is the string that we are searching for in the text.
    14  	pattern S
    15  
    16  	// badCharSkip[b] contains the distance between the last byte of pattern
    17  	// and the rightmost occurrence of b in pattern. If b is not in pattern,
    18  	// badCharSkip[b] is len(pattern).
    19  	//
    20  	// Whenever a mismatch is found with byte b in the text, we can safely
    21  	// shift the matching frame at least badCharSkip[b] until the next time
    22  	// the matching char could be in alignment.
    23  	badCharSkip [256]int
    24  
    25  	// goodSuffixSkip[i] defines how far we can shift the matching frame given
    26  	// that the suffix pattern[i+1:] matches, but the byte pattern[i] does
    27  	// not. There are two cases to consider:
    28  	//
    29  	// 1. The matched suffix occurs elsewhere in pattern (with a different
    30  	// byte preceding it that we might possibly match). In this case, we can
    31  	// shift the matching frame to align with the next suffix chunk. For
    32  	// example, the pattern "mississi" has the suffix "issi" next occurring
    33  	// (in right-to-left order) at index 1, so goodSuffixSkip[3] ==
    34  	// shift+len(suffix) == 3+4 == 7.
    35  	//
    36  	// 2. If the matched suffix does not occur elsewhere in pattern, then the
    37  	// matching frame may share part of its prefix with the end of the
    38  	// matching suffix. In this case, goodSuffixSkip[i] will contain how far
    39  	// to shift the frame to align this portion of the prefix to the
    40  	// suffix. For example, in the pattern "abcxxxabc", when the first
    41  	// mismatch from the back is found to be in position 3, the matching
    42  	// suffix "xxabc" is not found elsewhere in the pattern. However, its
    43  	// rightmost "abc" (at position 6) is a prefix of the whole pattern, so
    44  	// goodSuffixSkip[3] == shift+len(suffix) == 6+5 == 11.
    45  	goodSuffixSkip []int
    46  }
    47  
    48  func makeStringFinder[S String](pattern S) *stringFinder[S] {
    49  	f := &stringFinder[S]{
    50  		pattern:        pattern,
    51  		goodSuffixSkip: make([]int, len(pattern)),
    52  	}
    53  	// last is the index of the last character in the pattern.
    54  	last := len(pattern) - 1
    55  
    56  	// Build bad character table.
    57  	// Bytes not in the pattern can skip one pattern's length.
    58  	for i := range f.badCharSkip {
    59  		f.badCharSkip[i] = len(pattern)
    60  	}
    61  	// The loop condition is < instead of <= so that the last byte does not
    62  	// have a zero distance to itself. Finding this byte out of place implies
    63  	// that it is not in the last position.
    64  	for i := 0; i < last; i++ {
    65  		f.badCharSkip[pattern[i]] = last - i
    66  	}
    67  
    68  	// Build good suffix table.
    69  	// First pass: set each value to the next index which starts a prefix of
    70  	// pattern.
    71  	lastPrefix := last
    72  	for i := last; i >= 0; i-- {
    73  		if HasPrefix(pattern, pattern[i+1:]) {
    74  			lastPrefix = i + 1
    75  		}
    76  		// lastPrefix is the shift, and (last-i) is len(suffix).
    77  		f.goodSuffixSkip[i] = lastPrefix + last - i
    78  	}
    79  	// Second pass: find repeats of pattern's suffix starting from the front.
    80  	for i := 0; i < last; i++ {
    81  		lenSuffix := longestCommonSuffix(pattern, pattern[1:i+1])
    82  		if pattern[i-lenSuffix] != pattern[last-lenSuffix] {
    83  			// (last-i) is the shift, and lenSuffix is len(suffix).
    84  			f.goodSuffixSkip[last-lenSuffix] = lenSuffix + last - i
    85  		}
    86  	}
    87  
    88  	return f
    89  }
    90  
    91  func longestCommonSuffix[S1, S2 String](a S1, b S2) (i int) {
    92  	for ; i < len(a) && i < len(b); i++ {
    93  		if a[len(a)-1-i] != b[len(b)-1-i] {
    94  			break
    95  		}
    96  	}
    97  	return
    98  }
    99  
   100  // next returns the index in text of the first occurrence of the pattern. If
   101  // the pattern is not found, it returns -1.
   102  func (f *stringFinder[S]) next(text S) int {
   103  	i := len(f.pattern) - 1
   104  	for i < len(text) {
   105  		// Compare backwards from the end until the first unmatching character.
   106  		j := len(f.pattern) - 1
   107  		for j >= 0 && text[i] == f.pattern[j] {
   108  			i--
   109  			j--
   110  		}
   111  		if j < 0 {
   112  			return i + 1 // match
   113  		}
   114  		i += max(f.badCharSkip[text[i]], f.goodSuffixSkip[j])
   115  	}
   116  	return -1
   117  }
   118  
   119  func max(a, b int) int {
   120  	if a > b {
   121  		return a
   122  	}
   123  	return b
   124  }