github.com/scottcagno/storage@v1.8.0/pkg/search/boyer-moore.go (about)

     1  package search
     2  
     3  import (
     4  	"bytes"
     5  	"strings"
     6  )
     7  
     8  type BoyerMoore struct{}
     9  
    10  func NewBoyerMoore() *BoyerMoore {
    11  	return new(BoyerMoore)
    12  }
    13  
    14  func (bm *BoyerMoore) String() string {
    15  	return "BOYER-MOORE"
    16  }
    17  
    18  func (bm *BoyerMoore) FindIndex(text, pattern []byte) int {
    19  	if text == nil || pattern == nil {
    20  		return -1
    21  	}
    22  	return boyerMooreFinder(pattern, text)
    23  }
    24  
    25  func (bm *BoyerMoore) FindIndexString(text, pattern string) int {
    26  	return boyerMooreFinderString(pattern, text)
    27  }
    28  
    29  func boyerMooreFinderString(pattern, text string) int {
    30  	return makeStringFinder(pattern).next(text)
    31  }
    32  
    33  // stringFinder efficiently finds strings in a source text. It's implemented using the Boyer-Moore string
    34  // search algorithm: https://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm and for more info,
    35  // check out https://www.cs.utexas.edu/~moore/publications/fstrpos.pdf ( this source below is adapted from
    36  // the golang sourcecode that can be found here: https://go.dev/src/strings/search.go )
    37  type stringFinder struct {
    38  	// pattern is the string that we are searching for in the text.
    39  	pattern string
    40  
    41  	// badCharSkip[b] contains the distance between the last byte of pattern
    42  	// and the rightmost occurrence of b in pattern. If b is not in pattern,
    43  	// badCharSkip[b] is len(pattern).
    44  	//
    45  	// Whenever a mismatch is found with byte b in the text, we can safely
    46  	// shift the matching frame at least badCharSkip[b] until the next time
    47  	// the matching char could be in alignment.
    48  	badCharSkip [256]int
    49  
    50  	// goodSuffixSkip[i] defines how far we can shift the matching frame given
    51  	// that the suffix pattern[i+1:] matches, but the byte pattern[i] does
    52  	// not. There are two cases to consider:
    53  	//
    54  	// 1. The matched suffix occurs elsewhere in pattern (with a different
    55  	// byte preceding it that we might possibly match). In this case, we can
    56  	// shift the matching frame to align with the next suffix chunk. For
    57  	// example, the pattern "mississi" has the suffix "issi" next occurring
    58  	// (in right-to-left order) at index 1, so goodSuffixSkip[3] ==
    59  	// shift+len(suffix) == 3+4 == 7.
    60  	//
    61  	// 2. If the matched suffix does not occur elsewhere in pattern, then the
    62  	// matching frame may share part of its prefix with the end of the
    63  	// matching suffix. In this case, goodSuffixSkip[i] will contain how far
    64  	// to shift the frame to align this portion of the prefix to the
    65  	// suffix. For example, in the pattern "abcxxxabc", when the first
    66  	// mismatch from the back is found to be in position 3, the matching
    67  	// suffix "xxabc" is not found elsewhere in the pattern. However, its
    68  	// rightmost "abc" (at position 6) is a prefix of the whole pattern, so
    69  	// goodSuffixSkip[3] == shift+len(suffix) == 6+5 == 11.
    70  	goodSuffixSkip []int
    71  }
    72  
    73  func makeStringFinder(pattern string) *stringFinder {
    74  	f := &stringFinder{
    75  		pattern:        pattern,
    76  		goodSuffixSkip: make([]int, len(pattern)),
    77  	}
    78  	// last is the index of the last character in the pattern.
    79  	last := len(pattern) - 1
    80  
    81  	// Build bad character table.
    82  	// Bytes not in the pattern can skip one pattern's length.
    83  	for i := range f.badCharSkip {
    84  		f.badCharSkip[i] = len(pattern)
    85  	}
    86  	// The loop condition is < instead of <= so that the last byte does not
    87  	// have a zero distance to itself. Finding this byte out of place implies
    88  	// that it is not in the last position.
    89  	for i := 0; i < last; i++ {
    90  		f.badCharSkip[pattern[i]] = last - i
    91  	}
    92  
    93  	// Build good suffix table.
    94  	// First pass: set each value to the next index which starts a prefix of
    95  	// pattern.
    96  	lastPrefix := last
    97  	for i := last; i >= 0; i-- {
    98  		if strings.HasPrefix(pattern, pattern[i+1:]) {
    99  			lastPrefix = i + 1
   100  		}
   101  		// lastPrefix is the shift, and (last-i) is len(suffix).
   102  		f.goodSuffixSkip[i] = lastPrefix + last - i
   103  	}
   104  	// Second pass: find repeats of pattern's suffix starting from the front.
   105  	for i := 0; i < last; i++ {
   106  		lenSuffix := longestCommonSuffixString(pattern, pattern[1:i+1])
   107  		if pattern[i-lenSuffix] != pattern[last-lenSuffix] {
   108  			// (last-i) is the shift, and lenSuffix is len(suffix).
   109  			f.goodSuffixSkip[last-lenSuffix] = lenSuffix + last - i
   110  		}
   111  	}
   112  
   113  	return f
   114  }
   115  
   116  func longestCommonSuffixString(a, b string) (i int) {
   117  	for ; i < len(a) && i < len(b); i++ {
   118  		if a[len(a)-1-i] != b[len(b)-1-i] {
   119  			break
   120  		}
   121  	}
   122  	return
   123  }
   124  
   125  // next returns the index in text of the first occurrence of the pattern. If
   126  // the pattern is not found, it returns -1.
   127  func (f *stringFinder) next(text string) int {
   128  	i := len(f.pattern) - 1
   129  	for i < len(text) {
   130  		// Compare backwards from the end until the first un-matching character.
   131  		j := len(f.pattern) - 1
   132  		for j >= 0 && text[i] == f.pattern[j] {
   133  			i--
   134  			j--
   135  		}
   136  		if j < 0 {
   137  			return i + 1 // match
   138  		}
   139  		i += max(f.badCharSkip[text[i]], f.goodSuffixSkip[j])
   140  	}
   141  	return -1
   142  }
   143  
   144  func boyerMooreFinder(pattern, text []byte) int {
   145  	return makeBytesFinder(pattern).next(text)
   146  }
   147  
   148  type bytesFinder struct {
   149  	pattern        []byte
   150  	badCharSkip    [256]int
   151  	goodSuffixSkip []int
   152  }
   153  
   154  func makeBytesFinder(pattern []byte) *bytesFinder {
   155  	f := &bytesFinder{
   156  		pattern:        pattern,
   157  		goodSuffixSkip: make([]int, len(pattern)),
   158  	}
   159  	// last is the index of the last character in the pattern.
   160  	last := len(pattern) - 1
   161  
   162  	// Build bad character table.
   163  	// Bytes not in the pattern can skip one pattern's length.
   164  	for i := range f.badCharSkip {
   165  		f.badCharSkip[i] = len(pattern)
   166  	}
   167  	// The loop condition is < instead of <= so that the last byte does not
   168  	// have a zero distance to itself. Finding this byte out of place implies
   169  	// that it is not in the last position.
   170  	for i := 0; i < last; i++ {
   171  		f.badCharSkip[pattern[i]] = last - i
   172  	}
   173  
   174  	// Build good suffix table.
   175  	// First pass: set each value to the next index which starts a prefix of
   176  	// pattern.
   177  	lastPrefix := last
   178  	for i := last; i >= 0; i-- {
   179  		if bytes.HasPrefix(pattern, pattern[i+1:]) {
   180  			lastPrefix = i + 1
   181  		}
   182  		// lastPrefix is the shift, and (last-i) is len(suffix).
   183  		f.goodSuffixSkip[i] = lastPrefix + last - i
   184  	}
   185  	// Second pass: find repeats of pattern's suffix starting from the front.
   186  	for i := 0; i < last; i++ {
   187  		lenSuffix := longestCommonSuffixBytes(pattern, pattern[1:i+1])
   188  		if pattern[i-lenSuffix] != pattern[last-lenSuffix] {
   189  			// (last-i) is the shift, and lenSuffix is len(suffix).
   190  			f.goodSuffixSkip[last-lenSuffix] = lenSuffix + last - i
   191  		}
   192  	}
   193  
   194  	return f
   195  }
   196  
   197  func longestCommonSuffixBytes(a, b []byte) (i int) {
   198  	for ; i < len(a) && i < len(b); i++ {
   199  		if a[len(a)-1-i] != b[len(b)-1-i] {
   200  			break
   201  		}
   202  	}
   203  	return
   204  }
   205  
   206  // next returns the index in text of the first occurrence of the pattern. If
   207  // the pattern is not found, it returns -1.
   208  func (f *bytesFinder) next(text []byte) int {
   209  	i := len(f.pattern) - 1
   210  	for i < len(text) {
   211  		// Compare backwards from the end until the first un-matching character.
   212  		j := len(f.pattern) - 1
   213  		for j >= 0 && text[i] == f.pattern[j] {
   214  			i--
   215  			j--
   216  		}
   217  		if j < 0 {
   218  			return i + 1 // match
   219  		}
   220  		i += max(f.badCharSkip[text[i]], f.goodSuffixSkip[j])
   221  	}
   222  	return -1
   223  }
   224  
   225  func max(a, b int) int {
   226  	if a > b {
   227  		return a
   228  	}
   229  	return b
   230  }