// BoyerMoore is a stateless searcher that locates a pattern inside a text
// using the Boyer-Moore string search algorithm.
type BoyerMoore struct{}

// NewBoyerMoore returns a new BoyerMoore searcher.
func NewBoyerMoore() *BoyerMoore {
	return new(BoyerMoore)
}

// String returns the name of the algorithm, "BOYER-MOORE".
func (bm *BoyerMoore) String() string {
	return "BOYER-MOORE"
}

// FindIndex returns the index in text of the first occurrence of pattern, or
// -1 if pattern is not found. A nil text or a nil pattern always yields -1;
// a non-nil empty pattern matches at index 0.
func (bm *BoyerMoore) FindIndex(text, pattern []byte) int {
	if text == nil || pattern == nil {
		return -1
	}
	return boyerMooreFinder(pattern, text)
}

// FindIndexString returns the index in text of the first occurrence of
// pattern, or -1 if pattern is not found. An empty pattern matches at index 0.
func (bm *BoyerMoore) FindIndexString(text, pattern string) int {
	return boyerMooreFinderString(pattern, text)
}

// boyerMooreFinderString builds a one-shot stringFinder for pattern and
// returns the index of its first occurrence in text, or -1.
func boyerMooreFinderString(pattern, text string) int {
	return makeStringFinder(pattern).next(text)
}

// stringFinder efficiently finds strings in a source text. It's implemented
// using the Boyer-Moore string search algorithm:
// https://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm and for
// more info, check out
// https://www.cs.utexas.edu/~moore/publications/fstrpos.pdf (this source is
// adapted from the Go source code that can be found here:
// https://go.dev/src/strings/search.go).
type stringFinder struct {
	// pattern is the string that we are searching for in the text.
	pattern string

	// badCharSkip[b] contains the distance between the last byte of pattern
	// and the rightmost occurrence of b in pattern. If b is not in pattern,
	// badCharSkip[b] is len(pattern).
	//
	// Whenever a mismatch is found with byte b in the text, we can safely
	// shift the matching frame at least badCharSkip[b] until the next time
	// the matching char could be in alignment.
	badCharSkip [256]int

	// goodSuffixSkip[i] defines how far we can shift the matching frame given
	// that the suffix pattern[i+1:] matches, but the byte pattern[i] does
	// not. There are two cases to consider:
	//
	// 1. The matched suffix occurs elsewhere in pattern (with a different
	// byte preceding it that we might possibly match). In this case, we can
	// shift the matching frame to align with the next suffix chunk. For
	// example, the pattern "mississi" has the suffix "issi" next occurring
	// (in right-to-left order) at index 1, so goodSuffixSkip[3] ==
	// shift+len(suffix) == 3+4 == 7.
	//
	// 2. If the matched suffix does not occur elsewhere in pattern, then the
	// matching frame may share part of its prefix with the end of the
	// matching suffix. In this case, goodSuffixSkip[i] will contain how far
	// to shift the frame to align this portion of the prefix to the
	// suffix. For example, in the pattern "abcxxxabc", when the first
	// mismatch from the back is found to be in position 3, the matching
	// suffix "xxabc" is not found elsewhere in the pattern. However, its
	// rightmost "abc" (at position 6) is a prefix of the whole pattern, so
	// goodSuffixSkip[3] == shift+len(suffix) == 6+5 == 11.
	goodSuffixSkip []int
}

// makeStringFinder precomputes the bad-character and good-suffix skip tables
// for pattern and returns a finder ready to search texts for it.
func makeStringFinder(pattern string) *stringFinder {
	f := &stringFinder{
		pattern:        pattern,
		goodSuffixSkip: make([]int, len(pattern)),
	}
	// last is the index of the last character in the pattern.
	last := len(pattern) - 1

	// Build bad character table.
	// Bytes not in the pattern can skip one pattern's length.
	for i := range f.badCharSkip {
		f.badCharSkip[i] = len(pattern)
	}
	// The loop condition is < instead of <= so that the last byte does not
	// have a zero distance to itself. Finding this byte out of place implies
	// that it is not in the last position.
	for i := 0; i < last; i++ {
		f.badCharSkip[pattern[i]] = last - i
	}

	// Build good suffix table.
	// First pass: set each value to the next index which starts a prefix of
	// pattern.
	lastPrefix := last
	for i := last; i >= 0; i-- {
		if strings.HasPrefix(pattern, pattern[i+1:]) {
			lastPrefix = i + 1
		}
		// lastPrefix is the shift, and (last-i) is len(suffix).
		f.goodSuffixSkip[i] = lastPrefix + last - i
	}
	// Second pass: find repeats of pattern's suffix starting from the front.
	for i := 0; i < last; i++ {
		lenSuffix := longestCommonSuffixString(pattern, pattern[1:i+1])
		if pattern[i-lenSuffix] != pattern[last-lenSuffix] {
			// (last-i) is the shift, and lenSuffix is len(suffix).
			f.goodSuffixSkip[last-lenSuffix] = lenSuffix + last - i
		}
	}

	return f
}

// longestCommonSuffixString returns the number of trailing bytes that a and b
// have in common.
func longestCommonSuffixString(a, b string) int {
	n := 0
	for n < len(a) && n < len(b) && a[len(a)-1-n] == b[len(b)-1-n] {
		n++
	}
	return n
}

// next returns the index in text of the first occurrence of the pattern. If
// the pattern is not found, it returns -1.
func (f *stringFinder) next(text string) int {
	i := len(f.pattern) - 1
	for i < len(text) {
		// Compare backwards from the end until the first un-matching character.
		j := len(f.pattern) - 1
		for j >= 0 && text[i] == f.pattern[j] {
			i--
			j--
		}
		if j < 0 {
			return i + 1 // match
		}
		// Shift by whichever table allows the larger safe jump.
		i += max(f.badCharSkip[text[i]], f.goodSuffixSkip[j])
	}
	return -1
}

// boyerMooreFinder builds a one-shot bytesFinder for pattern and returns the
// index of its first occurrence in text, or -1.
func boyerMooreFinder(pattern, text []byte) int {
	return makeBytesFinder(pattern).next(text)
}

// bytesFinder is the []byte counterpart of stringFinder; its skip tables are
// built and used the same way.
type bytesFinder struct {
	// pattern is the byte sequence that we are searching for in the text.
	pattern []byte

	// badCharSkip[b] is the distance between the last byte of pattern and the
	// rightmost occurrence of b in pattern, or len(pattern) if b is absent.
	badCharSkip [256]int

	// goodSuffixSkip[i] is how far the matching frame can shift when the
	// suffix pattern[i+1:] matched but pattern[i] did not.
	goodSuffixSkip []int
}

// makeBytesFinder precomputes the bad-character and good-suffix skip tables
// for pattern and returns a finder ready to search texts for it. The finder
// keeps a reference to pattern rather than copying it.
func makeBytesFinder(pattern []byte) *bytesFinder {
	f := &bytesFinder{
		pattern:        pattern,
		goodSuffixSkip: make([]int, len(pattern)),
	}
	// last is the index of the last character in the pattern.
	last := len(pattern) - 1

	// Build bad character table.
	// Bytes not in the pattern can skip one pattern's length.
	for i := range f.badCharSkip {
		f.badCharSkip[i] = len(pattern)
	}
	// The loop condition is < instead of <= so that the last byte does not
	// have a zero distance to itself. Finding this byte out of place implies
	// that it is not in the last position.
	for i := 0; i < last; i++ {
		f.badCharSkip[pattern[i]] = last - i
	}

	// Build good suffix table.
	// First pass: set each value to the next index which starts a prefix of
	// pattern.
	lastPrefix := last
	for i := last; i >= 0; i-- {
		if bytes.HasPrefix(pattern, pattern[i+1:]) {
			lastPrefix = i + 1
		}
		// lastPrefix is the shift, and (last-i) is len(suffix).
		f.goodSuffixSkip[i] = lastPrefix + last - i
	}
	// Second pass: find repeats of pattern's suffix starting from the front.
	for i := 0; i < last; i++ {
		lenSuffix := longestCommonSuffixBytes(pattern, pattern[1:i+1])
		if pattern[i-lenSuffix] != pattern[last-lenSuffix] {
			// (last-i) is the shift, and lenSuffix is len(suffix).
			f.goodSuffixSkip[last-lenSuffix] = lenSuffix + last - i
		}
	}

	return f
}

// longestCommonSuffixBytes returns the number of trailing bytes that a and b
// have in common.
func longestCommonSuffixBytes(a, b []byte) int {
	n := 0
	for n < len(a) && n < len(b) && a[len(a)-1-n] == b[len(b)-1-n] {
		n++
	}
	return n
}

// next returns the index in text of the first occurrence of the pattern. If
// the pattern is not found, it returns -1.
func (f *bytesFinder) next(text []byte) int {
	i := len(f.pattern) - 1
	for i < len(text) {
		// Compare backwards from the end until the first un-matching character.
		j := len(f.pattern) - 1
		for j >= 0 && text[i] == f.pattern[j] {
			i--
			j--
		}
		if j < 0 {
			return i + 1 // match
		}
		// Shift by whichever table allows the larger safe jump.
		i += max(f.badCharSkip[text[i]], f.goodSuffixSkip[j])
	}
	return -1
}

// max returns the larger of a and b.
func max(a, b int) int {
	if a > b {
		return a
	}
	return b
}