github.com/pgavlin/text@v0.0.0-20240419000839-8438d0a47805/search.go (about) 1 // Copyright 2012 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package text 6 7 // stringFinder efficiently finds strings in a source text. It's implemented 8 // using the Boyer-Moore string search algorithm: 9 // https://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm 10 // https://www.cs.utexas.edu/~moore/publications/fstrpos.pdf (note: this aged 11 // document uses 1-based indexing) 12 type stringFinder[S String] struct { 13 // pattern is the string that we are searching for in the text. 14 pattern S 15 16 // badCharSkip[b] contains the distance between the last byte of pattern 17 // and the rightmost occurrence of b in pattern. If b is not in pattern, 18 // badCharSkip[b] is len(pattern). 19 // 20 // Whenever a mismatch is found with byte b in the text, we can safely 21 // shift the matching frame at least badCharSkip[b] until the next time 22 // the matching char could be in alignment. 23 badCharSkip [256]int 24 25 // goodSuffixSkip[i] defines how far we can shift the matching frame given 26 // that the suffix pattern[i+1:] matches, but the byte pattern[i] does 27 // not. There are two cases to consider: 28 // 29 // 1. The matched suffix occurs elsewhere in pattern (with a different 30 // byte preceding it that we might possibly match). In this case, we can 31 // shift the matching frame to align with the next suffix chunk. For 32 // example, the pattern "mississi" has the suffix "issi" next occurring 33 // (in right-to-left order) at index 1, so goodSuffixSkip[3] == 34 // shift+len(suffix) == 3+4 == 7. 35 // 36 // 2. If the matched suffix does not occur elsewhere in pattern, then the 37 // matching frame may share part of its prefix with the end of the 38 // matching suffix. In this case, goodSuffixSkip[i] will contain how far 39 // to shift the frame to align this portion of the prefix to the 40 // suffix. For example, in the pattern "abcxxxabc", when the first 41 // mismatch from the back is found to be in position 3, the matching 42 // suffix "xxabc" is not found elsewhere in the pattern. However, its 43 // rightmost "abc" (at position 6) is a prefix of the whole pattern, so 44 // goodSuffixSkip[3] == shift+len(suffix) == 6+5 == 11. 45 goodSuffixSkip []int 46 } 47 48 func makeStringFinder[S String](pattern S) *stringFinder[S] { 49 f := &stringFinder[S]{ 50 pattern: pattern, 51 goodSuffixSkip: make([]int, len(pattern)), 52 } 53 // last is the index of the last character in the pattern. 54 last := len(pattern) - 1 55 56 // Build bad character table. 57 // Bytes not in the pattern can skip one pattern's length. 58 for i := range f.badCharSkip { 59 f.badCharSkip[i] = len(pattern) 60 } 61 // The loop condition is < instead of <= so that the last byte does not 62 // have a zero distance to itself. Finding this byte out of place implies 63 // that it is not in the last position. 64 for i := 0; i < last; i++ { 65 f.badCharSkip[pattern[i]] = last - i 66 } 67 68 // Build good suffix table. 69 // First pass: set each value to the next index which starts a prefix of 70 // pattern. 71 lastPrefix := last 72 for i := last; i >= 0; i-- { 73 if HasPrefix(pattern, pattern[i+1:]) { 74 lastPrefix = i + 1 75 } 76 // lastPrefix is the shift, and (last-i) is len(suffix). 77 f.goodSuffixSkip[i] = lastPrefix + last - i 78 } 79 // Second pass: find repeats of pattern's suffix starting from the front. 80 for i := 0; i < last; i++ { 81 lenSuffix := longestCommonSuffix(pattern, pattern[1:i+1]) 82 if pattern[i-lenSuffix] != pattern[last-lenSuffix] { 83 // (last-i) is the shift, and lenSuffix is len(suffix). 84 f.goodSuffixSkip[last-lenSuffix] = lenSuffix + last - i 85 } 86 } 87 88 return f 89 } 90 91 func longestCommonSuffix[S1, S2 String](a S1, b S2) (i int) { 92 for ; i < len(a) && i < len(b); i++ { 93 if a[len(a)-1-i] != b[len(b)-1-i] { 94 break 95 } 96 } 97 return 98 } 99 100 // next returns the index in text of the first occurrence of the pattern. If 101 // the pattern is not found, it returns -1. 102 func (f *stringFinder[S]) next(text S) int { 103 i := len(f.pattern) - 1 104 for i < len(text) { 105 // Compare backwards from the end until the first unmatching character. 106 j := len(f.pattern) - 1 107 for j >= 0 && text[i] == f.pattern[j] { 108 i-- 109 j-- 110 } 111 if j < 0 { 112 return i + 1 // match 113 } 114 i += max(f.badCharSkip[text[i]], f.goodSuffixSkip[j]) 115 } 116 return -1 117 } 118 119 func max(a, b int) int { 120 if a > b { 121 return a 122 } 123 return b 124 }