github.com/cockroachdb/tools@v0.0.0-20230222021103-a6d27438930d/internal/fuzzy/input.go (about) 1 // Copyright 2019 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package fuzzy 6 7 import ( 8 "unicode" 9 ) 10 11 // RuneRole specifies the role of a rune in the context of an input. 12 type RuneRole byte 13 14 const ( 15 // RNone specifies a rune without any role in the input (i.e., whitespace/non-ASCII). 16 RNone RuneRole = iota 17 // RSep specifies a rune with the role of segment separator. 18 RSep 19 // RTail specifies a rune which is a lower-case tail in a word in the input. 20 RTail 21 // RUCTail specifies a rune which is an upper-case tail in a word in the input. 22 RUCTail 23 // RHead specifies a rune which is the first character in a word in the input. 24 RHead 25 ) 26 27 // RuneRoles detects the roles of each byte rune in an input string and stores it in the output 28 // slice. The rune role depends on the input type. Stops when it parsed all the runes in the string 29 // or when it filled the output. If output is nil, then it gets created. 30 func RuneRoles(candidate []byte, reuse []RuneRole) []RuneRole { 31 var output []RuneRole 32 if cap(reuse) < len(candidate) { 33 output = make([]RuneRole, 0, len(candidate)) 34 } else { 35 output = reuse[:0] 36 } 37 38 prev, prev2 := rtNone, rtNone 39 for i := 0; i < len(candidate); i++ { 40 r := rune(candidate[i]) 41 42 role := RNone 43 44 curr := rtLower 45 if candidate[i] <= unicode.MaxASCII { 46 curr = runeType(rt[candidate[i]] - '0') 47 } 48 49 if curr == rtLower { 50 if prev == rtNone || prev == rtPunct { 51 role = RHead 52 } else { 53 role = RTail 54 } 55 } else if curr == rtUpper { 56 role = RHead 57 58 if prev == rtUpper { 59 // This and previous characters are both upper case. 60 61 if i+1 == len(candidate) { 62 // This is last character, previous was also uppercase -> this is UCTail 63 // i.e., (current char is C): aBC / BC / ABC 64 role = RUCTail 65 } 66 } 67 } else if curr == rtPunct { 68 switch r { 69 case '.', ':': 70 role = RSep 71 } 72 } 73 if curr != rtLower { 74 if i > 1 && output[i-1] == RHead && prev2 == rtUpper && (output[i-2] == RHead || output[i-2] == RUCTail) { 75 // The previous two characters were uppercase. The current one is not a lower case, so the 76 // previous one can't be a HEAD. Make it a UCTail. 77 // i.e., (last char is current char - B must be a UCTail): ABC / ZABC / AB. 78 output[i-1] = RUCTail 79 } 80 } 81 82 output = append(output, role) 83 prev2 = prev 84 prev = curr 85 } 86 return output 87 } 88 89 type runeType byte 90 91 const ( 92 rtNone runeType = iota 93 rtPunct 94 rtLower 95 rtUpper 96 ) 97 98 const rt = "00000000000000000000000000000000000000000000001122222222221000000333333333333333333333333330000002222222222222222222222222200000" 99 100 // LastSegment returns the substring representing the last segment from the input, where each 101 // byte has an associated RuneRole in the roles slice. This makes sense only for inputs of Symbol 102 // or Filename type. 103 func LastSegment(input string, roles []RuneRole) string { 104 // Exclude ending separators. 105 end := len(input) - 1 106 for end >= 0 && roles[end] == RSep { 107 end-- 108 } 109 if end < 0 { 110 return "" 111 } 112 113 start := end - 1 114 for start >= 0 && roles[start] != RSep { 115 start-- 116 } 117 118 return input[start+1 : end+1] 119 } 120 121 // fromChunks copies string chunks into the given buffer. 122 func fromChunks(chunks []string, buffer []byte) []byte { 123 ii := 0 124 for _, chunk := range chunks { 125 for i := 0; i < len(chunk); i++ { 126 if ii >= cap(buffer) { 127 break 128 } 129 buffer[ii] = chunk[i] 130 ii++ 131 } 132 } 133 return buffer[:ii] 134 } 135 136 // toLower transforms the input string to lower case, which is stored in the output byte slice. 137 // The lower casing considers only ASCII values - non ASCII values are left unmodified. 138 // Stops when parsed all input or when it filled the output slice. If output is nil, then it gets 139 // created. 140 func toLower(input []byte, reuse []byte) []byte { 141 output := reuse 142 if cap(reuse) < len(input) { 143 output = make([]byte, len(input)) 144 } 145 146 for i := 0; i < len(input); i++ { 147 r := rune(input[i]) 148 if input[i] <= unicode.MaxASCII { 149 if 'A' <= r && r <= 'Z' { 150 r += 'a' - 'A' 151 } 152 } 153 output[i] = byte(r) 154 } 155 return output[:len(input)] 156 } 157 158 // WordConsumer defines a consumer for a word delimited by the [start,end) byte offsets in an input 159 // (start is inclusive, end is exclusive). 160 type WordConsumer func(start, end int) 161 162 // Words find word delimiters in an input based on its bytes' mappings to rune roles. The offset 163 // delimiters for each word are fed to the provided consumer function. 164 func Words(roles []RuneRole, consume WordConsumer) { 165 var wordStart int 166 for i, r := range roles { 167 switch r { 168 case RUCTail, RTail: 169 case RHead, RNone, RSep: 170 if i != wordStart { 171 consume(wordStart, i) 172 } 173 wordStart = i 174 if r != RHead { 175 // Skip this character. 176 wordStart = i + 1 177 } 178 } 179 } 180 if wordStart != len(roles) { 181 consume(wordStart, len(roles)) 182 } 183 }