github.com/v2fly/tools@v0.100.0/internal/lsp/fuzzy/matcher.go (about) 1 // Copyright 2019 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package fuzzy implements a fuzzy matching algorithm. 6 package fuzzy 7 8 import ( 9 "bytes" 10 "fmt" 11 ) 12 13 const ( 14 // MaxInputSize is the maximum size of the input scored against the fuzzy matcher. Longer inputs 15 // will be truncated to this size. 16 MaxInputSize = 127 17 // MaxPatternSize is the maximum size of the pattern used to construct the fuzzy matcher. Longer 18 // inputs are truncated to this size. 19 MaxPatternSize = 63 20 ) 21 22 type scoreVal int 23 24 func (s scoreVal) val() int { 25 return int(s) >> 1 26 } 27 28 func (s scoreVal) prevK() int { 29 return int(s) & 1 30 } 31 32 func score(val int, prevK int /*0 or 1*/) scoreVal { 33 return scoreVal(val<<1 + prevK) 34 } 35 36 // Matcher implements a fuzzy matching algorithm for scoring candidates against a pattern. 37 // The matcher does not support parallel usage. 38 type Matcher struct { 39 pattern string 40 patternLower []byte // lower-case version of the pattern 41 patternShort []byte // first characters of the pattern 42 caseSensitive bool // set if the pattern is mix-cased 43 44 patternRoles []RuneRole // the role of each character in the pattern 45 roles []RuneRole // the role of each character in the tested string 46 47 scores [MaxInputSize + 1][MaxPatternSize + 1][2]scoreVal 48 49 scoreScale float32 50 51 lastCandidateLen int // in bytes 52 lastCandidateMatched bool 53 54 // Here we save the last candidate in lower-case. This is basically a byte slice we reuse for 55 // performance reasons, so the slice is not reallocated for every candidate. 56 lowerBuf [MaxInputSize]byte 57 rolesBuf [MaxInputSize]RuneRole 58 } 59 60 func (m *Matcher) bestK(i, j int) int { 61 if m.scores[i][j][0].val() < m.scores[i][j][1].val() { 62 return 1 63 } 64 return 0 65 } 66 67 // NewMatcher returns a new fuzzy matcher for scoring candidates against the provided pattern. 68 func NewMatcher(pattern string) *Matcher { 69 if len(pattern) > MaxPatternSize { 70 pattern = pattern[:MaxPatternSize] 71 } 72 73 m := &Matcher{ 74 pattern: pattern, 75 patternLower: ToLower(pattern, nil), 76 } 77 78 for i, c := range m.patternLower { 79 if pattern[i] != c { 80 m.caseSensitive = true 81 break 82 } 83 } 84 85 if len(pattern) > 3 { 86 m.patternShort = m.patternLower[:3] 87 } else { 88 m.patternShort = m.patternLower 89 } 90 91 m.patternRoles = RuneRoles(pattern, nil) 92 93 if len(pattern) > 0 { 94 maxCharScore := 4 95 m.scoreScale = 1 / float32(maxCharScore*len(pattern)) 96 } 97 98 return m 99 } 100 101 // Score returns the score returned by matching the candidate to the pattern. 102 // This is not designed for parallel use. Multiple candidates must be scored sequentially. 103 // Returns a score between 0 and 1 (0 - no match, 1 - perfect match). 104 func (m *Matcher) Score(candidate string) float32 { 105 if len(candidate) > MaxInputSize { 106 candidate = candidate[:MaxInputSize] 107 } 108 lower := ToLower(candidate, m.lowerBuf[:]) 109 m.lastCandidateLen = len(candidate) 110 111 if len(m.pattern) == 0 { 112 // Empty patterns perfectly match candidates. 113 return 1 114 } 115 116 if m.match(candidate, lower) { 117 sc := m.computeScore(candidate, lower) 118 if sc > minScore/2 && !m.poorMatch() { 119 m.lastCandidateMatched = true 120 if len(m.pattern) == len(candidate) { 121 // Perfect match. 122 return 1 123 } 124 125 if sc < 0 { 126 sc = 0 127 } 128 normalizedScore := float32(sc) * m.scoreScale 129 if normalizedScore > 1 { 130 normalizedScore = 1 131 } 132 133 return normalizedScore 134 } 135 } 136 137 m.lastCandidateMatched = false 138 return 0 139 } 140 141 const minScore = -10000 142 143 // MatchedRanges returns matches ranges for the last scored string as a flattened array of 144 // [begin, end) byte offset pairs. 145 func (m *Matcher) MatchedRanges() []int { 146 if len(m.pattern) == 0 || !m.lastCandidateMatched { 147 return nil 148 } 149 i, j := m.lastCandidateLen, len(m.pattern) 150 if m.scores[i][j][0].val() < minScore/2 && m.scores[i][j][1].val() < minScore/2 { 151 return nil 152 } 153 154 var ret []int 155 k := m.bestK(i, j) 156 for i > 0 { 157 take := (k == 1) 158 k = m.scores[i][j][k].prevK() 159 if take { 160 if len(ret) == 0 || ret[len(ret)-1] != i { 161 ret = append(ret, i) 162 ret = append(ret, i-1) 163 } else { 164 ret[len(ret)-1] = i - 1 165 } 166 j-- 167 } 168 i-- 169 } 170 // Reverse slice. 171 for i := 0; i < len(ret)/2; i++ { 172 ret[i], ret[len(ret)-1-i] = ret[len(ret)-1-i], ret[i] 173 } 174 return ret 175 } 176 177 func (m *Matcher) match(candidate string, candidateLower []byte) bool { 178 i, j := 0, 0 179 for ; i < len(candidateLower) && j < len(m.patternLower); i++ { 180 if candidateLower[i] == m.patternLower[j] { 181 j++ 182 } 183 } 184 if j != len(m.patternLower) { 185 return false 186 } 187 188 // The input passes the simple test against pattern, so it is time to classify its characters. 189 // Character roles are used below to find the last segment. 190 m.roles = RuneRoles(candidate, m.rolesBuf[:]) 191 192 return true 193 } 194 195 func (m *Matcher) computeScore(candidate string, candidateLower []byte) int { 196 pattLen, candLen := len(m.pattern), len(candidate) 197 198 for j := 0; j <= len(m.pattern); j++ { 199 m.scores[0][j][0] = minScore << 1 200 m.scores[0][j][1] = minScore << 1 201 } 202 m.scores[0][0][0] = score(0, 0) // Start with 0. 203 204 segmentsLeft, lastSegStart := 1, 0 205 for i := 0; i < candLen; i++ { 206 if m.roles[i] == RSep { 207 segmentsLeft++ 208 lastSegStart = i + 1 209 } 210 } 211 212 // A per-character bonus for a consecutive match. 213 consecutiveBonus := 2 214 wordIdx := 0 // Word count within segment. 215 for i := 1; i <= candLen; i++ { 216 217 role := m.roles[i-1] 218 isHead := role == RHead 219 220 if isHead { 221 wordIdx++ 222 } else if role == RSep && segmentsLeft > 1 { 223 wordIdx = 0 224 segmentsLeft-- 225 } 226 227 var skipPenalty int 228 if i == 1 || (i-1) == lastSegStart { 229 // Skipping the start of first or last segment. 230 skipPenalty++ 231 } 232 233 for j := 0; j <= pattLen; j++ { 234 // By default, we don't have a match. Fill in the skip data. 235 m.scores[i][j][1] = minScore << 1 236 237 // Compute the skip score. 238 k := 0 239 if m.scores[i-1][j][0].val() < m.scores[i-1][j][1].val() { 240 k = 1 241 } 242 243 skipScore := m.scores[i-1][j][k].val() 244 // Do not penalize missing characters after the last matched segment. 245 if j != pattLen { 246 skipScore -= skipPenalty 247 } 248 m.scores[i][j][0] = score(skipScore, k) 249 250 if j == 0 || candidateLower[i-1] != m.patternLower[j-1] { 251 // Not a match. 252 continue 253 } 254 pRole := m.patternRoles[j-1] 255 256 if role == RTail && pRole == RHead { 257 if j > 1 { 258 // Not a match: a head in the pattern matches a tail character in the candidate. 259 continue 260 } 261 // Special treatment for the first character of the pattern. We allow 262 // matches in the middle of a word if they are long enough, at least 263 // min(3, pattern.length) characters. 264 if !bytes.HasPrefix(candidateLower[i-1:], m.patternShort) { 265 continue 266 } 267 } 268 269 // Compute the char score. 270 var charScore int 271 // Bonus 1: the char is in the candidate's last segment. 272 if segmentsLeft <= 1 { 273 charScore++ 274 } 275 // Bonus 2: Case match or a Head in the pattern aligns with one in the word. 276 // Single-case patterns lack segmentation signals and we assume any character 277 // can be a head of a segment. 278 if candidate[i-1] == m.pattern[j-1] || role == RHead && (!m.caseSensitive || pRole == RHead) { 279 charScore++ 280 } 281 282 // Penalty 1: pattern char is Head, candidate char is Tail. 283 if role == RTail && pRole == RHead { 284 charScore-- 285 } 286 // Penalty 2: first pattern character matched in the middle of a word. 287 if j == 1 && role == RTail { 288 charScore -= 4 289 } 290 291 // Third dimension encodes whether there is a gap between the previous match and the current 292 // one. 293 for k := 0; k < 2; k++ { 294 sc := m.scores[i-1][j-1][k].val() + charScore 295 296 isConsecutive := k == 1 || i-1 == 0 || i-1 == lastSegStart 297 if isConsecutive { 298 // Bonus 3: a consecutive match. First character match also gets a bonus to 299 // ensure prefix final match score normalizes to 1.0. 300 // Logically, this is a part of charScore, but we have to compute it here because it 301 // only applies for consecutive matches (k == 1). 302 sc += consecutiveBonus 303 } 304 if k == 0 { 305 // Penalty 3: Matching inside a segment (and previous char wasn't matched). Penalize for the lack 306 // of alignment. 307 if role == RTail || role == RUCTail { 308 sc -= 3 309 } 310 } 311 312 if sc > m.scores[i][j][1].val() { 313 m.scores[i][j][1] = score(sc, k) 314 } 315 } 316 } 317 } 318 319 result := m.scores[len(candidate)][len(m.pattern)][m.bestK(len(candidate), len(m.pattern))].val() 320 321 return result 322 } 323 324 // ScoreTable returns the score table computed for the provided candidate. Used only for debugging. 325 func (m *Matcher) ScoreTable(candidate string) string { 326 var buf bytes.Buffer 327 328 var line1, line2, separator bytes.Buffer 329 line1.WriteString("\t") 330 line2.WriteString("\t") 331 for j := 0; j < len(m.pattern); j++ { 332 line1.WriteString(fmt.Sprintf("%c\t\t", m.pattern[j])) 333 separator.WriteString("----------------") 334 } 335 336 buf.WriteString(line1.String()) 337 buf.WriteString("\n") 338 buf.WriteString(separator.String()) 339 buf.WriteString("\n") 340 341 for i := 1; i <= len(candidate); i++ { 342 line1.Reset() 343 line2.Reset() 344 345 line1.WriteString(fmt.Sprintf("%c\t", candidate[i-1])) 346 line2.WriteString("\t") 347 348 for j := 1; j <= len(m.pattern); j++ { 349 line1.WriteString(fmt.Sprintf("M%6d(%c)\t", m.scores[i][j][0].val(), dir(m.scores[i][j][0].prevK()))) 350 line2.WriteString(fmt.Sprintf("H%6d(%c)\t", m.scores[i][j][1].val(), dir(m.scores[i][j][1].prevK()))) 351 } 352 buf.WriteString(line1.String()) 353 buf.WriteString("\n") 354 buf.WriteString(line2.String()) 355 buf.WriteString("\n") 356 buf.WriteString(separator.String()) 357 buf.WriteString("\n") 358 } 359 360 return buf.String() 361 } 362 363 func dir(prevK int) rune { 364 if prevK == 0 { 365 return 'M' 366 } 367 return 'H' 368 } 369 370 func (m *Matcher) poorMatch() bool { 371 if len(m.pattern) < 2 { 372 return false 373 } 374 375 i, j := m.lastCandidateLen, len(m.pattern) 376 k := m.bestK(i, j) 377 378 var counter, len int 379 for i > 0 { 380 take := (k == 1) 381 k = m.scores[i][j][k].prevK() 382 if take { 383 len++ 384 if k == 0 && len < 3 && m.roles[i-1] == RTail { 385 // Short match in the middle of a word 386 counter++ 387 if counter > 1 { 388 return true 389 } 390 } 391 j-- 392 } else { 393 len = 0 394 } 395 i-- 396 } 397 return false 398 }