github.com/v2fly/tools@v0.100.0/internal/lsp/fuzzy/input.go (about)

     1  // Copyright 2019 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package fuzzy
     6  
     7  import (
     8  	"unicode"
     9  )
    10  
    11  // RuneRole specifies the role of a rune in the context of an input.
    12  type RuneRole byte
    13  
    14  const (
    15  	// RNone specifies a rune without any role in the input (i.e., whitespace/non-ASCII).
    16  	RNone RuneRole = iota
    17  	// RSep specifies a rune with the role of segment separator.
    18  	RSep
    19  	// RTail specifies a rune which is a lower-case tail in a word in the input.
    20  	RTail
    21  	// RUCTail specifies a rune which is an upper-case tail in a word in the input.
    22  	RUCTail
    23  	// RHead specifies a rune which is the first character in a word in the input.
    24  	RHead
    25  )
    26  
    27  // RuneRoles detects the roles of each byte rune in an input string and stores it in the output
    28  // slice. The rune role depends on the input type. Stops when it parsed all the runes in the string
    29  // or when it filled the output. If output is nil, then it gets created.
    30  func RuneRoles(str string, reuse []RuneRole) []RuneRole {
    31  	var output []RuneRole
    32  	if cap(reuse) < len(str) {
    33  		output = make([]RuneRole, 0, len(str))
    34  	} else {
    35  		output = reuse[:0]
    36  	}
    37  
    38  	prev, prev2 := rtNone, rtNone
    39  	for i := 0; i < len(str); i++ {
    40  		r := rune(str[i])
    41  
    42  		role := RNone
    43  
    44  		curr := rtLower
    45  		if str[i] <= unicode.MaxASCII {
    46  			curr = runeType(rt[str[i]] - '0')
    47  		}
    48  
    49  		if curr == rtLower {
    50  			if prev == rtNone || prev == rtPunct {
    51  				role = RHead
    52  			} else {
    53  				role = RTail
    54  			}
    55  		} else if curr == rtUpper {
    56  			role = RHead
    57  
    58  			if prev == rtUpper {
    59  				// This and previous characters are both upper case.
    60  
    61  				if i+1 == len(str) {
    62  					// This is last character, previous was also uppercase -> this is UCTail
    63  					// i.e., (current char is C): aBC / BC / ABC
    64  					role = RUCTail
    65  				}
    66  			}
    67  		} else if curr == rtPunct {
    68  			switch r {
    69  			case '.', ':':
    70  				role = RSep
    71  			}
    72  		}
    73  		if curr != rtLower {
    74  			if i > 1 && output[i-1] == RHead && prev2 == rtUpper && (output[i-2] == RHead || output[i-2] == RUCTail) {
    75  				// The previous two characters were uppercase. The current one is not a lower case, so the
    76  				// previous one can't be a HEAD. Make it a UCTail.
    77  				// i.e., (last char is current char - B must be a UCTail): ABC / ZABC / AB.
    78  				output[i-1] = RUCTail
    79  			}
    80  		}
    81  
    82  		output = append(output, role)
    83  		prev2 = prev
    84  		prev = curr
    85  	}
    86  	return output
    87  }
    88  
    89  type runeType byte
    90  
    91  const (
    92  	rtNone runeType = iota
    93  	rtPunct
    94  	rtLower
    95  	rtUpper
    96  )
    97  
    98  const rt = "00000000000000000000000000000000000000000000001122222222221000000333333333333333333333333330000002222222222222222222222222200000"
    99  
   100  // LastSegment returns the substring representing the last segment from the input, where each
   101  // byte has an associated RuneRole in the roles slice. This makes sense only for inputs of Symbol
   102  // or Filename type.
   103  func LastSegment(input string, roles []RuneRole) string {
   104  	// Exclude ending separators.
   105  	end := len(input) - 1
   106  	for end >= 0 && roles[end] == RSep {
   107  		end--
   108  	}
   109  	if end < 0 {
   110  		return ""
   111  	}
   112  
   113  	start := end - 1
   114  	for start >= 0 && roles[start] != RSep {
   115  		start--
   116  	}
   117  
   118  	return input[start+1 : end+1]
   119  }
   120  
   121  // ToLower transforms the input string to lower case, which is stored in the output byte slice.
   122  // The lower casing considers only ASCII values - non ASCII values are left unmodified.
   123  // Stops when parsed all input or when it filled the output slice. If output is nil, then it gets
   124  // created.
   125  func ToLower(input string, reuse []byte) []byte {
   126  	output := reuse
   127  	if cap(reuse) < len(input) {
   128  		output = make([]byte, len(input))
   129  	}
   130  
   131  	for i := 0; i < len(input); i++ {
   132  		r := rune(input[i])
   133  		if r <= unicode.MaxASCII {
   134  			if 'A' <= r && r <= 'Z' {
   135  				r += 'a' - 'A'
   136  			}
   137  		}
   138  		output[i] = byte(r)
   139  	}
   140  	return output[:len(input)]
   141  }
   142  
   143  // WordConsumer defines a consumer for a word delimited by the [start,end) byte offsets in an input
   144  // (start is inclusive, end is exclusive).
   145  type WordConsumer func(start, end int)
   146  
   147  // Words find word delimiters in an input based on its bytes' mappings to rune roles. The offset
   148  // delimiters for each word are fed to the provided consumer function.
   149  func Words(roles []RuneRole, consume WordConsumer) {
   150  	var wordStart int
   151  	for i, r := range roles {
   152  		switch r {
   153  		case RUCTail, RTail:
   154  		case RHead, RNone, RSep:
   155  			if i != wordStart {
   156  				consume(wordStart, i)
   157  			}
   158  			wordStart = i
   159  			if r != RHead {
   160  				// Skip this character.
   161  				wordStart = i + 1
   162  			}
   163  		}
   164  	}
   165  	if wordStart != len(roles) {
   166  		consume(wordStart, len(roles))
   167  	}
   168  }