github.com/errata-ai/vale/v3@v3.4.2/internal/nlp/tokenize.go

package nlp

import (
	"regexp"
	"strings"
	"unicode"
	"unicode/utf8"
)

type TokenTester func(string) bool

type Tokenizer interface {
	Tokenize(string) []string
}

// IterTokenizer splits a sentence into words.
type IterTokenizer struct {
	specialRE      *regexp.Regexp
	sanitizer      *strings.Replacer
	contractions   []string
	splitCases     []string
	suffixes       []string
	prefixes       []string
	emoticons      map[string]int
	isUnsplittable TokenTester
}

type TokenizerOptFunc func(*IterTokenizer)

// UsingIsUnsplittable sets the function that tests whether a token is
// unsplittable.
func UsingIsUnsplittable(x TokenTester) TokenizerOptFunc {
	return func(tokenizer *IterTokenizer) {
		tokenizer.isUnsplittable = x
	}
}

// UsingSpecialRE sets the provided special regex for unsplittable tokens.
func UsingSpecialRE(x *regexp.Regexp) TokenizerOptFunc {
	return func(tokenizer *IterTokenizer) {
		tokenizer.specialRE = x
	}
}

// UsingSanitizer sets the provided sanitizer.
func UsingSanitizer(x *strings.Replacer) TokenizerOptFunc {
	return func(tokenizer *IterTokenizer) {
		tokenizer.sanitizer = x
	}
}

// UsingSuffixes sets the provided suffixes.
func UsingSuffixes(x []string) TokenizerOptFunc {
	return func(tokenizer *IterTokenizer) {
		tokenizer.suffixes = x
	}
}

// UsingPrefixes sets the provided prefixes.
func UsingPrefixes(x []string) TokenizerOptFunc {
	return func(tokenizer *IterTokenizer) {
		tokenizer.prefixes = x
	}
}

// UsingEmoticons sets the provided map of emoticons.
func UsingEmoticons(x map[string]int) TokenizerOptFunc {
	return func(tokenizer *IterTokenizer) {
		tokenizer.emoticons = x
	}
}

// UsingContractions sets the provided contractions.
func UsingContractions(x []string) TokenizerOptFunc {
	return func(tokenizer *IterTokenizer) {
		tokenizer.contractions = x
	}
}

// UsingSplitCases sets the provided splitCases.
func UsingSplitCases(x []string) TokenizerOptFunc {
	return func(tokenizer *IterTokenizer) {
		tokenizer.splitCases = x
	}
}

// NewIterTokenizer creates a new IterTokenizer.
func NewIterTokenizer(opts ...TokenizerOptFunc) *IterTokenizer {
	tok := new(IterTokenizer)

	// Set default parameters.
	tok.emoticons = emoticons
	tok.isUnsplittable = func(_ string) bool { return false }
	tok.prefixes = prefixes
	tok.sanitizer = sanitizer
	tok.specialRE = internalRE
	tok.suffixes = suffixes

	// Apply options, if provided.
	for _, applyOpt := range opts {
		applyOpt(tok)
	}

	return tok
}

// addToken appends s to toks unless s consists entirely of non-letter
// characters (e.g., stray punctuation).
func addToken(s string, toks []string) []string {
	if !allNonLetter(s) {
		toks = append(toks, s)
	}
	return toks
}

func (t *IterTokenizer) isSpecial(token string) bool {
	_, found := t.emoticons[token]
	return found || t.specialRE.MatchString(token) || t.isUnsplittable(token)
}

func (t *IterTokenizer) doSplit(token string) []string {
	var tokens []string

	// Keep splitting until a pass leaves the token unchanged.
	last := 0
	for token != "" && StrLen(token) != last {
		if t.isSpecial(token) {
			// We've found a special case (e.g., an emoticon) -- so, we add it
			// as a token without any further processing.
			tokens = addToken(token, tokens)
			break
		}
		last = StrLen(token)
		lower := strings.ToLower(token)
		if hasAnyPrefix(token, t.prefixes) {
			// Remove prefixes -- e.g., $100 -> 100 (the "$" is dropped).
			token = token[1:]
		} else if idx := hasAnyIndex(lower, t.splitCases); idx > -1 {
			// Handle "they'll", "I'll", "Don't", "won't", amount($).
			//
			// they'll -> [they, 'll].
			// don't -> [do, n't].
			// amount($) -> [amount] (the punctuation is dropped by addToken).
			tokens = addToken(token[:idx], tokens)
			token = token[idx:]
		} else if hasAnySuffix(token, t.suffixes) {
			// Remove suffixes -- e.g., Well) -> Well (the ")" is dropped).
			token = token[:len(token)-1]
		} else {
			tokens = addToken(token, tokens)
		}
	}

	return tokens
}

// Tokenize splits a sentence into a slice of words.
func (t *IterTokenizer) Tokenize(text string) []string {
	var tokens []string

	clean, white := t.sanitizer.Replace(text), false
	length := len(clean)

	// Scan rune by rune, splitting on whitespace/non-whitespace boundaries.
	// Each non-whitespace span is passed to doSplit; results are cached so
	// that repeated spans are only split once.
	start, index := 0, 0
	cache := map[string][]string{}
	for index <= length {
		uc, size := utf8.DecodeRuneInString(clean[index:])
		if size == 0 {
			break
		} else if index == 0 {
			white = unicode.IsSpace(uc)
		}
		if unicode.IsSpace(uc) != white {
			if start < index {
				span := clean[start:index]
				if toks, found := cache[span]; found {
					tokens = append(tokens, toks...)
				} else {
					toks = t.doSplit(span)
					cache[span] = toks
					tokens = append(tokens, toks...)
				}
			}
			if uc == ' ' {
				start = index + 1
			} else {
				start = index
			}
			white = !white
		}
		index += size
	}

	// Handle the final (or only) span.
	if start < index {
		tokens = append(tokens, t.doSplit(clean[start:index])...)
	}

	return tokens
}
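
// A minimal usage sketch, assuming the package-level defaults (sanitizer,
// prefixes, suffixes, emoticons) defined elsewhere in this package. The
// split cases and the unsplittable rule below are illustrative values, not
// Vale's own configuration:
//
//	tok := NewIterTokenizer(
//		UsingSplitCases([]string{"'ll", "n't"}),
//		UsingIsUnsplittable(func(s string) bool {
//			// Hypothetical rule: never split @-mentions.
//			return strings.HasPrefix(s, "@")
//		}),
//	)
//	words := tok.Tokenize("They'll be tokenized, won't they?")
//
// Assuming "," and "?" are among the default suffixes, this yields
// [They 'll be tokenized wo n't they]: the split cases are applied by
// doSplit, the trailing punctuation is stripped, and any punctuation-only
// pieces are discarded by addToken.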