github.com/coyove/sdss@v0.0.0-20231129015646-c2ec58cca6a2/contrib/ngram/token.go (about) 1 package ngram 2 3 import ( 4 "bytes" 5 "fmt" 6 "strings" 7 "unicode" 8 "unicode/utf8" 9 ) 10 11 type Token struct { 12 Name string `json:"name"` 13 Raw string `json:"raw"` 14 Freq float64 `json:"freq"` 15 } 16 17 func (tok Token) String() string { 18 s := tok.Name 19 if s != tok.Raw { 20 s = "<" + s + "," + tok.Raw + ">" 21 } 22 return fmt.Sprintf("%s(%.3f)", s, tok.Freq) 23 } 24 25 type Results map[string]Token 26 27 func (r Results) Contains(r2 Results) bool { 28 for k := range r2 { 29 if _, ok := r[k]; !ok { 30 return false 31 } 32 } 33 return true 34 } 35 36 func (r Results) String() string { 37 var lines [][2]string 38 var max1 int 39 for k, v := range r { 40 y := "" 41 for _, r := range k { 42 if r < 128 { 43 y += fmt.Sprintf("%c ", r) 44 } else if r < 65536 { 45 y += fmt.Sprintf("\\u%04X ", r) 46 } else { 47 y += fmt.Sprintf("\\U%08X ", r) 48 } 49 } 50 z := v.String() 51 lines = append(lines, [2]string{y, z}) 52 if len(y) > max1 { 53 max1 = len(y) 54 } 55 } 56 if max1 > 50 { 57 max1 = 50 58 } 59 60 buf := &bytes.Buffer{} 61 for _, line := range lines { 62 buf.WriteString(line[0]) 63 for i := 0; i < max1-len(line[0]); i++ { 64 buf.WriteByte(' ') 65 } 66 buf.WriteString(line[1]) 67 buf.WriteString("\n") 68 } 69 return buf.String() 70 } 71 72 func (r Results) Hashes() (qs []uint64) { 73 for k := range r { 74 qs = append(qs, StrHash(k)) 75 } 76 return 77 } 78 79 func Split(text string) (res Results) { 80 return doSplit(text, false) 81 } 82 83 func SplitMore(text string) (res Results) { 84 return doSplit(text, true) 85 } 86 87 func doSplit(text string, more bool) (res Results) { 88 // text = removeAccents(text) 89 res = map[string]Token{} 90 sp := splitter{ 91 more: more, 92 freq: map[string]float64{}, 93 } 94 95 prevStart, prevRune, prevRuneNormalized := 0, utf8.RuneError, utf8.RuneError 96 // inQuote := false 97 98 var i int 99 for i < len(text) { 100 r, sz := utf8.DecodeRuneInString(text[i:]) 101 if r == utf8.RuneError { 102 goto BREAK 103 } 104 105 if eps, ok := emojiTree[r]; ok { 106 found := false 107 for _, ep := range eps { 108 if strings.HasPrefix(text[i:], ep) { 109 sp.freq[ep]++ 110 sp.total++ 111 if prevRune != utf8.RuneError { 112 sp.do(text[prevStart:i], res, false) 113 prevRune = utf8.RuneError 114 } 115 i += len(ep) 116 prevStart = i 117 found = true 118 break 119 } 120 } 121 if found { 122 continue 123 } 124 } 125 126 // fmt.Println(string(lastr), string(r), isdiff(lastr, r)) 127 if prevRune != utf8.RuneError { 128 isdiff := false 129 if IsContinue(prevRune) != IsContinue(r) { 130 isdiff = true 131 } 132 if (prevRuneNormalized <= utf8.RuneSelf) != (Normalize(r) <= utf8.RuneSelf) { 133 isdiff = true 134 } 135 // fmt.Println(text[prevStart:i], string(prevRuneNormalized), string(prevRune)) 136 if isdiff { 137 sp.do(text[prevStart:i], res, false) 138 prevStart = i 139 } 140 } 141 i += sz 142 143 if IsContinue(r) { 144 prevRune = r 145 prevRuneNormalized = Normalize(r) 146 } else { 147 if r > 65535 || (more && !unicode.IsSpace(r)) { 148 t := text[prevStart:i] 149 sp.freq[t]++ 150 sp.total++ 151 } 152 prevRune = utf8.RuneError 153 prevStart = i 154 // inQuote = r == '"' 155 } 156 } 157 sp.do(text[prevStart:], res, false) 158 159 BREAK: 160 for k, v := range sp.freq { 161 tok := res[k] 162 if tok.Name == "" { 163 tok.Name, tok.Raw = k, k 164 } 165 tok.Freq = v / float64(sp.total) 166 res[k] = tok 167 } 168 return 169 } 170 171 type splitter struct { 172 tmpbuf bytes.Buffer 173 total int 174 more bool 175 lastSplitText string 176 freq map[string]float64 177 } 178 179 func (s *splitter) do(v string, res map[string]Token, inQuote bool) { 180 if v == "" { 181 return 182 } 183 184 r, _ := utf8.DecodeRuneInString(v) 185 if s.lastSplitText != "" { 186 lastr, _ := utf8.DecodeLastRuneInString(s.lastSplitText) 187 if (lastr <= utf8.RuneSelf) != (r <= utf8.RuneSelf) { 188 lastrn := Normalize(lastr) 189 rn := Normalize(r) 190 if (lastrn <= utf8.RuneSelf) != (rn <= utf8.RuneSelf) { // test again 191 s.tmpbuf.Reset() 192 s.tmpbuf.WriteRune(unicode.ToLower(lastrn)) 193 s.tmpbuf.WriteRune(unicode.ToLower(rn)) 194 n := s.tmpbuf.Len() 195 s.tmpbuf.WriteRune(lastr) 196 s.tmpbuf.WriteRune(r) 197 x := s.tmpbuf.String() 198 199 s.freq[x[:n]]++ 200 res[x[:n]] = Token{Name: x[:n], Raw: x[n:]} // , Quoted: inQuote} 201 s.total++ 202 } 203 } 204 } 205 // fmt.Println(lastSplitText, v) 206 s.lastSplitText = v 207 208 if Normalize(r) < utf8.RuneSelf || unicode.IsLower(r) || unicode.IsUpper(r) { 209 if len(v) == 1 && !s.more { 210 return 211 } 212 213 x := v 214 if len(v) > 3 { 215 x = lemma(v) 216 } else { 217 x = strings.ToLower(removeAccents(v)) 218 } 219 if s.more { 220 for _, g := range trigram(x) { 221 s.freq[g]++ 222 s.total++ 223 } 224 } else { 225 if isCodeString(x) { 226 for _, x := range trigram(x) { 227 s.freq[x]++ 228 res[x] = Token{Name: x, Raw: x} 229 } 230 } else { 231 s.freq[x]++ 232 res[x] = Token{Name: x, Raw: v} //, Quoted: inQuote} 233 } 234 s.total++ 235 } 236 return 237 } 238 239 lastr := utf8.RuneError 240 runeCount := 0 241 for len(v) > 0 { 242 r, sz := utf8.DecodeRuneInString(v) 243 v = v[sz:] 244 245 if s.more { 246 s.tmpbuf.Reset() 247 s.tmpbuf.WriteRune(cv(r)) 248 n := s.tmpbuf.Len() 249 s.tmpbuf.WriteRune(r) 250 x := s.tmpbuf.String() 251 252 s.freq[x[:n]]++ 253 res[x[:n]] = Token{Name: x[:n], Raw: x[n:]} 254 } else { 255 if lastr != utf8.RuneError { 256 s.tmpbuf.Reset() 257 s.tmpbuf.WriteRune(cv(lastr)) 258 s.tmpbuf.WriteRune(cv(r)) 259 n := s.tmpbuf.Len() 260 s.tmpbuf.WriteRune(lastr) 261 s.tmpbuf.WriteRune(r) 262 x := s.tmpbuf.String() 263 264 s.freq[x[:n]]++ 265 res[x[:n]] = Token{Name: x[:n], Raw: x[n:]} //, Quoted: inQuote} 266 s.total++ 267 } 268 } 269 270 lastr = r 271 runeCount++ 272 } 273 } 274 275 func StrHash(s string) uint64 { 276 const offset64 = 14695981039346656037 277 const prime64 = 1099511628211 278 var hash uint64 = offset64 279 for i := 0; i < len(s); i++ { 280 hash *= prime64 281 hash ^= uint64(s[i]) 282 } 283 return uint64(hash) 284 }