github.com/coyove/sdss@v0.0.0-20231129015646-c2ec58cca6a2/contrib/ngram/rune.go (about) 1 package ngram 2 3 import ( 4 "bytes" 5 _ "embed" 6 "strings" 7 "unicode" 8 "unicode/utf8" 9 10 "github.com/aaaton/golem/v4" 11 "github.com/aaaton/golem/v4/dicts/en" 12 13 "golang.org/x/text/runes" 14 "golang.org/x/text/transform" 15 "golang.org/x/text/unicode/norm" 16 ) 17 18 //go:embed TSCharacters.txt 19 var rawDictBuf []byte 20 21 //go:embed emoji_v15.txt 22 var rawEmojiBuf []byte 23 24 var runeTable = map[rune]rune{} 25 26 var emojiTree = map[rune][]string{} 27 28 var englishLemma *golem.Lemmatizer 29 30 func init() { 31 for { 32 idx := bytes.IndexByte(rawDictBuf, '\n') 33 34 var line []byte 35 if idx > 0 { 36 line = rawDictBuf[:idx] 37 rawDictBuf = rawDictBuf[idx+1:] 38 } else { 39 line = rawDictBuf 40 } 41 42 if len(line) == 0 { 43 break 44 } 45 46 sep := bytes.IndexByte(line, '\t') 47 a, _ := utf8.DecodeRune(line[:sep]) 48 b, _ := utf8.DecodeRune(line[sep+1:]) 49 runeTable[a] = b 50 } 51 52 englishLemma, _ = golem.New(en.New()) 53 54 runeTable['\u0131'] = 'i' 55 56 for { 57 idx := bytes.IndexByte(rawEmojiBuf, '\n') 58 59 var line []byte 60 if idx > 0 { 61 line = rawEmojiBuf[:idx] 62 rawEmojiBuf = rawEmojiBuf[idx+1:] 63 } else { 64 line = rawEmojiBuf 65 } 66 67 if len(line) == 0 { 68 break 69 } 70 71 parts := bytes.Split(line, []byte{' '}) 72 head := parseUnicode(parts[0]) 73 var tail []rune 74 for _, p := range parts { 75 tail = append(tail, parseUnicode(p)) 76 } 77 emojiTree[head] = append(emojiTree[head], string(tail)) 78 } 79 } 80 81 func cv(in rune) rune { 82 s, ok := runeTable[in] 83 if ok { 84 return s 85 } 86 return in 87 } 88 89 func lemma(word string) string { 90 return strings.ToLower(englishLemma.Lemma(removeAccents(word))) 91 } 92 93 func removeAccents(s string) string { 94 var accent = transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC) 95 output, _, e := transform.String(accent, s) 96 if e != nil { 97 return s 98 } 99 return output 100 } 101 102 func Normalize(r rune) rune { 103 var accent = transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC) 104 var tmp [32]byte 105 n := utf8.EncodeRune(tmp[:], r) 106 output, _, e := transform.Append(accent, tmp[16:16], tmp[:n]) 107 if e != nil { 108 return r 109 } 110 nr, _ := utf8.DecodeRune(output) 111 return cv(nr) 112 } 113 114 type set func(rune) bool 115 116 func (a set) add(rt *unicode.RangeTable) set { 117 b := in(rt) 118 return func(r rune) bool { return a(r) || b(r) } 119 } 120 121 func (a set) sub(rt *unicode.RangeTable) set { 122 b := in(rt) 123 return func(r rune) bool { return a(r) && !b(r) } 124 } 125 126 func in(rt *unicode.RangeTable) set { 127 return func(r rune) bool { return unicode.Is(rt, r) } 128 } 129 130 var id_continue = set(unicode.IsLetter). 131 add(unicode.Nl). 132 add(unicode.Other_ID_Start). 133 sub(unicode.Pattern_Syntax). 134 sub(unicode.Pattern_White_Space). 135 add(unicode.Mn). 136 add(unicode.Mc). 137 add(unicode.Nd). 138 add(unicode.Pc). 139 add(unicode.Other_ID_Continue). 140 sub(unicode.Pattern_Syntax). 141 sub(unicode.Pattern_White_Space) 142 143 // isContinue checks that the rune continues an identifier. 144 func IsContinue(r rune) bool { 145 // id_continue(r) && NFKC(r) in "id_continue*" 146 if !id_continue(r) { 147 return false 148 } 149 for _, r := range norm.NFKC.String(string(r)) { 150 if !id_continue(r) { 151 return false 152 } 153 } 154 return true 155 } 156 157 func isCodeString(v string) bool { 158 // Hex string 159 for _, b := range v { 160 if ('0' <= b && b <= '9') || ('a' <= b && b <= 'f') || ('A' <= b && b <= 'F') { 161 } else { 162 goto BASE64 163 } 164 } 165 return true 166 167 // Base64 string 168 BASE64: 169 ups := 0 170 for _, b := range v { 171 if 'A' <= b && b <= 'Z' { 172 ups++ 173 } 174 } 175 if len(v) >= 6 && ups >= len(v)/3 { 176 // There are approximately equal-number of upper letters and lower letters 177 // in a base64 string 178 return true 179 } 180 return false 181 } 182 183 func trigram(v string) (res []string) { 184 orig := v 185 idx := [3]int{0, 0, 0} 186 x := 0 187 for i := 1; len(v) > 0; i++ { 188 r, sz := utf8.DecodeRuneInString(v) 189 if r == utf8.RuneError { 190 break 191 } 192 if i >= 3 { 193 res = append(res, orig[idx[(i-3)%3]:idx[(i-1)%3]+sz]) 194 } 195 x += sz 196 idx[i%3] = x 197 v = v[sz:] 198 } 199 if len(res) == 0 { 200 res = append(res, orig) 201 } 202 return 203 } 204 205 func parseUnicode(v []byte) (r rune) { 206 for i := 2; i < len(v); i++ { 207 x := v[i] 208 if x >= 'A' { 209 x = x - 'A' + 10 210 } else { 211 x -= '0' 212 } 213 r = r<<4 + rune(x) 214 } 215 return 216 }