github.com/coyove/sdss@v0.0.0-20231129015646-c2ec58cca6a2/contrib/ngram/rune.go

github.com/coyove/sdss@v0.0.0-20231129015646-c2ec58cca6a2/contrib/ngram/rune.go (about)

     1  package ngram
     2  
     3  import (
     4  	"bytes"
     5  	_ "embed"
     6  	"strings"
     7  	"unicode"
     8  	"unicode/utf8"
     9  
    10  	"github.com/aaaton/golem/v4"
    11  	"github.com/aaaton/golem/v4/dicts/en"
    12  
    13  	"golang.org/x/text/runes"
    14  	"golang.org/x/text/transform"
    15  	"golang.org/x/text/unicode/norm"
    16  )
    17  
    18  //go:embed TSCharacters.txt
    19  var rawDictBuf []byte
    20  
    21  //go:embed emoji_v15.txt
    22  var rawEmojiBuf []byte
    23  
    24  var runeTable = map[rune]rune{}
    25  
    26  var emojiTree = map[rune][]string{}
    27  
    28  var englishLemma *golem.Lemmatizer
    29  
    30  func init() {
    31  	for {
    32  		idx := bytes.IndexByte(rawDictBuf, '\n')
    33  
    34  		var line []byte
    35  		if idx > 0 {
    36  			line = rawDictBuf[:idx]
    37  			rawDictBuf = rawDictBuf[idx+1:]
    38  		} else {
    39  			line = rawDictBuf
    40  		}
    41  
    42  		if len(line) == 0 {
    43  			break
    44  		}
    45  
    46  		sep := bytes.IndexByte(line, '\t')
    47  		a, _ := utf8.DecodeRune(line[:sep])
    48  		b, _ := utf8.DecodeRune(line[sep+1:])
    49  		runeTable[a] = b
    50  	}
    51  
    52  	englishLemma, _ = golem.New(en.New())
    53  
    54  	runeTable['\u0131'] = 'i'
    55  
    56  	for {
    57  		idx := bytes.IndexByte(rawEmojiBuf, '\n')
    58  
    59  		var line []byte
    60  		if idx > 0 {
    61  			line = rawEmojiBuf[:idx]
    62  			rawEmojiBuf = rawEmojiBuf[idx+1:]
    63  		} else {
    64  			line = rawEmojiBuf
    65  		}
    66  
    67  		if len(line) == 0 {
    68  			break
    69  		}
    70  
    71  		parts := bytes.Split(line, []byte{' '})
    72  		head := parseUnicode(parts[0])
    73  		var tail []rune
    74  		for _, p := range parts {
    75  			tail = append(tail, parseUnicode(p))
    76  		}
    77  		emojiTree[head] = append(emojiTree[head], string(tail))
    78  	}
    79  }
    80  
    81  func cv(in rune) rune {
    82  	s, ok := runeTable[in]
    83  	if ok {
    84  		return s
    85  	}
    86  	return in
    87  }
    88  
    89  func lemma(word string) string {
    90  	return strings.ToLower(englishLemma.Lemma(removeAccents(word)))
    91  }
    92  
    93  func removeAccents(s string) string {
    94  	var accent = transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
    95  	output, _, e := transform.String(accent, s)
    96  	if e != nil {
    97  		return s
    98  	}
    99  	return output
   100  }
   101  
   102  func Normalize(r rune) rune {
   103  	var accent = transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
   104  	var tmp [32]byte
   105  	n := utf8.EncodeRune(tmp[:], r)
   106  	output, _, e := transform.Append(accent, tmp[16:16], tmp[:n])
   107  	if e != nil {
   108  		return r
   109  	}
   110  	nr, _ := utf8.DecodeRune(output)
   111  	return cv(nr)
   112  }
   113  
   114  type set func(rune) bool
   115  
   116  func (a set) add(rt *unicode.RangeTable) set {
   117  	b := in(rt)
   118  	return func(r rune) bool { return a(r) || b(r) }
   119  }
   120  
   121  func (a set) sub(rt *unicode.RangeTable) set {
   122  	b := in(rt)
   123  	return func(r rune) bool { return a(r) && !b(r) }
   124  }
   125  
   126  func in(rt *unicode.RangeTable) set {
   127  	return func(r rune) bool { return unicode.Is(rt, r) }
   128  }
   129  
   130  var id_continue = set(unicode.IsLetter).
   131  	add(unicode.Nl).
   132  	add(unicode.Other_ID_Start).
   133  	sub(unicode.Pattern_Syntax).
   134  	sub(unicode.Pattern_White_Space).
   135  	add(unicode.Mn).
   136  	add(unicode.Mc).
   137  	add(unicode.Nd).
   138  	add(unicode.Pc).
   139  	add(unicode.Other_ID_Continue).
   140  	sub(unicode.Pattern_Syntax).
   141  	sub(unicode.Pattern_White_Space)
   142  
   143  // isContinue checks that the rune continues an identifier.
   144  func IsContinue(r rune) bool {
   145  	// id_continue(r) && NFKC(r) in "id_continue*"
   146  	if !id_continue(r) {
   147  		return false
   148  	}
   149  	for _, r := range norm.NFKC.String(string(r)) {
   150  		if !id_continue(r) {
   151  			return false
   152  		}
   153  	}
   154  	return true
   155  }
   156  
   157  func isCodeString(v string) bool {
   158  	// Hex string
   159  	for _, b := range v {
   160  		if ('0' <= b && b <= '9') || ('a' <= b && b <= 'f') || ('A' <= b && b <= 'F') {
   161  		} else {
   162  			goto BASE64
   163  		}
   164  	}
   165  	return true
   166  
   167  	// Base64 string
   168  BASE64:
   169  	ups := 0
   170  	for _, b := range v {
   171  		if 'A' <= b && b <= 'Z' {
   172  			ups++
   173  		}
   174  	}
   175  	if len(v) >= 6 && ups >= len(v)/3 {
   176  		// There are approximately equal-number of upper letters and lower letters
   177  		// in a base64 string
   178  		return true
   179  	}
   180  	return false
   181  }
   182  
   183  func trigram(v string) (res []string) {
   184  	orig := v
   185  	idx := [3]int{0, 0, 0}
   186  	x := 0
   187  	for i := 1; len(v) > 0; i++ {
   188  		r, sz := utf8.DecodeRuneInString(v)
   189  		if r == utf8.RuneError {
   190  			break
   191  		}
   192  		if i >= 3 {
   193  			res = append(res, orig[idx[(i-3)%3]:idx[(i-1)%3]+sz])
   194  		}
   195  		x += sz
   196  		idx[i%3] = x
   197  		v = v[sz:]
   198  	}
   199  	if len(res) == 0 {
   200  		res = append(res, orig)
   201  	}
   202  	return
   203  }
   204  
   205  func parseUnicode(v []byte) (r rune) {
   206  	for i := 2; i < len(v); i++ {
   207  		x := v[i]
   208  		if x >= 'A' {
   209  			x = x - 'A' + 10
   210  		} else {
   211  			x -= '0'
   212  		}
   213  		r = r<<4 + rune(x)
   214  	}
   215  	return
   216  }