github.com/lbryio/lbcd@v0.22.119/claimtrie/normalization/case_folder.go (about) 1 package normalization 2 3 import ( 4 "bytes" 5 _ "embed" 6 "regexp" 7 "strconv" 8 "strings" 9 "unicode/utf8" 10 ) 11 12 //go:embed CaseFolding_v11.txt 13 var v11 string 14 15 var foldMap map[rune][]rune 16 17 func init() { 18 foldMap = map[rune][]rune{} 19 r, _ := regexp.Compile(`([[:xdigit:]]+?); (.); ([[:xdigit:] ]+?);`) 20 matches := r.FindAllStringSubmatch(v11, 1000000000) 21 for i := range matches { 22 if matches[i][2] == "C" || matches[i][2] == "F" { 23 key, err := strconv.ParseUint(matches[i][1], 16, len(matches[i][1])*4) 24 if err != nil { 25 panic(err) 26 } 27 splits := strings.Split(matches[i][3], " ") 28 var values []rune 29 for j := range splits { 30 value, err := strconv.ParseUint(splits[j], 16, len(splits[j])*4) 31 if err != nil { 32 panic(err) 33 } 34 values = append(values, rune(value)) 35 } 36 foldMap[rune(key)] = values 37 } 38 } 39 } 40 41 func caseFold(name []byte) []byte { 42 var b bytes.Buffer 43 b.Grow(len(name)) 44 for i := 0; i < len(name); { 45 r, w := utf8.DecodeRune(name[i:]) 46 if r == utf8.RuneError && w < 2 { 47 // HACK: their RuneError is actually a valid character if coming from a width of 2 or more 48 return name 49 } 50 replacements := foldMap[r] 51 if len(replacements) > 0 { 52 for j := range replacements { 53 b.WriteRune(replacements[j]) 54 } 55 } else { 56 b.WriteRune(r) 57 } 58 i += w 59 } 60 return b.Bytes() 61 }