github.com/lbryio/lbcd@v0.22.119/claimtrie/normalization/char_decomposer.go (about) 1 package normalization 2 3 import ( 4 "bufio" 5 _ "embed" 6 "strconv" 7 "strings" 8 "unicode/utf8" 9 ) 10 11 //go:embed NFC_v11.txt 12 var decompositions string // the data file that came from ICU 63.2 13 14 var nfdMap map[rune][]rune 15 var nfdOrder map[rune]int32 16 17 func init() { 18 nfdMap = map[rune][]rune{} 19 nfdOrder = map[rune]int32{} 20 scanner := bufio.NewScanner(strings.NewReader(decompositions)) 21 for scanner.Scan() { 22 line := scanner.Text() 23 if len(line) <= 0 || line[0] == '#' || line[0] == '*' { 24 continue 25 } 26 if strings.ContainsAny(line, ":") { 27 // it's a ordering def: 28 addOrdering(line) 29 continue 30 } 31 splits := strings.Split(line, "=") 32 if len(splits) <= 1 { 33 splits = strings.Split(line, ">") 34 if len(splits) <= 1 { 35 continue 36 } 37 } 38 key, err := strconv.ParseUint(splits[0], 16, len(splits[0])*4) 39 if err != nil { 40 panic(err) 41 } 42 splits = strings.Split(splits[1], " ") 43 values := make([]rune, 0, len(splits)) 44 for j := range splits { 45 value, err := strconv.ParseUint(splits[j], 16, len(splits[j])*4) 46 if err != nil { 47 panic(err) 48 } 49 existing := nfdMap[rune(value)] 50 if len(existing) > 0 { 51 values = append(values, existing...) 52 } else { 53 values = append(values, rune(value)) 54 } 55 } 56 nfdMap[rune(key)] = values 57 } 58 59 // run one more expansion pass to catch stragglers 60 for key, values := range nfdMap { 61 for i, value := range values { 62 other := nfdMap[value] 63 if len(other) > 0 { 64 newValues := make([]rune, len(values)+len(other)-1) 65 copy(newValues, values[:i]) 66 copy(newValues[i:i+len(other)], other) 67 copy(newValues[i+len(other):], values[i+1:]) 68 nfdMap[key] = newValues 69 } 70 } 71 } 72 73 // assert no more expansions are necessary: 74 for _, values := range nfdMap { 75 for _, value := range values { 76 other := nfdMap[value] 77 if len(other) > 0 { 78 panic("Failed in NFD expansion") 79 } 80 } 81 } 82 } 83 84 func addOrdering(line string) { 85 splits := strings.Split(line, ":") 86 ranges := strings.Split(splits[0], "..") 87 88 value, err := strconv.ParseUint(splits[1], 16, len(splits[1])*4) 89 if err != nil { 90 panic(err) 91 } 92 93 start, err := strconv.ParseUint(ranges[0], 16, len(ranges[0])*4) 94 if err != nil { 95 panic(err) 96 } 97 end := start 98 if len(ranges) > 1 { 99 end, err = strconv.ParseUint(ranges[1], 16, len(ranges[0])*4) 100 if err != nil { 101 panic(err) 102 } 103 } 104 for i := start; i <= end; i++ { 105 nfdOrder[rune(i)] = int32(value) 106 } 107 } 108 109 func decompose(name []byte) []byte { 110 // see https://unicode.org/reports/tr15/ section 1.3 111 runes := make([]rune, 0, len(name)) // we typically use ascii don't increase the length 112 for i := 0; i < len(name); { 113 r, w := utf8.DecodeRune(name[i:]) 114 if r == utf8.RuneError && w < 2 { 115 // HACK: their RuneError is actually a valid character if coming from a width of 2 or more 116 return name 117 } 118 replacements := nfdMap[r] 119 if len(replacements) > 0 { 120 runes = append(runes, replacements...) 121 } else { 122 hanguls := decomposeHangul(r) 123 if len(hanguls) > 0 { 124 runes = append(runes, hanguls...) 125 } else { 126 runes = append(runes, r) 127 } 128 } 129 i += w 130 } 131 repairOrdering(runes) 132 return []byte(string(runes)) 133 } 134 135 func decomposeHangul(s rune) []rune { 136 // see https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf 137 138 const SBase int32 = 0xAC00 139 const LBase int32 = 0x1100 140 const VBase int32 = 0x1161 141 const TBase int32 = 0x11A7 142 const LCount int32 = 19 143 const VCount int32 = 21 144 const TCount int32 = 28 145 const NCount = VCount * TCount // 588 146 const SCount = LCount * NCount // 11172 147 148 SIndex := s - SBase 149 if SIndex < 0 || SIndex >= SCount { 150 return nil 151 } 152 L := LBase + SIndex/NCount 153 V := VBase + (SIndex%NCount)/TCount 154 T := TBase + SIndex%TCount 155 result := []rune{L, V} 156 if T != TBase { 157 result = append(result, T) 158 } 159 return result 160 } 161 162 func repairOrdering(runes []rune) { 163 for i := 1; i < len(runes); i++ { 164 a := runes[i-1] 165 b := runes[i] 166 oa := nfdOrder[a] 167 ob := nfdOrder[b] 168 if oa > ob && ob > 0 { 169 runes[i-1], runes[i] = b, a 170 if i >= 2 { 171 i -= 2 172 } else { 173 i = 0 174 } 175 } 176 } 177 }