github.com/go-enjin/golang-org-x-text@v0.12.1-enjin.2/internal/export/idna/gen.go (about) 1 // Copyright 2016 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 //go:build ignore 6 // +build ignore 7 8 // This program generates the trie for idna operations. The Unicode casing 9 // algorithm requires the lookup of various properties and mappings for each 10 // rune. The table generated by this generator combines several of the most 11 // frequently used of these into a single trie so that they can be accessed 12 // with a single lookup. 13 package main 14 15 import ( 16 "fmt" 17 "io" 18 "log" 19 "unicode" 20 "unicode/utf8" 21 22 "github.com/go-enjin/golang-org-x-text/internal/gen" 23 "github.com/go-enjin/golang-org-x-text/internal/triegen" 24 "github.com/go-enjin/golang-org-x-text/internal/ucd" 25 "github.com/go-enjin/golang-org-x-text/unicode/bidi" 26 ) 27 28 func main() { 29 gen.Init() 30 genTables() 31 gen.Repackage("gen_trieval.go", "trieval.go", "idna") 32 gen.Repackage("gen_common.go", "common_test.go", "idna") 33 } 34 35 var runes = map[rune]info{} 36 37 func genTables() { 38 t := triegen.NewTrie("idna") 39 40 ucd.Parse(gen.OpenUCDFile("DerivedNormalizationProps.txt"), func(p *ucd.Parser) { 41 r := p.Rune(0) 42 if p.String(1) == "NFC_QC" { // p.String(2) is "N" or "M" 43 runes[r] = mayNeedNorm 44 } 45 }) 46 ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) { 47 r := p.Rune(0) 48 49 const cccVirama = 9 50 if p.Int(ucd.CanonicalCombiningClass) == cccVirama { 51 runes[p.Rune(0)] = viramaModifier 52 } 53 switch { 54 case unicode.In(r, unicode.Mark): 55 runes[r] |= modifier | mayNeedNorm 56 } 57 // TODO: by using UnicodeData.txt we don't mark undefined codepoints 58 // that are earmarked as RTL properly. However, an undefined cp will 59 // always fail, so there is no need to store this info. 
60 switch p, _ := bidi.LookupRune(r); p.Class() { 61 case bidi.R, bidi.AL, bidi.AN: 62 if x := runes[r]; x != 0 && x != mayNeedNorm { 63 log.Fatalf("%U: rune both modifier and RTL letter/number", r) 64 } 65 runes[r] = rtl 66 } 67 }) 68 69 ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) { 70 switch v := p.String(1); v { 71 case "L", "D", "T", "R": 72 runes[p.Rune(0)] |= joinType[v] << joinShift 73 } 74 }) 75 76 ucd.Parse(gen.OpenUnicodeFile("idna", "", "IdnaMappingTable.txt"), func(p *ucd.Parser) { 77 r := p.Rune(0) 78 79 // The mappings table explicitly defines surrogates as invalid. 80 if !utf8.ValidRune(r) { 81 return 82 } 83 84 cat := catFromEntry(p) 85 isMapped := cat == mapped || cat == disallowedSTD3Mapped || cat == deviation 86 if !isMapped { 87 // Only include additional category information for non-mapped 88 // runes. The additional information is only used after mapping and 89 // the bits would clash with mapping information. 90 // TODO: it would be possible to inline this data and avoid 91 // additional lookups. This is quite tedious, though, so let's first 92 // see if we need this. 93 cat |= category(runes[r]) 94 } 95 96 s := string(p.Runes(2)) 97 if s != "" && !isMapped { 98 log.Fatalf("%U: Mapping with non-mapping category %d", r, cat) 99 } 100 t.Insert(r, uint64(makeEntry(r, s))+uint64(cat)) 101 }) 102 103 w := gen.NewCodeWriter() 104 defer w.WriteVersionedGoFile("tables.go", "idna") 105 106 gen.WriteUnicodeVersion(w) 107 108 w.WriteVar("mappings", string(mappings)) 109 w.WriteVar("mappingIndex", mappingIndex) 110 w.WriteVar("xorData", string(xorData)) 111 112 sz, err := t.Gen(w, triegen.Compact(&normCompacter{})) 113 if err != nil { 114 log.Fatal(err) 115 } 116 w.Size += sz 117 } 118 119 var ( 120 // mappings contains replacement strings for mapped runes. 121 mappings = []byte{} 122 123 // mappingIndex contains an offset in mappingBytes representing the start 124 // of a mapping. 
Then next entry in mappingIndex points past the end of the 125 // string. 126 mappingIndex = []uint16{0} 127 mapCache = map[string]int{} 128 129 // xorData is like mappings, except that it contains XOR data. 130 // We split these two tables so that we don't get an overflow. 131 xorData = []byte{} 132 xorCache = map[string]int{} 133 ) 134 135 // makeEntry creates a trie entry. 136 func makeEntry(r rune, mapped string) info { 137 orig := string(r) 138 139 if len(orig) != len(mapped) { 140 // Store the mapped value as is in the mappings table. 141 index := len(mappingIndex) - 1 142 if x, ok := mapCache[mapped]; ok { 143 index = x 144 } else { 145 mapCache[mapped] = index 146 mappings = append(mappings, mapped...) 147 mappingIndex = append(mappingIndex, uint16(len(mappings))) 148 } 149 return info(index) << indexShift 150 } 151 152 // Create per-byte XOR mask. 153 var b []byte 154 for i := 0; i < len(orig); i++ { 155 b = append(b, orig[i]^mapped[i]) 156 } 157 158 // Remove leading 0 bytes, but keep at least one byte. 159 for ; len(b) > 1 && b[0] == 0; b = b[1:] { 160 } 161 162 if len(b) == 1 { 163 return xorBit | inlineXOR | info(b[0])<<indexShift 164 } 165 mapped = string(b) 166 167 // Store the mapped value as is in the mappings table. 168 index := len(xorData) 169 if x, ok := xorCache[mapped]; ok { 170 index = x 171 } else { 172 xorCache[mapped] = index 173 xorData = append(xorData, byte(len(mapped))) 174 xorData = append(xorData, mapped...) 175 } 176 return xorBit | info(index)<<indexShift 177 } 178 179 // The following code implements a triegen.Compacter that was originally 180 // designed for normalization. The IDNA table has some similarities with the 181 // norm table. Using this compacter, together with the XOR pattern approach, 182 // reduces the table size by roughly 100K. It can probably be compressed further 183 // by also including elements of the compacter used by cases, but for now it is 184 // good enough. 

// maxSparseEntries is the largest number of sparse entries a block may need
// for normCompacter.Size to accept it.
const maxSparseEntries = 16

// normCompacter collects the blocks handed to Store and prints them as the
// idnaSparseOffset and idnaSparseValues tables.
type normCompacter struct {
	// sparseBlocks holds every block passed to Store, in call order.
	sparseBlocks [][]uint64
	// sparseOffset holds, per stored block, the value of sparseCount at the
	// time the block was stored.
	sparseOffset []uint16
	// sparseCount is the running total of entries over all stored blocks: one
	// header entry plus countSparseEntries per block.
	sparseCount int
}

// mostFrequentStride returns the most common non-negative difference between
// an element of a and its predecessor, ignoring positions whose predecessor is
// zero. Ties are resolved in favor of the smaller stride. It returns 0 if no
// position qualifies.
func mostFrequentStride(a []uint64) int {
	freq := map[int]int{}
	prev := 0
	for i := range a {
		cur := int(a[i])
		if delta := cur - prev; prev != 0 && delta >= 0 {
			freq[delta]++
		}
		prev = cur
	}

	// Selecting by (highest count, lowest stride) makes the result independent
	// of map iteration order.
	best, bestCount := 0, 0
	for delta, n := range freq {
		if n > bestCount || (n == bestCount && delta < best) {
			best, bestCount = delta, n
		}
	}
	return best
}

// countSparseEntries returns the number of non-zero elements of a whose
// difference to the preceding element (0 for the first) is not the most
// frequent stride of a.
func countSparseEntries(a []uint64) int {
	step := mostFrequentStride(a)
	total, prev := 0, 0
	for _, val := range a {
		cur := int(val)
		if val != 0 && cur-prev != step {
			total++
		}
		prev = cur
	}
	return total
}

// Size reports the size computed for block v, (n+1)*4 + 2 for n sparse
// entries, and whether v may be stored by this compacter, which requires n to
// be at most maxSparseEntries.
func (c *normCompacter) Size(v []uint64) (sz int, ok bool) {
	n := countSparseEntries(v)
	if n > maxSparseEntries {
		return 0, false
	}
	return (n+1)*4 + 2, true
}

// Store records block v and returns its handle, which is the number of blocks
// stored before it.
func (c *normCompacter) Store(v []uint64) uint32 {
	handle := uint32(len(c.sparseOffset))
	c.sparseOffset = append(c.sparseOffset, uint16(c.sparseCount))
	c.sparseBlocks = append(c.sparseBlocks, v)
	// One extra entry per block accounts for the header written by Print.
	c.sparseCount += 1 + countSparseEntries(v)
	return handle
}

// Handler returns the name of the lookup function for stored blocks.
func (c *normCompacter) Handler() string {
	return "idnaSparse.lookup"
}

// Print writes the idnaSparseOffset and idnaSparseValues declarations to w.
// Each block starts with a header entry holding its stride and entry count,
// followed by one {value, lo, hi} range per run. Writing continues after a
// failed write; the first write error encountered is returned.
func (c *normCompacter) Print(w io.Writer) (retErr error) {
	emit := func(format string, args ...interface{}) {
		_, err := fmt.Fprintf(w, format, args...)
		if err != nil && retErr == nil {
			retErr = err
		}
	}

	numBlocks := len(c.sparseBlocks)
	emit("// idnaSparseOffset: %d entries, %d bytes\n", numBlocks, numBlocks*2)
	emit("var idnaSparseOffset = %#v\n\n", c.sparseOffset)

	numValues := c.sparseCount
	emit("// idnaSparseValues: %d entries, %d bytes\n", numValues, numValues*4)
	emit("var idnaSparseValues = [%d]valueRange {", numValues)
	for blk, values := range c.sparseBlocks {
		emit("\n// Block %#x, offset %#x", blk, c.sparseOffset[blk])

		stride := mostFrequentStride(values)
		entries := countSparseEntries(values)
		emit("\n{value:%#04x,lo:%#02x},", stride, uint8(entries))

		prev := 0
		for pos, val := range values {
			cur := int(val)
			if cur-prev != stride {
				// A break in the stride closes the open range, if any, and
				// opens a new one for a non-zero value.
				if prev != 0 {
					emit(",hi:%#02x},", 0x80+pos-1)
				}
				if val != 0 {
					emit("\n{value:%#04x,lo:%#02x", val, 0x80+pos)
				}
			}
			prev = cur
		}
		if prev != 0 {
			emit(",hi:%#02x},", 0x80+len(values)-1)
		}
	}
	emit("\n}\n\n")
	return retErr
}