github.com/go-xe2/third@v1.0.3/golang.org/x/text/internal/export/idna/gen.go (about) 1 // Copyright 2016 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // +build ignore 6 7 // This program generates the trie for idna operations. The Unicode casing 8 // algorithm requires the lookup of various properties and mappings for each 9 // rune. The table generated by this generator combines several of the most 10 // frequently used of these into a single trie so that they can be accessed 11 // with a single lookup. 12 package main 13 14 import ( 15 "fmt" 16 "io" 17 "log" 18 "unicode" 19 "unicode/utf8" 20 21 "github.com/go-xe2/third/golang.org/x/text/internal/gen" 22 "github.com/go-xe2/third/golang.org/x/text/internal/triegen" 23 "github.com/go-xe2/third/golang.org/x/text/internal/ucd" 24 "github.com/go-xe2/third/golang.org/x/text/unicode/bidi" 25 ) 26 27 func main() { 28 gen.Init() 29 genTables() 30 gen.Repackage("gen_trieval.go", "trieval.go", "idna") 31 gen.Repackage("gen_common.go", "common_test.go", "idna") 32 } 33 34 var runes = map[rune]info{} 35 36 func genTables() { 37 t := triegen.NewTrie("idna") 38 39 ucd.Parse(gen.OpenUCDFile("DerivedNormalizationProps.txt"), func(p *ucd.Parser) { 40 r := p.Rune(0) 41 if p.String(1) == "NFC_QC" { // p.String(2) is "N" or "M" 42 runes[r] = mayNeedNorm 43 } 44 }) 45 ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) { 46 r := p.Rune(0) 47 48 const cccVirama = 9 49 if p.Int(ucd.CanonicalCombiningClass) == cccVirama { 50 runes[p.Rune(0)] = viramaModifier 51 } 52 switch { 53 case unicode.In(r, unicode.Mark): 54 runes[r] |= modifier | mayNeedNorm 55 } 56 // TODO: by using UnicodeData.txt we don't mark undefined codepoints 57 // that are earmarked as RTL properly. However, an undefined cp will 58 // always fail, so there is no need to store this info. 
59 switch p, _ := bidi.LookupRune(r); p.Class() { 60 case bidi.R, bidi.AL, bidi.AN: 61 if x := runes[r]; x != 0 && x != mayNeedNorm { 62 log.Fatalf("%U: rune both modifier and RTL letter/number", r) 63 } 64 runes[r] = rtl 65 } 66 }) 67 68 ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) { 69 switch v := p.String(1); v { 70 case "L", "D", "T", "R": 71 runes[p.Rune(0)] |= joinType[v] << joinShift 72 } 73 }) 74 75 ucd.Parse(gen.OpenUnicodeFile("idna", "", "IdnaMappingTable.txt"), func(p *ucd.Parser) { 76 r := p.Rune(0) 77 78 // The mappings table explicitly defines surrogates as invalid. 79 if !utf8.ValidRune(r) { 80 return 81 } 82 83 cat := catFromEntry(p) 84 isMapped := cat == mapped || cat == disallowedSTD3Mapped || cat == deviation 85 if !isMapped { 86 // Only include additional category information for non-mapped 87 // runes. The additional information is only used after mapping and 88 // the bits would clash with mapping information. 89 // TODO: it would be possible to inline this data and avoid 90 // additional lookups. This is quite tedious, though, so let's first 91 // see if we need this. 92 cat |= category(runes[r]) 93 } 94 95 s := string(p.Runes(2)) 96 if s != "" && !isMapped { 97 log.Fatalf("%U: Mapping with non-mapping category %d", r, cat) 98 } 99 t.Insert(r, uint64(makeEntry(r, s))+uint64(cat)) 100 }) 101 102 w := gen.NewCodeWriter() 103 defer w.WriteGoFile("tables.go", "idna") 104 105 gen.WriteUnicodeVersion(w) 106 107 w.WriteVar("mappings", string(mappings)) 108 w.WriteVar("xorData", string(xorData)) 109 110 sz, err := t.Gen(w, triegen.Compact(&normCompacter{})) 111 if err != nil { 112 log.Fatal(err) 113 } 114 w.Size += sz 115 } 116 117 var ( 118 // mappings contains replacement strings for mapped runes, each prefixed 119 // with a byte containing the length of the following string. 120 mappings = []byte{} 121 mapCache = map[string]int{} 122 123 // xorData is like mappings, except that it contains XOR data. 
124 // We split these two tables so that we don't get an overflow. 125 xorData = []byte{} 126 xorCache = map[string]int{} 127 ) 128 129 // makeEntry creates a trie entry. 130 func makeEntry(r rune, mapped string) info { 131 orig := string(r) 132 133 if len(orig) != len(mapped) { 134 // Store the mapped value as is in the mappings table. 135 index := len(mappings) 136 if x, ok := mapCache[mapped]; ok { 137 index = x 138 } else { 139 mapCache[mapped] = index 140 mappings = append(mappings, byte(len(mapped))) 141 mappings = append(mappings, mapped...) 142 } 143 return info(index) << indexShift 144 } 145 146 // Create per-byte XOR mask. 147 var b []byte 148 for i := 0; i < len(orig); i++ { 149 b = append(b, orig[i]^mapped[i]) 150 } 151 152 // Remove leading 0 bytes, but keep at least one byte. 153 for ; len(b) > 1 && b[0] == 0; b = b[1:] { 154 } 155 156 if len(b) == 1 { 157 return xorBit | inlineXOR | info(b[0])<<indexShift 158 } 159 mapped = string(b) 160 161 // Store the mapped value as is in the mappings table. 162 index := len(xorData) 163 if x, ok := xorCache[mapped]; ok { 164 index = x 165 } else { 166 xorCache[mapped] = index 167 xorData = append(xorData, byte(len(mapped))) 168 xorData = append(xorData, mapped...) 169 } 170 return xorBit | info(index)<<indexShift 171 } 172 173 // The following code implements a triegen.Compacter that was originally 174 // designed for normalization. The IDNA table has some similarities with the 175 // norm table. Using this compacter, together with the XOR pattern approach, 176 // reduces the table size by roughly 100K. It can probably be compressed further 177 // by also including elements of the compacter used by cases, but for now it is 178 // good enough. 
// maxSparseEntries is the largest number of sparse entries a block may need
// and still be accepted by normCompacter.Size.
const maxSparseEntries = 16

// normCompacter is the triegen.Compacter used for the idna trie. It collects
// the blocks handed to Store and prints them as the idnaSparseOffset and
// idnaSparseValues tables.
type normCompacter struct {
	sparseBlocks [][]uint64 // blocks in the order they were stored
	sparseOffset []uint16   // index of each block's first entry in idnaSparseValues
	sparseCount  int        // total number of entries emitted for all blocks
}

// mostFrequentStride returns the most common non-negative difference between
// consecutive values of a, ignoring differences that follow a zero value.
// Ties are resolved in favor of the smaller stride, which keeps the result
// independent of map iteration order. It returns 0 if no difference qualifies.
func mostFrequentStride(a []uint64) int {
	counts := make(map[int]int)
	prev := 0
	for _, x := range a {
		cur := int(x)
		if stride := cur - prev; prev != 0 && stride >= 0 {
			counts[stride]++
		}
		prev = cur
	}

	var best, bestCount int
	for stride, n := range counts {
		if n > bestCount || (n == bestCount && stride < best) {
			best, bestCount = stride, n
		}
	}
	return best
}

// countSparseEntries returns the number of value ranges needed to represent
// a: one for every non-zero value whose difference from its predecessor is
// not the block's most frequent stride.
//
// NOTE(review): a non-zero value that directly follows a zero and happens to
// equal the stride is not counted (and Print opens no range for it); this
// presumably does not occur in the generated data, confirm before reuse.
func countSparseEntries(a []uint64) int {
	stride := mostFrequentStride(a)
	n, prev := 0, 0
	for _, x := range a {
		cur := int(x)
		if x != 0 && cur-prev != stride {
			n++
		}
		prev = cur
	}
	return n
}

// Size reports the number of bytes v occupies in sparse form: 4 bytes for each
// range plus one header entry, and 2 bytes for the offset. ok is false if v
// needs more than maxSparseEntries ranges.
func (c *normCompacter) Size(v []uint64) (sz int, ok bool) {
	n := countSparseEntries(v)
	if n > maxSparseEntries {
		return 0, false
	}
	return (n+1)*4 + 2, true
}

// Store records v for later printing and returns its handle, which is the
// index of the block in the order of storing.
func (c *normCompacter) Store(v []uint64) uint32 {
	handle := uint32(len(c.sparseOffset))
	c.sparseBlocks = append(c.sparseBlocks, v)
	c.sparseOffset = append(c.sparseOffset, uint16(c.sparseCount))
	// One header entry per block in addition to its ranges.
	c.sparseCount += countSparseEntries(v) + 1
	return handle
}

// Handler returns the name of the lookup function for compacted blocks.
func (c *normCompacter) Handler() string {
	return "idnaSparse.lookup"
}

// Print writes the Go source of idnaSparseOffset and idnaSparseValues to w.
// Every block starts with a header entry holding the block's stride and its
// number of ranges, followed by one {value, lo, hi} entry per range. Writing
// continues after a failed write; the first write error is returned.
func (c *normCompacter) Print(w io.Writer) (retErr error) {
	p := func(format string, args ...interface{}) {
		if _, err := fmt.Fprintf(w, format, args...); err != nil && retErr == nil {
			retErr = err
		}
	}

	numBlocks := len(c.sparseBlocks)
	p("// idnaSparseOffset: %d entries, %d bytes\n", numBlocks, numBlocks*2)
	p("var idnaSparseOffset = %#v\n\n", c.sparseOffset)

	numValues := c.sparseCount
	p("// idnaSparseValues: %d entries, %d bytes\n", numValues, numValues*4)
	p("var idnaSparseValues = [%d]valueRange {", numValues)
	for blk, vals := range c.sparseBlocks {
		p("\n// Block %#x, offset %#x", blk, c.sparseOffset[blk])
		stride := mostFrequentStride(vals)
		p("\n{value:%#04x,lo:%#02x},", stride, uint8(countSparseEntries(vals)))

		prev := 0
		for i, nv := range vals {
			if int(nv)-prev != stride {
				// The run is broken: close the open range, if any, and
				// open a new one for a non-zero value.
				if prev != 0 {
					p(",hi:%#02x},", 0x80+i-1)
				}
				if nv != 0 {
					p("\n{value:%#04x,lo:%#02x", nv, 0x80+i)
				}
			}
			prev = int(nv)
		}
		// A range that runs up to the end of the block is still open.
		if prev != 0 {
			p(",hi:%#02x},", 0x80+len(vals)-1)
		}
	}
	p("\n}\n\n")
	return retErr
}