golang.org/x/text@v0.14.0/secure/precis/gen.go (about) 1 // Copyright 2015 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Unicode table generator. 6 // Data read from the web. 7 8 //go:build ignore 9 10 package main 11 12 import ( 13 "flag" 14 "log" 15 "unicode" 16 "unicode/utf8" 17 18 "golang.org/x/text/internal/gen" 19 "golang.org/x/text/internal/triegen" 20 "golang.org/x/text/internal/ucd" 21 "golang.org/x/text/unicode/norm" 22 "golang.org/x/text/unicode/rangetable" 23 ) 24 25 var outputFile = flag.String("output", "tables.go", "output file for generated tables; default tables.go") 26 27 var assigned, disallowedRunes *unicode.RangeTable 28 29 var runeCategory = map[rune]category{} 30 31 var overrides = map[category]category{ 32 viramaModifier: viramaJoinT, 33 greek: greekJoinT, 34 hebrew: hebrewJoinT, 35 } 36 37 func setCategory(r rune, cat category) { 38 if c, ok := runeCategory[r]; ok { 39 if override, ok := overrides[c]; cat == joiningT && ok { 40 cat = override 41 } else { 42 log.Fatalf("%U: multiple categories for rune (%v and %v)", r, c, cat) 43 } 44 } 45 runeCategory[r] = cat 46 } 47 48 func init() { 49 if numCategories > 1<<propShift { 50 log.Fatalf("Number of categories is %d; may at most be %d", numCategories, 1<<propShift) 51 } 52 } 53 54 func main() { 55 gen.Init() 56 57 // Load data 58 runes := []rune{} 59 // PrecisIgnorableProperties: https://tools.ietf.org/html/rfc7564#section-9.13 60 ucd.Parse(gen.OpenUCDFile("DerivedCoreProperties.txt"), func(p *ucd.Parser) { 61 if p.String(1) == "Default_Ignorable_Code_Point" { 62 runes = append(runes, p.Rune(0)) 63 } 64 }) 65 ucd.Parse(gen.OpenUCDFile("PropList.txt"), func(p *ucd.Parser) { 66 switch p.String(1) { 67 case "Noncharacter_Code_Point": 68 runes = append(runes, p.Rune(0)) 69 } 70 }) 71 // OldHangulJamo: https://tools.ietf.org/html/rfc5892#section-2.9 72 ucd.Parse(gen.OpenUCDFile("HangulSyllableType.txt"), func(p *ucd.Parser) { 73 switch p.String(1) { 74 case "L", "V", "T": 75 runes = append(runes, p.Rune(0)) 76 } 77 }) 78 79 disallowedRunes = rangetable.New(runes...) 80 assigned = rangetable.Assigned(unicode.Version) 81 82 // Load category data. 83 runeCategory['l'] = latinSmallL 84 ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) { 85 const cccVirama = 9 86 if p.Int(ucd.CanonicalCombiningClass) == cccVirama { 87 setCategory(p.Rune(0), viramaModifier) 88 } 89 }) 90 ucd.Parse(gen.OpenUCDFile("Scripts.txt"), func(p *ucd.Parser) { 91 switch p.String(1) { 92 case "Greek": 93 setCategory(p.Rune(0), greek) 94 case "Hebrew": 95 setCategory(p.Rune(0), hebrew) 96 case "Hiragana", "Katakana", "Han": 97 setCategory(p.Rune(0), japanese) 98 } 99 }) 100 101 // Set the rule categories associated with exceptions. This overrides any 102 // previously set categories. The original categories are manually 103 // reintroduced in the categoryTransitions table. 104 for r, e := range exceptions { 105 if e.cat != 0 { 106 runeCategory[r] = e.cat 107 } 108 } 109 cat := map[string]category{ 110 "L": joiningL, 111 "D": joiningD, 112 "T": joiningT, 113 114 "R": joiningR, 115 } 116 ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) { 117 switch v := p.String(1); v { 118 case "L", "D", "T", "R": 119 setCategory(p.Rune(0), cat[v]) 120 } 121 }) 122 123 writeTables() 124 gen.Repackage("gen_trieval.go", "trieval.go", "precis") 125 } 126 127 type exception struct { 128 prop property 129 cat category 130 } 131 132 func init() { 133 // Programmatically add the Arabic and Indic digits to the exceptions map. 134 // See comment in the exceptions map below why these are marked disallowed. 135 for i := rune(0); i <= 9; i++ { 136 exceptions[0x0660+i] = exception{ 137 prop: disallowed, 138 cat: arabicIndicDigit, 139 } 140 exceptions[0x06F0+i] = exception{ 141 prop: disallowed, 142 cat: extendedArabicIndicDigit, 143 } 144 } 145 } 146 147 // The Exceptions class as defined in RFC 5892 148 // https://tools.ietf.org/html/rfc5892#section-2.6 149 var exceptions = map[rune]exception{ 150 0x00DF: {prop: pValid}, 151 0x03C2: {prop: pValid}, 152 0x06FD: {prop: pValid}, 153 0x06FE: {prop: pValid}, 154 0x0F0B: {prop: pValid}, 155 0x3007: {prop: pValid}, 156 157 // ContextO|J rules are marked as disallowed, taking a "guilty until proven 158 // innocent" approach. The main reason for this is that the check for 159 // whether a context rule should be applied can be moved to the logic for 160 // handing disallowed runes, taken it off the common path. The exception to 161 // this rule is for katakanaMiddleDot, as the rule logic is handled without 162 // using a rule function. 163 164 // ContextJ (Join control) 165 0x200C: {prop: disallowed, cat: zeroWidthNonJoiner}, 166 0x200D: {prop: disallowed, cat: zeroWidthJoiner}, 167 168 // ContextO 169 0x00B7: {prop: disallowed, cat: middleDot}, 170 0x0375: {prop: disallowed, cat: greekLowerNumeralSign}, 171 0x05F3: {prop: disallowed, cat: hebrewPreceding}, // punctuation Geresh 172 0x05F4: {prop: disallowed, cat: hebrewPreceding}, // punctuation Gershayim 173 0x30FB: {prop: pValid, cat: katakanaMiddleDot}, 174 175 // These are officially ContextO, but the implementation does not require 176 // special treatment of these, so we simply mark them as valid. 177 0x0660: {prop: pValid}, 178 0x0661: {prop: pValid}, 179 0x0662: {prop: pValid}, 180 0x0663: {prop: pValid}, 181 0x0664: {prop: pValid}, 182 0x0665: {prop: pValid}, 183 0x0666: {prop: pValid}, 184 0x0667: {prop: pValid}, 185 0x0668: {prop: pValid}, 186 0x0669: {prop: pValid}, 187 0x06F0: {prop: pValid}, 188 0x06F1: {prop: pValid}, 189 0x06F2: {prop: pValid}, 190 0x06F3: {prop: pValid}, 191 0x06F4: {prop: pValid}, 192 0x06F5: {prop: pValid}, 193 0x06F6: {prop: pValid}, 194 0x06F7: {prop: pValid}, 195 0x06F8: {prop: pValid}, 196 0x06F9: {prop: pValid}, 197 198 0x0640: {prop: disallowed}, 199 0x07FA: {prop: disallowed}, 200 0x302E: {prop: disallowed}, 201 0x302F: {prop: disallowed}, 202 0x3031: {prop: disallowed}, 203 0x3032: {prop: disallowed}, 204 0x3033: {prop: disallowed}, 205 0x3034: {prop: disallowed}, 206 0x3035: {prop: disallowed}, 207 0x303B: {prop: disallowed}, 208 } 209 210 // LetterDigits: https://tools.ietf.org/html/rfc5892#section-2.1 211 // r in {Ll, Lu, Lo, Nd, Lm, Mn, Mc}. 212 func isLetterDigits(r rune) bool { 213 return unicode.In(r, 214 unicode.Ll, unicode.Lu, unicode.Lm, unicode.Lo, // Letters 215 unicode.Mn, unicode.Mc, // Modifiers 216 unicode.Nd, // Digits 217 ) 218 } 219 220 func isIdDisAndFreePVal(r rune) bool { 221 return unicode.In(r, 222 // OtherLetterDigits: https://tools.ietf.org/html/rfc7564#section-9.18 223 // r in {Lt, Nl, No, Me} 224 unicode.Lt, unicode.Nl, unicode.No, // Other letters / numbers 225 unicode.Me, // Modifiers 226 227 // Spaces: https://tools.ietf.org/html/rfc7564#section-9.14 228 // r in {Zs} 229 unicode.Zs, 230 231 // Symbols: https://tools.ietf.org/html/rfc7564#section-9.15 232 // r in {Sm, Sc, Sk, So} 233 unicode.Sm, unicode.Sc, unicode.Sk, unicode.So, 234 235 // Punctuation: https://tools.ietf.org/html/rfc7564#section-9.16 236 // r in {Pc, Pd, Ps, Pe, Pi, Pf, Po} 237 unicode.Pc, unicode.Pd, unicode.Ps, unicode.Pe, 238 unicode.Pi, unicode.Pf, unicode.Po, 239 ) 240 } 241 242 // HasCompat: https://tools.ietf.org/html/rfc7564#section-9.17 243 func hasCompat(r rune) bool { 244 return !norm.NFKC.IsNormalString(string(r)) 245 } 246 247 // From https://tools.ietf.org/html/rfc5892: 248 // 249 // If .cp. .in. Exceptions Then Exceptions(cp); 250 // Else If .cp. .in. BackwardCompatible Then BackwardCompatible(cp); 251 // Else If .cp. .in. Unassigned Then UNASSIGNED; 252 // Else If .cp. .in. ASCII7 Then PVALID; 253 // Else If .cp. .in. JoinControl Then CONTEXTJ; 254 // Else If .cp. .in. OldHangulJamo Then DISALLOWED; 255 // Else If .cp. .in. PrecisIgnorableProperties Then DISALLOWED; 256 // Else If .cp. .in. Controls Then DISALLOWED; 257 // Else If .cp. .in. HasCompat Then ID_DIS or FREE_PVAL; 258 // Else If .cp. .in. LetterDigits Then PVALID; 259 // Else If .cp. .in. OtherLetterDigits Then ID_DIS or FREE_PVAL; 260 // Else If .cp. .in. Spaces Then ID_DIS or FREE_PVAL; 261 // Else If .cp. .in. Symbols Then ID_DIS or FREE_PVAL; 262 // Else If .cp. .in. Punctuation Then ID_DIS or FREE_PVAL; 263 // Else DISALLOWED; 264 265 func writeTables() { 266 propTrie := triegen.NewTrie("derivedProperties") 267 w := gen.NewCodeWriter() 268 defer w.WriteVersionedGoFile(*outputFile, "precis") 269 gen.WriteUnicodeVersion(w) 270 271 // Iterate over all the runes... 272 for i := rune(0); i < unicode.MaxRune; i++ { 273 r := rune(i) 274 275 if !utf8.ValidRune(r) { 276 continue 277 } 278 279 e, ok := exceptions[i] 280 p := e.prop 281 switch { 282 case ok: 283 case !unicode.In(r, assigned): 284 p = unassigned 285 case r >= 0x0021 && r <= 0x007e: // Is ASCII 7 286 p = pValid 287 case unicode.In(r, disallowedRunes, unicode.Cc): 288 p = disallowed 289 case hasCompat(r): 290 p = idDisOrFreePVal 291 case isLetterDigits(r): 292 p = pValid 293 case isIdDisAndFreePVal(r): 294 p = idDisOrFreePVal 295 default: 296 p = disallowed 297 } 298 cat := runeCategory[r] 299 // Don't set category for runes that are disallowed. 300 if p == disallowed { 301 cat = exceptions[r].cat 302 } 303 propTrie.Insert(r, uint64(p)|uint64(cat)) 304 } 305 sz, err := propTrie.Gen(w) 306 if err != nil { 307 log.Fatal(err) 308 } 309 w.Size += sz 310 }