golang.org/x/text@v0.14.0/feature/plural/gen.go (about) 1 // Copyright 2016 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 //go:build ignore 6 7 package main 8 9 // This file generates data for the CLDR plural rules, as defined in 10 // https://unicode.org/reports/tr35/tr35-numbers.html#Language_Plural_Rules 11 // 12 // We assume a slightly simplified grammar: 13 // 14 // condition = and_condition ('or' and_condition)* samples 15 // and_condition = relation ('and' relation)* 16 // relation = expr ('=' | '!=') range_list 17 // expr = operand ('%' '10' '0'* )? 18 // operand = 'n' | 'i' | 'f' | 't' | 'v' | 'w' 19 // range_list = (range | value) (',' range_list)* 20 // range = value'..'value 21 // value = digit+ 22 // digit = 0|1|2|3|4|5|6|7|8|9 23 // 24 // samples = ('@integer' sampleList)? 25 // ('@decimal' sampleList)? 26 // sampleList = sampleRange (',' sampleRange)* (',' ('…'|'...'))? 27 // sampleRange = decimalValue ('~' decimalValue)? 28 // decimalValue = value ('.' value)? 29 // 30 // Symbol Value 31 // n absolute value of the source number (integer and decimals). 32 // i integer digits of n. 33 // v number of visible fraction digits in n, with trailing zeros. 34 // w number of visible fraction digits in n, without trailing zeros. 35 // f visible fractional digits in n, with trailing zeros. 36 // t visible fractional digits in n, without trailing zeros. 37 // 38 // The algorithm for which the data is generated is based on the following 39 // observations 40 // 41 // - the number of different sets of numbers which the plural rules use to 42 // test inclusion is limited, 43 // - most numbers that are tested on are < 100 44 // 45 // This allows us to define a bitmap for each number < 100 where a bit i 46 // indicates whether this number is included in some defined set i. 47 // The function matchPlural in plural.go defines how we can subsequently use 48 // this data to determine inclusion. 49 // 50 // There are a few languages for which this doesn't work. For one Italian and 51 // Azerbaijan, which both test against numbers > 100 for ordinals and Breton, 52 // which considers whether numbers are multiples of hundreds. The model here 53 // could be extended to handle Italian and Azerbaijan fairly easily (by 54 // considering the numbers 100, 200, 300, ..., 800, 900 in addition to the first 55 // 100), but for now it seems easier to just hard-code these cases. 56 57 import ( 58 "bufio" 59 "bytes" 60 "flag" 61 "fmt" 62 "log" 63 "strconv" 64 "strings" 65 66 "golang.org/x/text/internal/gen" 67 "golang.org/x/text/internal/language" 68 "golang.org/x/text/internal/language/compact" 69 "golang.org/x/text/unicode/cldr" 70 ) 71 72 var ( 73 test = flag.Bool("test", false, 74 "test existing tables; can be used to compare web data with package data.") 75 outputFile = flag.String("output", "tables.go", "output file") 76 outputTestFile = flag.String("testoutput", "data_test.go", "output file") 77 78 draft = flag.String("draft", 79 "contributed", 80 `Minimal draft requirements (approved, contributed, provisional, unconfirmed).`) 81 ) 82 83 func main() { 84 gen.Init() 85 86 const pkg = "plural" 87 88 gen.Repackage("gen_common.go", "common.go", pkg) 89 // Read the CLDR zip file. 90 r := gen.OpenCLDRCoreZip() 91 defer r.Close() 92 93 d := &cldr.Decoder{} 94 d.SetDirFilter("supplemental", "main") 95 d.SetSectionFilter("numbers", "plurals") 96 data, err := d.DecodeZip(r) 97 if err != nil { 98 log.Fatalf("DecodeZip: %v", err) 99 } 100 101 w := gen.NewCodeWriter() 102 defer w.WriteGoFile(*outputFile, pkg) 103 104 gen.WriteCLDRVersion(w) 105 106 genPlurals(w, data) 107 108 w = gen.NewCodeWriter() 109 defer w.WriteGoFile(*outputTestFile, pkg) 110 111 genPluralsTests(w, data) 112 } 113 114 type pluralTest struct { 115 locales string // space-separated list of locales for this test 116 form int // Use int instead of Form to simplify generation. 117 integer []string // Entries of the form \d+ or \d+~\d+ 118 decimal []string // Entries of the form \f+ or \f+ +~\f+, where f is \d+\.\d+ 119 } 120 121 func genPluralsTests(w *gen.CodeWriter, data *cldr.CLDR) { 122 w.WriteType(pluralTest{}) 123 124 for _, plurals := range data.Supplemental().Plurals { 125 if plurals.Type == "" { 126 // The empty type is reserved for plural ranges. 127 continue 128 } 129 tests := []pluralTest{} 130 131 for _, pRules := range plurals.PluralRules { 132 for _, rule := range pRules.PluralRule { 133 test := pluralTest{ 134 locales: pRules.Locales, 135 form: int(countMap[rule.Count]), 136 } 137 scan := bufio.NewScanner(strings.NewReader(rule.Data())) 138 scan.Split(splitTokens) 139 var p *[]string 140 for scan.Scan() { 141 switch t := scan.Text(); t { 142 case "@integer": 143 p = &test.integer 144 case "@decimal": 145 p = &test.decimal 146 case ",", "…": 147 default: 148 if p != nil { 149 *p = append(*p, t) 150 } 151 } 152 } 153 tests = append(tests, test) 154 } 155 } 156 w.WriteVar(plurals.Type+"Tests", tests) 157 } 158 } 159 160 func genPlurals(w *gen.CodeWriter, data *cldr.CLDR) { 161 for _, plurals := range data.Supplemental().Plurals { 162 if plurals.Type == "" { 163 continue 164 } 165 // Initialize setMap and inclusionMasks. They are already populated with 166 // a few entries to serve as an example and to assign nice numbers to 167 // common cases. 168 169 // setMap contains sets of numbers represented by boolean arrays where 170 // a true value for element i means that the number i is included. 171 setMap := map[[numN]bool]int{ 172 // The above init func adds an entry for including all numbers. 173 [numN]bool{1: true}: 1, // fix {1} to a nice value 174 [numN]bool{2: true}: 2, // fix {2} to a nice value 175 [numN]bool{0: true}: 3, // fix {0} to a nice value 176 } 177 178 // inclusionMasks contains bit masks for every number under numN to 179 // indicate in which set the number is included. Bit 1 << x will be set 180 // if it is included in set x. 181 inclusionMasks := [numN]uint64{ 182 // Note: these entries are not complete: more bits will be set along the way. 183 0: 1 << 3, 184 1: 1 << 1, 185 2: 1 << 2, 186 } 187 188 // Create set {0..99}. We will assign this set the identifier 0. 189 var all [numN]bool 190 for i := range all { 191 // Mark number i as being included in the set (which has identifier 0). 192 inclusionMasks[i] |= 1 << 0 193 // Mark number i as included in the set. 194 all[i] = true 195 } 196 // Register the identifier for the set. 197 setMap[all] = 0 198 199 rules := []pluralCheck{} 200 index := []byte{0} 201 langMap := map[compact.ID]byte{0: 0} 202 203 for _, pRules := range plurals.PluralRules { 204 // Parse the rules. 205 var conds []orCondition 206 for _, rule := range pRules.PluralRule { 207 form := countMap[rule.Count] 208 conds = parsePluralCondition(conds, rule.Data(), form) 209 } 210 // Encode the rules. 211 for _, c := range conds { 212 // If an or condition only has filters, we create an entry for 213 // this filter and the set that contains all values. 214 empty := true 215 for _, b := range c.used { 216 empty = empty && !b 217 } 218 if empty { 219 rules = append(rules, pluralCheck{ 220 cat: byte(opMod<<opShift) | byte(c.form), 221 setID: 0, // all values 222 }) 223 continue 224 } 225 // We have some entries with values. 226 for i, set := range c.set { 227 if !c.used[i] { 228 continue 229 } 230 index, ok := setMap[set] 231 if !ok { 232 index = len(setMap) 233 setMap[set] = index 234 for i := range inclusionMasks { 235 if set[i] { 236 inclusionMasks[i] |= 1 << uint64(index) 237 } 238 } 239 } 240 rules = append(rules, pluralCheck{ 241 cat: byte(i<<opShift | andNext), 242 setID: byte(index), 243 }) 244 } 245 // Now set the last entry to the plural form the rule matches. 246 rules[len(rules)-1].cat &^= formMask 247 rules[len(rules)-1].cat |= byte(c.form) 248 } 249 // Point the relevant locales to the created entries. 250 for _, loc := range strings.Split(pRules.Locales, " ") { 251 if strings.TrimSpace(loc) == "" { 252 continue 253 } 254 lang, ok := compact.FromTag(language.MustParse(loc)) 255 if !ok { 256 log.Printf("No compact index for locale %q", loc) 257 } 258 langMap[lang] = byte(len(index) - 1) 259 } 260 index = append(index, byte(len(rules))) 261 } 262 w.WriteVar(plurals.Type+"Rules", rules) 263 w.WriteVar(plurals.Type+"Index", index) 264 // Expand the values: first by using the parent relationship. 265 langToIndex := make([]byte, compact.NumCompactTags) 266 for i := range langToIndex { 267 for p := compact.ID(i); ; p = p.Parent() { 268 if x, ok := langMap[p]; ok { 269 langToIndex[i] = x 270 break 271 } 272 } 273 } 274 // Now expand by including entries with identical languages for which 275 // one isn't set. 276 for i, v := range langToIndex { 277 if v == 0 { 278 id, _ := compact.FromTag(language.Tag{ 279 LangID: compact.ID(i).Tag().LangID, 280 }) 281 if p := langToIndex[id]; p != 0 { 282 langToIndex[i] = p 283 } 284 } 285 } 286 w.WriteVar(plurals.Type+"LangToIndex", langToIndex) 287 // Need to convert array to slice because of golang.org/issue/7651. 288 // This will allow tables to be dropped when unused. This is especially 289 // relevant for the ordinal data, which I suspect won't be used as much. 290 w.WriteVar(plurals.Type+"InclusionMasks", inclusionMasks[:]) 291 292 if len(rules) > 0xFF { 293 log.Fatalf("Too many entries for rules: %#x", len(rules)) 294 } 295 if len(index) > 0xFF { 296 log.Fatalf("Too many entries for index: %#x", len(index)) 297 } 298 if len(setMap) > 64 { // maximum number of bits. 299 log.Fatalf("Too many entries for setMap: %d", len(setMap)) 300 } 301 w.WriteComment( 302 "Slots used for %s: %X of 0xFF rules; %X of 0xFF indexes; %d of 64 sets", 303 plurals.Type, len(rules), len(index), len(setMap)) 304 // Prevent comment from attaching to the next entry. 305 fmt.Fprint(w, "\n\n") 306 } 307 } 308 309 type orCondition struct { 310 original string // for debugging 311 312 form Form 313 used [32]bool 314 set [32][numN]bool 315 } 316 317 func (o *orCondition) add(op opID, mod int, v []int) (ok bool) { 318 ok = true 319 for _, x := range v { 320 if x >= maxMod { 321 ok = false 322 break 323 } 324 } 325 for i := 0; i < numN; i++ { 326 m := i 327 if mod != 0 { 328 m = i % mod 329 } 330 if !intIn(m, v) { 331 o.set[op][i] = false 332 } 333 } 334 if ok { 335 o.used[op] = true 336 } 337 return ok 338 } 339 340 func intIn(x int, a []int) bool { 341 for _, y := range a { 342 if x == y { 343 return true 344 } 345 } 346 return false 347 } 348 349 var operandIndex = map[string]opID{ 350 "i": opI, 351 "n": opN, 352 "f": opF, 353 "v": opV, 354 "w": opW, 355 } 356 357 // parsePluralCondition parses the condition of a single pluralRule and appends 358 // the resulting or conditions to conds. 359 // 360 // Example rules: 361 // 362 // // Category "one" in English: only allow 1 with no visible fraction 363 // i = 1 and v = 0 @integer 1 364 // 365 // // Category "few" in Czech: all numbers with visible fractions 366 // v != 0 @decimal ... 367 // 368 // // Category "zero" in Latvian: all multiples of 10 or the numbers 11-19 or 369 // // numbers with a fraction 11..19 and no trailing zeros. 370 // n % 10 = 0 or n % 100 = 11..19 or v = 2 and f % 100 = 11..19 @integer ... 371 // 372 // @integer and @decimal are followed by examples and are not relevant for the 373 // rule itself. The are used here to signal the termination of the rule. 374 func parsePluralCondition(conds []orCondition, s string, f Form) []orCondition { 375 scan := bufio.NewScanner(strings.NewReader(s)) 376 scan.Split(splitTokens) 377 for { 378 cond := orCondition{original: s, form: f} 379 // Set all numbers to be allowed for all number classes and restrict 380 // from here on. 381 for i := range cond.set { 382 for j := range cond.set[i] { 383 cond.set[i][j] = true 384 } 385 } 386 andLoop: 387 for { 388 var token string 389 scan.Scan() // Must exist. 390 switch class := scan.Text(); class { 391 case "t": 392 class = "w" // equal to w for t == 0 393 fallthrough 394 case "n", "i", "f", "v", "w": 395 op := scanToken(scan) 396 opCode := operandIndex[class] 397 mod := 0 398 if op == "%" { 399 opCode |= opMod 400 401 switch v := scanUint(scan); v { 402 case 10, 100: 403 mod = v 404 case 1000: 405 // A more general solution would be to allow checking 406 // against multiples of 100 and include entries for the 407 // numbers 100..900 in the inclusion masks. At the 408 // moment this would only help Azerbaijan and Italian. 409 410 // Italian doesn't use '%', so this must be Azerbaijan. 411 cond.used[opAzerbaijan00s] = true 412 return append(conds, cond) 413 414 case 1000000: 415 cond.used[opBretonM] = true 416 return append(conds, cond) 417 418 default: 419 log.Fatalf("Modulo value not supported %d", v) 420 } 421 op = scanToken(scan) 422 } 423 if op != "=" && op != "!=" { 424 log.Fatalf("Unexpected op %q", op) 425 } 426 if op == "!=" { 427 opCode |= opNotEqual 428 } 429 a := []int{} 430 v := scanUint(scan) 431 if class == "w" && v != 0 { 432 log.Fatalf("Must compare against zero for operand type %q", class) 433 } 434 token = scanToken(scan) 435 for { 436 switch token { 437 case "..": 438 end := scanUint(scan) 439 for ; v <= end; v++ { 440 a = append(a, v) 441 } 442 token = scanToken(scan) 443 default: // ",", "or", "and", "@..." 444 a = append(a, v) 445 } 446 if token != "," { 447 break 448 } 449 v = scanUint(scan) 450 token = scanToken(scan) 451 } 452 if !cond.add(opCode, mod, a) { 453 // Detected large numbers. As we ruled out Azerbaijan, this 454 // must be the many rule for Italian ordinals. 455 cond.set[opItalian800] = cond.set[opN] 456 cond.used[opItalian800] = true 457 } 458 459 case "@integer", "@decimal": // "other" entry: tests only. 460 return conds 461 default: 462 log.Fatalf("Unexpected operand class %q (%s)", class, s) 463 } 464 switch token { 465 case "or": 466 conds = append(conds, cond) 467 break andLoop 468 case "@integer", "@decimal": // examples 469 // There is always an example in practice, so we always terminate here. 470 if err := scan.Err(); err != nil { 471 log.Fatal(err) 472 } 473 return append(conds, cond) 474 case "and": 475 // keep accumulating 476 default: 477 log.Fatalf("Unexpected token %q", token) 478 } 479 } 480 } 481 } 482 483 func scanToken(scan *bufio.Scanner) string { 484 scan.Scan() 485 return scan.Text() 486 } 487 488 func scanUint(scan *bufio.Scanner) int { 489 scan.Scan() 490 val, err := strconv.ParseUint(scan.Text(), 10, 32) 491 if err != nil { 492 log.Fatal(err) 493 } 494 return int(val) 495 } 496 497 // splitTokens can be used with bufio.Scanner to tokenize CLDR plural rules. 498 func splitTokens(data []byte, atEOF bool) (advance int, token []byte, err error) { 499 condTokens := [][]byte{ 500 []byte(".."), 501 []byte(","), 502 []byte("!="), 503 []byte("="), 504 } 505 advance, token, err = bufio.ScanWords(data, atEOF) 506 for _, t := range condTokens { 507 if len(t) >= len(token) { 508 continue 509 } 510 switch p := bytes.Index(token, t); { 511 case p == -1: 512 case p == 0: 513 advance = len(t) 514 token = token[:len(t)] 515 return advance - len(token) + len(t), token[:len(t)], err 516 case p < advance: 517 // Don't split when "=" overlaps "!=". 518 if t[0] == '=' && token[p-1] == '!' { 519 continue 520 } 521 advance = p 522 token = token[:p] 523 } 524 } 525 return advance, token, err 526 }