// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ignore

// Language tag table generator.
// Data read from the web.

package main

import (
	"bufio"
	"flag"
	"fmt"
	"io"
	"log"
	"math"
	"reflect"
	"regexp"
	"sort"
	"strconv"
	"strings"

	"golang.org/x/text/internal/gen"
	"golang.org/x/text/internal/tag"
	"golang.org/x/text/unicode/cldr"
)

// Command-line flags of the generator.
var (
	// test is the -test flag (default false); per its help text it selects
	// testing of the existing tables instead of generating new ones.
	test = flag.Bool("test",
		false,
		"test existing tables; can be used to compare web data with package data.")
	// outputFile is the -output flag: the file that receives the generated
	// tables (default "tables.go").
	outputFile = flag.String("output",
		"tables.go",
		"output file for generated tables")
)

// comment holds the documentation emitted in front of the generated
// declarations. Each entry starts with a newline followed by the name of the
// declaration it documents; init uses that first word (up to the first space,
// trimmed) as the key of commentIndex, and builder.comment looks entries up
// by that key.
//
// NOTE(review): the entries are raw strings, so every byte between the
// backquotes is emitted verbatim. Any leading indentation of the bullet and
// continuation lines inside these strings was not recoverable when this block
// was reviewed — confirm the in-string whitespace against the canonical file
// before regenerating tables.
var comment = []string{
	`
lang holds an alphabetically sorted list of ISO-639 language identifiers.
All entries are 4 bytes. The index of the identifier (divided by 4) is the language tag.
For 2-byte language identifiers, the two successive bytes have the following meaning:
- if the first letter of the 2- and 3-letter ISO codes are the same:
the second and third letter of the 3-letter ISO code.
- otherwise: a 0 and a by 2 bits right-shifted index into altLangISO3.
For 3-byte language identifiers the 4th byte is 0.`,
	`
langNoIndex is a bit vector of all 3-letter language codes that are not used as an index
in lookup tables. The language ids for these language codes are derived directly
from the letters and are not consecutive.`,
	`
altLangISO3 holds an alphabetically sorted list of 3-letter language code alternatives
to 2-letter language codes that cannot be derived using the method described above.
Each 3-letter code is followed by its 1-byte langID.`,
	`
altLangIndex is used to convert indexes in altLangISO3 to langIDs.`,
	`
AliasMap maps langIDs to their suggested replacements.`,
	`
script is an alphabetically sorted list of ISO 15924 codes. The index
of the script in the string, divided by 4, is the internal scriptID.`,
	`
isoRegionOffset needs to be added to the index of regionISO to obtain the regionID
for 2-letter ISO codes. (The first isoRegionOffset regionIDs are reserved for
the UN.M49 codes used for groups.)`,
	`
regionISO holds a list of alphabetically sorted 2-letter ISO region codes.
Each 2-letter codes is followed by two bytes with the following meaning:
- [A-Z}{2}: the first letter of the 2-letter code plus these two
letters form the 3-letter ISO code.
- 0, n: index into altRegionISO3.`,
	`
regionTypes defines the status of a region for various standards.`,
	`
m49 maps regionIDs to UN.M49 codes. The first isoRegionOffset entries are
codes indicating collections of regions.`,
	`
m49Index gives indexes into fromM49 based on the three most significant bits
of a 10-bit UN.M49 code. To search an UN.M49 code in fromM49, search in
fromM49[m49Index[msb39(code)]:m49Index[msb3(code)+1]]
for an entry where the first 7 bits match the 7 lsb of the UN.M49 code.
The region code is stored in the 9 lsb of the indexed value.`,
	`
fromM49 contains entries to map UN.M49 codes to regions. See m49Index for details.`,
	`
altRegionISO3 holds a list of 3-letter region codes that cannot be
mapped to 2-letter codes using the default algorithm. This is a short list.`,
	`
altRegionIDs holds a list of regionIDs the positions of which match those
of the 3-letter ISO codes in altRegionISO3.`,
	`
variantNumSpecialized is the number of specialized variants in variants.`,
	`
suppressScript is an index from langID to the dominant script for that language,
if it exists. If a script is given, it should be suppressed from the language tag.`,
	`
likelyLang is a lookup table, indexed by langID, for the most likely
scripts and regions given incomplete information. If more entries exist for a
given language, region and script are the index and size respectively
of the list in likelyLangList.`,
	`
likelyLangList holds lists info associated with likelyLang.`,
	`
likelyRegion is a lookup table, indexed by regionID, for the most likely
languages and scripts given incomplete information. If more entries exist
for a given regionID, lang and script are the index and size respectively
of the list in likelyRegionList.
TODO: exclude containers and user-definable regions from the list.`,
	`
likelyRegionList holds lists info associated with likelyRegion.`,
	`
likelyScript is a lookup table, indexed by scriptID, for the most likely
languages and regions given a script.`,
	`
nRegionGroups is the number of region groups.`,
	`
regionInclusion maps region identifiers to sets of regions in regionInclusionBits,
where each set holds all groupings that are directly connected in a region
containment graph.`,
	`
regionInclusionBits is an array of bit vectors where every vector represents
a set of region groupings. These sets are used to compute the distance
between two regions for the purpose of language matching.`,
	`
regionInclusionNext marks, for each entry in regionInclusionBits, the set of
all groups that are reachable from the groups set in the respective entry.`,
}

// TODO: consider changing some of these structures to tries. This can reduce
// memory, but may increase the need for memory allocations. This could be
// mitigated if we can piggyback on language tags for common cases.
// failOnError aborts the generator with log.Panic if e is non-nil.
func failOnError(e error) {
	if e != nil {
		log.Panic(e)
	}
}

// setType records how a stringSet is being used; see stringSet.setType.
type setType int

const (
	Indexed setType = 1 + iota // all elements must be of same size
	Linear
)

// stringSet is a set of strings backed by a slice. The slice is sorted and
// de-duplicated lazily (see compact) whenever the set is searched or sliced.
type stringSet struct {
	s              []string
	sorted, frozen bool

	// We often need to update values after the creation of an index is completed.
	// We include a convenience map for keeping track of this.
	update map[string]string
	typ    setType // used for checking.
}

// clone returns a copy of ss with its own element slice. The update map is
// copied by reference and therefore shared with ss if it is non-nil.
func (ss *stringSet) clone() stringSet {
	c := *ss
	c.s = append([]string(nil), c.s...)
	return c
}

// setType marks ss as being used as type t. It panics if ss was already
// marked with a different type.
func (ss *stringSet) setType(t setType) {
	if ss.typ != t && ss.typ != 0 {
		log.Panicf("type %d cannot be assigned as it was already %d", t, ss.typ)
	}
	// Remember the type; without this the check above can never fire.
	ss.typ = t
}

// parse parses a whitespace-separated string and initializes ss with its
// components.
func (ss *stringSet) parse(s string) {
	scan := bufio.NewScanner(strings.NewReader(s))
	scan.Split(bufio.ScanWords)
	for scan.Scan() {
		ss.add(scan.Text())
	}
}

// assertChangeable panics if ss has been frozen.
func (ss *stringSet) assertChangeable() {
	if ss.frozen {
		log.Panic("attempt to modify a frozen stringSet")
	}
}

// add appends s to the set. Duplicates are removed by the next compact.
func (ss *stringSet) add(s string) {
	ss.assertChangeable()
	ss.s = append(ss.s, s)
	ss.sorted = ss.frozen
}

// freeze sorts and de-duplicates ss and marks it as frozen, after which add
// and remove panic.
func (ss *stringSet) freeze() {
	ss.compact()
	ss.frozen = true
}

// compact sorts ss.s and removes duplicate entries in place. It is a no-op
// when the set is flagged as sorted. The flag is only set for frozen sets,
// so an unfrozen set is re-sorted on every call; several callers rely on this
// because they modify ss.s directly.
func (ss *stringSet) compact() {
	if ss.sorted {
		return
	}
	a := ss.s
	sort.Strings(a)
	// An empty set must stay empty: slicing a[:1] would panic on a nil slice
	// and would resurrect a stale element if the backing array has spare
	// capacity (e.g. after the last element was removed).
	if len(a) > 0 {
		k := 0
		for i := 1; i < len(a); i++ {
			if a[k] != a[i] {
				a[k+1] = a[i]
				k++
			}
		}
		a = a[:k+1]
	}
	ss.s = a
	ss.sorted = ss.frozen
}

// funcSorter sorts a sort.StringSlice using fn as the less function.
type funcSorter struct {
	fn func(a, b string) bool
	sort.StringSlice
}

// Less implements sort.Interface by delegating to fn.
func (s funcSorter) Less(i, j int) bool {
	return s.fn(s.StringSlice[i], s.StringSlice[j])
}

// sortFunc de-duplicates ss and then orders its elements according to f.
// Any later call to compact on an unfrozen set restores the default string
// order.
func (ss *stringSet) sortFunc(f func(a, b string) bool) {
	ss.compact()
	sort.Sort(funcSorter{f, sort.StringSlice(ss.s)})
}

// remove deletes s from the set if it is present. It panics if ss is frozen.
func (ss *stringSet) remove(s string) {
	ss.assertChangeable()
	if i, ok := ss.find(s); ok {
		copy(ss.s[i:], ss.s[i+1:])
		ss.s = ss.s[:len(ss.s)-1]
	}
}

// replace overwrites the element ol with nu. It panics if ol is not in the set.
func (ss *stringSet) replace(ol, nu string) {
	ss.s[ss.index(ol)] = nu
	ss.sorted = ss.frozen
}

// index returns the position of s in the sorted set and marks the set as
// Indexed. It panics if s is not an element.
func (ss *stringSet) index(s string) int {
	ss.setType(Indexed)
	i, ok := ss.find(s)
	if !ok {
		if i < len(ss.s) {
			log.Panicf("find: item %q is not in list. Closest match is %q.", s, ss.s[i])
		}
		log.Panicf("find: item %q is not in list", s)

	}
	return i
}

// find compacts the set and returns the position at which s is, or would be
// inserted, together with a flag reporting whether s is present.
func (ss *stringSet) find(s string) (int, bool) {
	ss.compact()
	i := sort.SearchStrings(ss.s, s)
	return i, i != len(ss.s) && ss.s[i] == s
}

// slice returns the sorted, de-duplicated elements. The returned slice is the
// set's own storage, not a copy.
func (ss *stringSet) slice() []string {
	ss.compact()
	return ss.s
}

// updateLater records key as the value associated with v in the update map,
// allocating the map on first use.
func (ss *stringSet) updateLater(v, key string) {
	if ss.update == nil {
		ss.update = map[string]string{}
	}
	ss.update[v] = key
}

// join joins the string and ensures that all entries are of the same length.
// It appends a sentinel entry consisting of 0xff bytes to ss.s before joining,
// so the set itself is modified by the call. It does not compact the set
// first. join panics if the set is empty or if the entries differ in length.
func (ss *stringSet) join() string {
	ss.setType(Indexed)
	if len(ss.s) == 0 {
		log.Panic("join: empty stringSet")
	}
	n := len(ss.s[0])
	for _, s := range ss.s {
		if len(s) != n {
			log.Panicf("join: not all entries are of the same length: %q", s)
		}
	}
	ss.s = append(ss.s, strings.Repeat("\xff", n))
	return strings.Join(ss.s, "")
}

// ianaEntry holds information for an entry in the IANA Language Subtag Repository.
// All types use the same entry.
// See http://tools.ietf.org/html/bcp47#section-5.1 for a description of the various
// fields.
288 type ianaEntry struct { 289 typ string 290 description []string 291 scope string 292 added string 293 preferred string 294 deprecated string 295 suppressScript string 296 macro string 297 prefix []string 298 } 299 300 type builder struct { 301 w *gen.CodeWriter 302 hw io.Writer // MultiWriter for w and w.Hash 303 data *cldr.CLDR 304 supp *cldr.SupplementalData 305 306 // indices 307 locale stringSet // common locales 308 lang stringSet // canonical language ids (2 or 3 letter ISO codes) with data 309 langNoIndex stringSet // 3-letter ISO codes with no associated data 310 script stringSet // 4-letter ISO codes 311 region stringSet // 2-letter ISO or 3-digit UN M49 codes 312 variant stringSet // 4-8-alphanumeric variant code. 313 314 // Region codes that are groups with their corresponding group IDs. 315 groups map[int]index 316 317 // langInfo 318 registry map[string]*ianaEntry 319 } 320 321 type index uint 322 323 func newBuilder(w *gen.CodeWriter) *builder { 324 r := gen.OpenCLDRCoreZip() 325 defer r.Close() 326 d := &cldr.Decoder{} 327 data, err := d.DecodeZip(r) 328 failOnError(err) 329 b := builder{ 330 w: w, 331 hw: io.MultiWriter(w, w.Hash), 332 data: data, 333 supp: data.Supplemental(), 334 } 335 b.parseRegistry() 336 return &b 337 } 338 339 func (b *builder) parseRegistry() { 340 r := gen.OpenIANAFile("assignments/language-subtag-registry") 341 defer r.Close() 342 b.registry = make(map[string]*ianaEntry) 343 344 scan := bufio.NewScanner(r) 345 scan.Split(bufio.ScanWords) 346 var record *ianaEntry 347 for more := scan.Scan(); more; { 348 key := scan.Text() 349 more = scan.Scan() 350 value := scan.Text() 351 switch key { 352 case "Type:": 353 record = &ianaEntry{typ: value} 354 case "Subtag:", "Tag:": 355 if s := strings.SplitN(value, "..", 2); len(s) > 1 { 356 for a := s[0]; a <= s[1]; a = inc(a) { 357 b.addToRegistry(a, record) 358 } 359 } else { 360 b.addToRegistry(value, record) 361 } 362 case "Suppress-Script:": 363 record.suppressScript = value 364 
case "Added:": 365 record.added = value 366 case "Deprecated:": 367 record.deprecated = value 368 case "Macrolanguage:": 369 record.macro = value 370 case "Preferred-Value:": 371 record.preferred = value 372 case "Prefix:": 373 record.prefix = append(record.prefix, value) 374 case "Scope:": 375 record.scope = value 376 case "Description:": 377 buf := []byte(value) 378 for more = scan.Scan(); more; more = scan.Scan() { 379 b := scan.Bytes() 380 if b[0] == '%' || b[len(b)-1] == ':' { 381 break 382 } 383 buf = append(buf, ' ') 384 buf = append(buf, b...) 385 } 386 record.description = append(record.description, string(buf)) 387 continue 388 default: 389 continue 390 } 391 more = scan.Scan() 392 } 393 if scan.Err() != nil { 394 log.Panic(scan.Err()) 395 } 396 } 397 398 func (b *builder) addToRegistry(key string, entry *ianaEntry) { 399 if info, ok := b.registry[key]; ok { 400 if info.typ != "language" || entry.typ != "extlang" { 401 log.Fatalf("parseRegistry: tag %q already exists", key) 402 } 403 } else { 404 b.registry[key] = entry 405 } 406 } 407 408 var commentIndex = make(map[string]string) 409 410 func init() { 411 for _, s := range comment { 412 key := strings.TrimSpace(strings.SplitN(s, " ", 2)[0]) 413 commentIndex[key] = s 414 } 415 } 416 417 func (b *builder) comment(name string) { 418 if s := commentIndex[name]; len(s) > 0 { 419 b.w.WriteComment(s) 420 } else { 421 fmt.Fprintln(b.w) 422 } 423 } 424 425 func (b *builder) pf(f string, x ...interface{}) { 426 fmt.Fprintf(b.hw, f, x...) 427 fmt.Fprint(b.hw, "\n") 428 } 429 430 func (b *builder) p(x ...interface{}) { 431 fmt.Fprintln(b.hw, x...) 432 } 433 434 func (b *builder) addSize(s int) { 435 b.w.Size += s 436 b.pf("// Size: %d bytes", s) 437 } 438 439 func (b *builder) writeConst(name string, x interface{}) { 440 b.comment(name) 441 b.w.WriteConst(name, x) 442 } 443 444 // writeConsts computes f(v) for all v in values and writes the results 445 // as constants named _v to a single constant block. 
446 func (b *builder) writeConsts(f func(string) int, values ...string) { 447 b.pf("const (") 448 for _, v := range values { 449 b.pf("\t_%s = %v", v, f(v)) 450 } 451 b.pf(")") 452 } 453 454 // writeType writes the type of the given value, which must be a struct. 455 func (b *builder) writeType(value interface{}) { 456 b.comment(reflect.TypeOf(value).Name()) 457 b.w.WriteType(value) 458 } 459 460 func (b *builder) writeSlice(name string, ss interface{}) { 461 b.writeSliceAddSize(name, 0, ss) 462 } 463 464 func (b *builder) writeSliceAddSize(name string, extraSize int, ss interface{}) { 465 b.comment(name) 466 b.w.Size += extraSize 467 v := reflect.ValueOf(ss) 468 t := v.Type().Elem() 469 b.pf("// Size: %d bytes, %d elements", v.Len()*int(t.Size())+extraSize, v.Len()) 470 471 fmt.Fprintf(b.w, "var %s = ", name) 472 b.w.WriteArray(ss) 473 b.p() 474 } 475 476 type FromTo struct { 477 From, To uint16 478 } 479 480 func (b *builder) writeSortedMap(name string, ss *stringSet, index func(s string) uint16) { 481 ss.sortFunc(func(a, b string) bool { 482 return index(a) < index(b) 483 }) 484 m := []FromTo{} 485 for _, s := range ss.s { 486 m = append(m, FromTo{index(s), index(ss.update[s])}) 487 } 488 b.writeSlice(name, m) 489 } 490 491 const base = 'z' - 'a' + 1 492 493 func strToInt(s string) uint { 494 v := uint(0) 495 for i := 0; i < len(s); i++ { 496 v *= base 497 v += uint(s[i] - 'a') 498 } 499 return v 500 } 501 502 // converts the given integer to the original ASCII string passed to strToInt. 503 // len(s) must match the number of characters obtained. 
504 func intToStr(v uint, s []byte) { 505 for i := len(s) - 1; i >= 0; i-- { 506 s[i] = byte(v%base) + 'a' 507 v /= base 508 } 509 } 510 511 func (b *builder) writeBitVector(name string, ss []string) { 512 vec := make([]uint8, int(math.Ceil(math.Pow(base, float64(len(ss[0])))/8))) 513 for _, s := range ss { 514 v := strToInt(s) 515 vec[v/8] |= 1 << (v % 8) 516 } 517 b.writeSlice(name, vec) 518 } 519 520 // TODO: convert this type into a list or two-stage trie. 521 func (b *builder) writeMapFunc(name string, m map[string]string, f func(string) uint16) { 522 b.comment(name) 523 v := reflect.ValueOf(m) 524 sz := v.Len() * (2 + int(v.Type().Key().Size())) 525 for _, k := range m { 526 sz += len(k) 527 } 528 b.addSize(sz) 529 keys := []string{} 530 b.pf(`var %s = map[string]uint16{`, name) 531 for k := range m { 532 keys = append(keys, k) 533 } 534 sort.Strings(keys) 535 for _, k := range keys { 536 b.pf("\t%q: %v,", k, f(m[k])) 537 } 538 b.p("}") 539 } 540 541 func (b *builder) writeMap(name string, m interface{}) { 542 b.comment(name) 543 v := reflect.ValueOf(m) 544 sz := v.Len() * (2 + int(v.Type().Key().Size()) + int(v.Type().Elem().Size())) 545 b.addSize(sz) 546 f := strings.FieldsFunc(fmt.Sprintf("%#v", m), func(r rune) bool { 547 return strings.IndexRune("{}, ", r) != -1 548 }) 549 sort.Strings(f[1:]) 550 b.pf(`var %s = %s{`, name, f[0]) 551 for _, kv := range f[1:] { 552 b.pf("\t%s,", kv) 553 } 554 b.p("}") 555 } 556 557 func (b *builder) langIndex(s string) uint16 { 558 if s == "und" { 559 return 0 560 } 561 if i, ok := b.lang.find(s); ok { 562 return uint16(i) 563 } 564 return uint16(strToInt(s)) + uint16(len(b.lang.s)) 565 } 566 567 // inc advances the string to its lexicographical successor. 
568 func inc(s string) string { 569 const maxTagLength = 4 570 var buf [maxTagLength]byte 571 intToStr(strToInt(strings.ToLower(s))+1, buf[:len(s)]) 572 for i := 0; i < len(s); i++ { 573 if s[i] <= 'Z' { 574 buf[i] -= 'a' - 'A' 575 } 576 } 577 return string(buf[:len(s)]) 578 } 579 580 func (b *builder) parseIndices() { 581 meta := b.supp.Metadata 582 583 for k, v := range b.registry { 584 var ss *stringSet 585 switch v.typ { 586 case "language": 587 if len(k) == 2 || v.suppressScript != "" || v.scope == "special" { 588 b.lang.add(k) 589 continue 590 } else { 591 ss = &b.langNoIndex 592 } 593 case "region": 594 ss = &b.region 595 case "script": 596 ss = &b.script 597 case "variant": 598 ss = &b.variant 599 default: 600 continue 601 } 602 ss.add(k) 603 } 604 // Include any language for which there is data. 605 for _, lang := range b.data.Locales() { 606 if x := b.data.RawLDML(lang); false || 607 x.LocaleDisplayNames != nil || 608 x.Characters != nil || 609 x.Delimiters != nil || 610 x.Measurement != nil || 611 x.Dates != nil || 612 x.Numbers != nil || 613 x.Units != nil || 614 x.ListPatterns != nil || 615 x.Collations != nil || 616 x.Segmentations != nil || 617 x.Rbnf != nil || 618 x.Annotations != nil || 619 x.Metadata != nil { 620 621 from := strings.Split(lang, "_") 622 if lang := from[0]; lang != "root" { 623 b.lang.add(lang) 624 } 625 } 626 } 627 // Include locales for plural rules, which uses a different structure. 628 for _, plurals := range b.data.Supplemental().Plurals { 629 for _, rules := range plurals.PluralRules { 630 for _, lang := range strings.Split(rules.Locales, " ") { 631 if lang = strings.Split(lang, "_")[0]; lang != "root" { 632 b.lang.add(lang) 633 } 634 } 635 } 636 } 637 // Include languages in likely subtags. 638 for _, m := range b.supp.LikelySubtags.LikelySubtag { 639 from := strings.Split(m.From, "_") 640 b.lang.add(from[0]) 641 } 642 // Include ISO-639 alpha-3 bibliographic entries. 
643 for _, a := range meta.Alias.LanguageAlias { 644 if a.Reason == "bibliographic" { 645 b.langNoIndex.add(a.Type) 646 } 647 } 648 // Include regions in territoryAlias (not all are in the IANA registry!) 649 for _, reg := range b.supp.Metadata.Alias.TerritoryAlias { 650 if len(reg.Type) == 2 { 651 b.region.add(reg.Type) 652 } 653 } 654 655 for _, s := range b.lang.s { 656 if len(s) == 3 { 657 b.langNoIndex.remove(s) 658 } 659 } 660 b.writeConst("NumLanguages", len(b.lang.slice())+len(b.langNoIndex.slice())) 661 b.writeConst("NumScripts", len(b.script.slice())) 662 b.writeConst("NumRegions", len(b.region.slice())) 663 664 // Add dummy codes at the start of each list to represent "unspecified". 665 b.lang.add("---") 666 b.script.add("----") 667 b.region.add("---") 668 669 // common locales 670 b.locale.parse(meta.DefaultContent.Locales) 671 } 672 673 // TODO: region inclusion data will probably not be use used in future matchers. 674 675 func (b *builder) computeRegionGroups() { 676 b.groups = make(map[int]index) 677 678 // Create group indices. 679 for i := 1; b.region.s[i][0] < 'A'; i++ { // Base M49 indices on regionID. 680 b.groups[i] = index(len(b.groups)) 681 } 682 for _, g := range b.supp.TerritoryContainment.Group { 683 // Skip UN and EURO zone as they are flattening the containment 684 // relationship. 
685 if g.Type == "EZ" || g.Type == "UN" { 686 continue 687 } 688 group := b.region.index(g.Type) 689 if _, ok := b.groups[group]; !ok { 690 b.groups[group] = index(len(b.groups)) 691 } 692 } 693 if len(b.groups) > 64 { 694 log.Fatalf("only 64 groups supported, found %d", len(b.groups)) 695 } 696 b.writeConst("nRegionGroups", len(b.groups)) 697 } 698 699 var langConsts = []string{ 700 "af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es", 701 "et", "fa", "fi", "fil", "fr", "gu", "he", "hi", "hr", "hu", "hy", "id", "is", 702 "it", "ja", "ka", "kk", "km", "kn", "ko", "ky", "lo", "lt", "lv", "mk", "ml", 703 "mn", "mo", "mr", "ms", "mul", "my", "nb", "ne", "nl", "no", "pa", "pl", "pt", 704 "ro", "ru", "sh", "si", "sk", "sl", "sq", "sr", "sv", "sw", "ta", "te", "th", 705 "tl", "tn", "tr", "uk", "ur", "uz", "vi", "zh", "zu", 706 707 // constants for grandfathered tags (if not already defined) 708 "jbo", "ami", "bnn", "hak", "tlh", "lb", "nv", "pwn", "tao", "tay", "tsu", 709 "nn", "sfb", "vgt", "sgg", "cmn", "nan", "hsn", 710 } 711 712 // writeLanguage generates all tables needed for language canonicalization. 713 func (b *builder) writeLanguage() { 714 meta := b.supp.Metadata 715 716 b.writeConst("nonCanonicalUnd", b.lang.index("und")) 717 b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...) 718 b.writeConst("langPrivateStart", b.langIndex("qaa")) 719 b.writeConst("langPrivateEnd", b.langIndex("qtz")) 720 721 // Get language codes that need to be mapped (overlong 3-letter codes, 722 // deprecated 2-letter codes, legacy and grandfathered tags.) 723 langAliasMap := stringSet{} 724 aliasTypeMap := map[string]AliasType{} 725 726 // altLangISO3 get the alternative ISO3 names that need to be mapped. 727 altLangISO3 := stringSet{} 728 // Add dummy start to avoid the use of index 0. 
729 altLangISO3.add("---") 730 altLangISO3.updateLater("---", "aa") 731 732 lang := b.lang.clone() 733 for _, a := range meta.Alias.LanguageAlias { 734 if a.Replacement == "" { 735 a.Replacement = "und" 736 } 737 // TODO: support mapping to tags 738 repl := strings.SplitN(a.Replacement, "_", 2)[0] 739 if a.Reason == "overlong" { 740 if len(a.Replacement) == 2 && len(a.Type) == 3 { 741 lang.updateLater(a.Replacement, a.Type) 742 } 743 } else if len(a.Type) <= 3 { 744 switch a.Reason { 745 case "macrolanguage": 746 aliasTypeMap[a.Type] = Macro 747 case "deprecated": 748 // handled elsewhere 749 continue 750 case "bibliographic", "legacy": 751 if a.Type == "no" { 752 continue 753 } 754 aliasTypeMap[a.Type] = Legacy 755 default: 756 log.Fatalf("new %s alias: %s", a.Reason, a.Type) 757 } 758 langAliasMap.add(a.Type) 759 langAliasMap.updateLater(a.Type, repl) 760 } 761 } 762 // Manually add the mapping of "nb" (Norwegian) to its macro language. 763 // This can be removed if CLDR adopts this change. 764 langAliasMap.add("nb") 765 langAliasMap.updateLater("nb", "no") 766 aliasTypeMap["nb"] = Macro 767 768 for k, v := range b.registry { 769 // Also add deprecated values for 3-letter ISO codes, which CLDR omits. 770 if v.typ == "language" && v.deprecated != "" && v.preferred != "" { 771 langAliasMap.add(k) 772 langAliasMap.updateLater(k, v.preferred) 773 aliasTypeMap[k] = Deprecated 774 } 775 } 776 // Fix CLDR mappings. 777 lang.updateLater("tl", "tgl") 778 lang.updateLater("sh", "hbs") 779 lang.updateLater("mo", "mol") 780 lang.updateLater("no", "nor") 781 lang.updateLater("tw", "twi") 782 lang.updateLater("nb", "nob") 783 lang.updateLater("ak", "aka") 784 lang.updateLater("bh", "bih") 785 786 // Ensure that each 2-letter code is matched with a 3-letter code. 
787 for _, v := range lang.s[1:] { 788 s, ok := lang.update[v] 789 if !ok { 790 if s, ok = lang.update[langAliasMap.update[v]]; !ok { 791 continue 792 } 793 lang.update[v] = s 794 } 795 if v[0] != s[0] { 796 altLangISO3.add(s) 797 altLangISO3.updateLater(s, v) 798 } 799 } 800 801 // Complete canonicalized language tags. 802 lang.freeze() 803 for i, v := range lang.s { 804 // We can avoid these manual entries by using the IANA registry directly. 805 // Seems easier to update the list manually, as changes are rare. 806 // The panic in this loop will trigger if we miss an entry. 807 add := "" 808 if s, ok := lang.update[v]; ok { 809 if s[0] == v[0] { 810 add = s[1:] 811 } else { 812 add = string([]byte{0, byte(altLangISO3.index(s))}) 813 } 814 } else if len(v) == 3 { 815 add = "\x00" 816 } else { 817 log.Panicf("no data for long form of %q", v) 818 } 819 lang.s[i] += add 820 } 821 b.writeConst("lang", tag.Index(lang.join())) 822 823 b.writeConst("langNoIndexOffset", len(b.lang.s)) 824 825 // space of all valid 3-letter language identifiers. 826 b.writeBitVector("langNoIndex", b.langNoIndex.slice()) 827 828 altLangIndex := []uint16{} 829 for i, s := range altLangISO3.slice() { 830 altLangISO3.s[i] += string([]byte{byte(len(altLangIndex))}) 831 if i > 0 { 832 idx := b.lang.index(altLangISO3.update[s]) 833 altLangIndex = append(altLangIndex, uint16(idx)) 834 } 835 } 836 b.writeConst("altLangISO3", tag.Index(altLangISO3.join())) 837 b.writeSlice("altLangIndex", altLangIndex) 838 839 b.writeSortedMap("AliasMap", &langAliasMap, b.langIndex) 840 types := make([]AliasType, len(langAliasMap.s)) 841 for i, s := range langAliasMap.s { 842 types[i] = aliasTypeMap[s] 843 } 844 b.writeSlice("AliasTypes", types) 845 } 846 847 var scriptConsts = []string{ 848 "Latn", "Hani", "Hans", "Hant", "Qaaa", "Qaai", "Qabx", "Zinh", "Zyyy", 849 "Zzzz", 850 } 851 852 func (b *builder) writeScript() { 853 b.writeConsts(b.script.index, scriptConsts...) 
854 b.writeConst("script", tag.Index(b.script.join())) 855 856 supp := make([]uint8, len(b.lang.slice())) 857 for i, v := range b.lang.slice()[1:] { 858 if sc := b.registry[v].suppressScript; sc != "" { 859 supp[i+1] = uint8(b.script.index(sc)) 860 } 861 } 862 b.writeSlice("suppressScript", supp) 863 864 // There is only one deprecated script in CLDR. This value is hard-coded. 865 // We check here if the code must be updated. 866 for _, a := range b.supp.Metadata.Alias.ScriptAlias { 867 if a.Type != "Qaai" { 868 log.Panicf("unexpected deprecated stript %q", a.Type) 869 } 870 } 871 } 872 873 func parseM49(s string) int16 { 874 if len(s) == 0 { 875 return 0 876 } 877 v, err := strconv.ParseUint(s, 10, 10) 878 failOnError(err) 879 return int16(v) 880 } 881 882 var regionConsts = []string{ 883 "001", "419", "BR", "CA", "ES", "GB", "MD", "PT", "UK", "US", 884 "ZZ", "XA", "XC", "XK", // Unofficial tag for Kosovo. 885 } 886 887 func (b *builder) writeRegion() { 888 b.writeConsts(b.region.index, regionConsts...) 889 890 isoOffset := b.region.index("AA") 891 m49map := make([]int16, len(b.region.slice())) 892 fromM49map := make(map[int16]int) 893 altRegionISO3 := "" 894 altRegionIDs := []uint16{} 895 896 b.writeConst("isoRegionOffset", isoOffset) 897 898 // 2-letter region lookup and mapping to numeric codes. 899 regionISO := b.region.clone() 900 regionISO.s = regionISO.s[isoOffset:] 901 regionISO.sorted = false 902 903 regionTypes := make([]byte, len(b.region.s)) 904 905 // Is the region valid BCP 47? 906 for s, e := range b.registry { 907 if len(s) == 2 && s == strings.ToUpper(s) { 908 i := b.region.index(s) 909 for _, d := range e.description { 910 if strings.Contains(d, "Private use") { 911 regionTypes[i] = iso3166UserAssigned 912 } 913 } 914 regionTypes[i] |= bcp47Region 915 } 916 } 917 918 // Is the region a valid ccTLD? 
919 r := gen.OpenIANAFile("domains/root/db") 920 defer r.Close() 921 922 buf, err := io.ReadAll(r) 923 failOnError(err) 924 re := regexp.MustCompile(`"/domains/root/db/([a-z]{2}).html"`) 925 for _, m := range re.FindAllSubmatch(buf, -1) { 926 i := b.region.index(strings.ToUpper(string(m[1]))) 927 regionTypes[i] |= ccTLD 928 } 929 930 b.writeSlice("regionTypes", regionTypes) 931 932 iso3Set := make(map[string]int) 933 update := func(iso2, iso3 string) { 934 i := regionISO.index(iso2) 935 if j, ok := iso3Set[iso3]; !ok && iso3[0] == iso2[0] { 936 regionISO.s[i] += iso3[1:] 937 iso3Set[iso3] = -1 938 } else { 939 if ok && j >= 0 { 940 regionISO.s[i] += string([]byte{0, byte(j)}) 941 } else { 942 iso3Set[iso3] = len(altRegionISO3) 943 regionISO.s[i] += string([]byte{0, byte(len(altRegionISO3))}) 944 altRegionISO3 += iso3 945 altRegionIDs = append(altRegionIDs, uint16(isoOffset+i)) 946 } 947 } 948 } 949 for _, tc := range b.supp.CodeMappings.TerritoryCodes { 950 i := regionISO.index(tc.Type) + isoOffset 951 if d := m49map[i]; d != 0 { 952 log.Panicf("%s found as a duplicate UN.M49 code of %03d", tc.Numeric, d) 953 } 954 m49 := parseM49(tc.Numeric) 955 m49map[i] = m49 956 if r := fromM49map[m49]; r == 0 { 957 fromM49map[m49] = i 958 } else if r != i { 959 dep := b.registry[regionISO.s[r-isoOffset]].deprecated 960 if t := b.registry[tc.Type]; t != nil && dep != "" && (t.deprecated == "" || t.deprecated > dep) { 961 fromM49map[m49] = i 962 } 963 } 964 } 965 for _, ta := range b.supp.Metadata.Alias.TerritoryAlias { 966 if len(ta.Type) == 3 && ta.Type[0] <= '9' && len(ta.Replacement) == 2 { 967 from := parseM49(ta.Type) 968 if r := fromM49map[from]; r == 0 { 969 fromM49map[from] = regionISO.index(ta.Replacement) + isoOffset 970 } 971 } 972 } 973 for _, tc := range b.supp.CodeMappings.TerritoryCodes { 974 if len(tc.Alpha3) == 3 { 975 update(tc.Type, tc.Alpha3) 976 } 977 } 978 // This entries are not included in territoryCodes. 
Mostly 3-letter variants 979 // of deleted codes and an entry for QU. 980 for _, m := range []struct{ iso2, iso3 string }{ 981 {"CT", "CTE"}, 982 {"DY", "DHY"}, 983 {"HV", "HVO"}, 984 {"JT", "JTN"}, 985 {"MI", "MID"}, 986 {"NH", "NHB"}, 987 {"NQ", "ATN"}, 988 {"PC", "PCI"}, 989 {"PU", "PUS"}, 990 {"PZ", "PCZ"}, 991 {"RH", "RHO"}, 992 {"VD", "VDR"}, 993 {"WK", "WAK"}, 994 // These three-letter codes are used for others as well. 995 {"FQ", "ATF"}, 996 } { 997 update(m.iso2, m.iso3) 998 } 999 for i, s := range regionISO.s { 1000 if len(s) != 4 { 1001 regionISO.s[i] = s + " " 1002 } 1003 } 1004 b.writeConst("regionISO", tag.Index(regionISO.join())) 1005 b.writeConst("altRegionISO3", altRegionISO3) 1006 b.writeSlice("altRegionIDs", altRegionIDs) 1007 1008 // Create list of deprecated regions. 1009 // TODO: consider inserting SF -> FI. Not included by CLDR, but is the only 1010 // Transitionally-reserved mapping not included. 1011 regionOldMap := stringSet{} 1012 // Include regions in territoryAlias (not all are in the IANA registry!) 1013 for _, reg := range b.supp.Metadata.Alias.TerritoryAlias { 1014 if len(reg.Type) == 2 && reg.Reason == "deprecated" && len(reg.Replacement) == 2 { 1015 regionOldMap.add(reg.Type) 1016 regionOldMap.updateLater(reg.Type, reg.Replacement) 1017 i, _ := regionISO.find(reg.Type) 1018 j, _ := regionISO.find(reg.Replacement) 1019 if k := m49map[i+isoOffset]; k == 0 { 1020 m49map[i+isoOffset] = m49map[j+isoOffset] 1021 } 1022 } 1023 } 1024 b.writeSortedMap("regionOldMap", ®ionOldMap, func(s string) uint16 { 1025 return uint16(b.region.index(s)) 1026 }) 1027 // 3-digit region lookup, groupings. 
1028 for i := 1; i < isoOffset; i++ { 1029 m := parseM49(b.region.s[i]) 1030 m49map[i] = m 1031 fromM49map[m] = i 1032 } 1033 b.writeSlice("m49", m49map) 1034 1035 const ( 1036 searchBits = 7 1037 regionBits = 9 1038 ) 1039 if len(m49map) >= 1<<regionBits { 1040 log.Fatalf("Maximum number of regions exceeded: %d > %d", len(m49map), 1<<regionBits) 1041 } 1042 m49Index := [9]int16{} 1043 fromM49 := []uint16{} 1044 m49 := []int{} 1045 for k, _ := range fromM49map { 1046 m49 = append(m49, int(k)) 1047 } 1048 sort.Ints(m49) 1049 for _, k := range m49[1:] { 1050 val := (k & (1<<searchBits - 1)) << regionBits 1051 fromM49 = append(fromM49, uint16(val|fromM49map[int16(k)])) 1052 m49Index[1:][k>>searchBits] = int16(len(fromM49)) 1053 } 1054 b.writeSlice("m49Index", m49Index) 1055 b.writeSlice("fromM49", fromM49) 1056 } 1057 1058 const ( 1059 // TODO: put these lists in regionTypes as user data? Could be used for 1060 // various optimizations and refinements and could be exposed in the API. 1061 iso3166Except = "AC CP DG EA EU FX IC SU TA UK" 1062 iso3166Trans = "AN BU CS NT TP YU ZR" // SF is not in our set of Regions. 1063 // DY and RH are actually not deleted, but indeterminately reserved. 1064 iso3166DelCLDR = "CT DD DY FQ HV JT MI NH NQ PC PU PZ RH VD WK YD" 1065 ) 1066 1067 const ( 1068 iso3166UserAssigned = 1 << iota 1069 ccTLD 1070 bcp47Region 1071 ) 1072 1073 func find(list []string, s string) int { 1074 for i, t := range list { 1075 if t == s { 1076 return i 1077 } 1078 } 1079 return -1 1080 } 1081 1082 // writeVariant generates per-variant information and creates a map from variant 1083 // name to index value. We assign index values such that sorting multiple 1084 // variants by index value will result in the correct order. 1085 // There are two types of variants: specialized and general. Specialized variants 1086 // are only applicable to certain language or language-script pairs. Generalized 1087 // variants apply to any language. 
Generalized variants always sort after 1088 // specialized variants. We will therefore always assign a higher index value 1089 // to a generalized variant than any other variant. Generalized variants are 1090 // sorted alphabetically among themselves. 1091 // Specialized variants may also sort after other specialized variants. Such 1092 // variants will be ordered after any of the variants they may follow. 1093 // We assume that if a variant x is followed by a variant y, then for any prefix 1094 // p of x, p-x is a prefix of y. This allows us to order tags based on the 1095 // maximum of the length of any of its prefixes. 1096 // TODO: it is possible to define a set of Prefix values on variants such that 1097 // a total order cannot be defined to the point that this algorithm breaks. 1098 // In other words, we cannot guarantee the same order of variants for the 1099 // future using the same algorithm or for non-compliant combinations of 1100 // variants. For this reason, consider using simple alphabetic sorting 1101 // of variants and ignore Prefix restrictions altogether. 1102 func (b *builder) writeVariant() { 1103 generalized := stringSet{} 1104 specialized := stringSet{} 1105 specializedExtend := stringSet{} 1106 // Collate the variants by type and check assumptions. 1107 for _, v := range b.variant.slice() { 1108 e := b.registry[v] 1109 if len(e.prefix) == 0 { 1110 generalized.add(v) 1111 continue 1112 } 1113 c := strings.Split(e.prefix[0], "-") 1114 hasScriptOrRegion := false 1115 if len(c) > 1 { 1116 _, hasScriptOrRegion = b.script.find(c[1]) 1117 if !hasScriptOrRegion { 1118 _, hasScriptOrRegion = b.region.find(c[1]) 1119 1120 } 1121 } 1122 if len(c) == 1 || len(c) == 2 && hasScriptOrRegion { 1123 // Variant is preceded by a language. 1124 specialized.add(v) 1125 continue 1126 } 1127 // Variant is preceded by another variant. 
1128 specializedExtend.add(v) 1129 prefix := c[0] + "-" 1130 if hasScriptOrRegion { 1131 prefix += c[1] 1132 } 1133 for _, p := range e.prefix { 1134 // Verify that the prefix minus the last element is a prefix of the 1135 // predecessor element. 1136 i := strings.LastIndex(p, "-") 1137 pred := b.registry[p[i+1:]] 1138 if find(pred.prefix, p[:i]) < 0 { 1139 log.Fatalf("prefix %q for variant %q not consistent with predecessor spec", p, v) 1140 } 1141 // The sorting used below does not work in the general case. It works 1142 // if we assume that variants that may be followed by others only have 1143 // prefixes of the same length. Verify this. 1144 count := strings.Count(p[:i], "-") 1145 for _, q := range pred.prefix { 1146 if c := strings.Count(q, "-"); c != count { 1147 log.Fatalf("variant %q preceding %q has a prefix %q of size %d; want %d", p[i+1:], v, q, c, count) 1148 } 1149 } 1150 if !strings.HasPrefix(p, prefix) { 1151 log.Fatalf("prefix %q of variant %q should start with %q", p, v, prefix) 1152 } 1153 } 1154 } 1155 1156 // Sort extended variants. 1157 a := specializedExtend.s 1158 less := func(v, w string) bool { 1159 // Sort by the maximum number of elements. 1160 maxCount := func(s string) (max int) { 1161 for _, p := range b.registry[s].prefix { 1162 if c := strings.Count(p, "-"); c > max { 1163 max = c 1164 } 1165 } 1166 return 1167 } 1168 if cv, cw := maxCount(v), maxCount(w); cv != cw { 1169 return cv < cw 1170 } 1171 // Sort by name as tie breaker. 1172 return v < w 1173 } 1174 sort.Sort(funcSorter{less, sort.StringSlice(a)}) 1175 specializedExtend.frozen = true 1176 1177 // Create index from variant name to index. 
1178 variantIndex := make(map[string]uint8) 1179 add := func(s []string) { 1180 for _, v := range s { 1181 variantIndex[v] = uint8(len(variantIndex)) 1182 } 1183 } 1184 add(specialized.slice()) 1185 add(specializedExtend.s) 1186 numSpecialized := len(variantIndex) 1187 add(generalized.slice()) 1188 if n := len(variantIndex); n > 255 { 1189 log.Fatalf("maximum number of variants exceeded: was %d; want <= 255", n) 1190 } 1191 b.writeMap("variantIndex", variantIndex) 1192 b.writeConst("variantNumSpecialized", numSpecialized) 1193 } 1194 1195 func (b *builder) writeLanguageInfo() { 1196 } 1197 1198 // writeLikelyData writes tables that are used both for finding parent relations and for 1199 // language matching. Each entry contains additional bits to indicate the status of the 1200 // data to know when it cannot be used for parent relations. 1201 func (b *builder) writeLikelyData() { 1202 const ( 1203 isList = 1 << iota 1204 scriptInFrom 1205 regionInFrom 1206 ) 1207 type ( // generated types 1208 likelyScriptRegion struct { 1209 region uint16 1210 script uint16 1211 flags uint8 1212 } 1213 likelyLangScript struct { 1214 lang uint16 1215 script uint16 1216 flags uint8 1217 } 1218 likelyLangRegion struct { 1219 lang uint16 1220 region uint16 1221 } 1222 // likelyTag is used for getting likely tags for group regions, where 1223 // the likely region might be a region contained in the group. 
1224 likelyTag struct { 1225 lang uint16 1226 region uint16 1227 script uint16 1228 } 1229 ) 1230 var ( // generated variables 1231 likelyRegionGroup = make([]likelyTag, len(b.groups)) 1232 likelyLang = make([]likelyScriptRegion, len(b.lang.s)) 1233 likelyRegion = make([]likelyLangScript, len(b.region.s)) 1234 likelyScript = make([]likelyLangRegion, len(b.script.s)) 1235 likelyLangList = []likelyScriptRegion{} 1236 likelyRegionList = []likelyLangScript{} 1237 ) 1238 type fromTo struct { 1239 from, to []string 1240 } 1241 langToOther := map[int][]fromTo{} 1242 regionToOther := map[int][]fromTo{} 1243 for _, m := range b.supp.LikelySubtags.LikelySubtag { 1244 from := strings.Split(m.From, "_") 1245 to := strings.Split(m.To, "_") 1246 if len(to) != 3 { 1247 log.Fatalf("invalid number of subtags in %q: found %d, want 3", m.To, len(to)) 1248 } 1249 if len(from) > 3 { 1250 log.Fatalf("invalid number of subtags: found %d, want 1-3", len(from)) 1251 } 1252 if from[0] != to[0] && from[0] != "und" { 1253 log.Fatalf("unexpected language change in expansion: %s -> %s", from, to) 1254 } 1255 if len(from) == 3 { 1256 if from[2] != to[2] { 1257 log.Fatalf("unexpected region change in expansion: %s -> %s", from, to) 1258 } 1259 if from[0] != "und" { 1260 log.Fatalf("unexpected fully specified from tag: %s -> %s", from, to) 1261 } 1262 } 1263 if len(from) == 1 || from[0] != "und" { 1264 id := 0 1265 if from[0] != "und" { 1266 id = b.lang.index(from[0]) 1267 } 1268 langToOther[id] = append(langToOther[id], fromTo{from, to}) 1269 } else if len(from) == 2 && len(from[1]) == 4 { 1270 sid := b.script.index(from[1]) 1271 likelyScript[sid].lang = uint16(b.langIndex(to[0])) 1272 likelyScript[sid].region = uint16(b.region.index(to[2])) 1273 } else { 1274 r := b.region.index(from[len(from)-1]) 1275 if id, ok := b.groups[r]; ok { 1276 if from[0] != "und" { 1277 log.Fatalf("region changed unexpectedly: %s -> %s", from, to) 1278 } 1279 likelyRegionGroup[id].lang = uint16(b.langIndex(to[0])) 
1280 likelyRegionGroup[id].script = uint16(b.script.index(to[1])) 1281 likelyRegionGroup[id].region = uint16(b.region.index(to[2])) 1282 } else { 1283 regionToOther[r] = append(regionToOther[r], fromTo{from, to}) 1284 } 1285 } 1286 } 1287 b.writeType(likelyLangRegion{}) 1288 b.writeSlice("likelyScript", likelyScript) 1289 1290 for id := range b.lang.s { 1291 list := langToOther[id] 1292 if len(list) == 1 { 1293 likelyLang[id].region = uint16(b.region.index(list[0].to[2])) 1294 likelyLang[id].script = uint16(b.script.index(list[0].to[1])) 1295 } else if len(list) > 1 { 1296 likelyLang[id].flags = isList 1297 likelyLang[id].region = uint16(len(likelyLangList)) 1298 likelyLang[id].script = uint16(len(list)) 1299 for _, x := range list { 1300 flags := uint8(0) 1301 if len(x.from) > 1 { 1302 if x.from[1] == x.to[2] { 1303 flags = regionInFrom 1304 } else { 1305 flags = scriptInFrom 1306 } 1307 } 1308 likelyLangList = append(likelyLangList, likelyScriptRegion{ 1309 region: uint16(b.region.index(x.to[2])), 1310 script: uint16(b.script.index(x.to[1])), 1311 flags: flags, 1312 }) 1313 } 1314 } 1315 } 1316 // TODO: merge suppressScript data with this table. 
1317 b.writeType(likelyScriptRegion{}) 1318 b.writeSlice("likelyLang", likelyLang) 1319 b.writeSlice("likelyLangList", likelyLangList) 1320 1321 for id := range b.region.s { 1322 list := regionToOther[id] 1323 if len(list) == 1 { 1324 likelyRegion[id].lang = uint16(b.langIndex(list[0].to[0])) 1325 likelyRegion[id].script = uint16(b.script.index(list[0].to[1])) 1326 if len(list[0].from) > 2 { 1327 likelyRegion[id].flags = scriptInFrom 1328 } 1329 } else if len(list) > 1 { 1330 likelyRegion[id].flags = isList 1331 likelyRegion[id].lang = uint16(len(likelyRegionList)) 1332 likelyRegion[id].script = uint16(len(list)) 1333 for i, x := range list { 1334 if len(x.from) == 2 && i != 0 || i > 0 && len(x.from) != 3 { 1335 log.Fatalf("unspecified script must be first in list: %v at %d", x.from, i) 1336 } 1337 x := likelyLangScript{ 1338 lang: uint16(b.langIndex(x.to[0])), 1339 script: uint16(b.script.index(x.to[1])), 1340 } 1341 if len(list[0].from) > 2 { 1342 x.flags = scriptInFrom 1343 } 1344 likelyRegionList = append(likelyRegionList, x) 1345 } 1346 } 1347 } 1348 b.writeType(likelyLangScript{}) 1349 b.writeSlice("likelyRegion", likelyRegion) 1350 b.writeSlice("likelyRegionList", likelyRegionList) 1351 1352 b.writeType(likelyTag{}) 1353 b.writeSlice("likelyRegionGroup", likelyRegionGroup) 1354 } 1355 1356 func (b *builder) writeRegionInclusionData() { 1357 var ( 1358 // mm holds for each group the set of groups with a distance of 1. 1359 mm = make(map[int][]index) 1360 1361 // containment holds for each group the transitive closure of 1362 // containment of other groups. 1363 containment = make(map[index][]index) 1364 ) 1365 for _, g := range b.supp.TerritoryContainment.Group { 1366 // Skip UN and EURO zone as they are flattening the containment 1367 // relationship. 
1368 if g.Type == "EZ" || g.Type == "UN" { 1369 continue 1370 } 1371 group := b.region.index(g.Type) 1372 groupIdx := b.groups[group] 1373 for _, mem := range strings.Split(g.Contains, " ") { 1374 r := b.region.index(mem) 1375 mm[r] = append(mm[r], groupIdx) 1376 if g, ok := b.groups[r]; ok { 1377 mm[group] = append(mm[group], g) 1378 containment[groupIdx] = append(containment[groupIdx], g) 1379 } 1380 } 1381 } 1382 1383 regionContainment := make([]uint64, len(b.groups)) 1384 for _, g := range b.groups { 1385 l := containment[g] 1386 1387 // Compute the transitive closure of containment. 1388 for i := 0; i < len(l); i++ { 1389 l = append(l, containment[l[i]]...) 1390 } 1391 1392 // Compute the bitmask. 1393 regionContainment[g] = 1 << g 1394 for _, v := range l { 1395 regionContainment[g] |= 1 << v 1396 } 1397 } 1398 b.writeSlice("regionContainment", regionContainment) 1399 1400 regionInclusion := make([]uint8, len(b.region.s)) 1401 bvs := make(map[uint64]index) 1402 // Make the first bitvector positions correspond with the groups. 1403 for r, i := range b.groups { 1404 bv := uint64(1 << i) 1405 for _, g := range mm[r] { 1406 bv |= 1 << g 1407 } 1408 bvs[bv] = i 1409 regionInclusion[r] = uint8(bvs[bv]) 1410 } 1411 for r := 1; r < len(b.region.s); r++ { 1412 if _, ok := b.groups[r]; !ok { 1413 bv := uint64(0) 1414 for _, g := range mm[r] { 1415 bv |= 1 << g 1416 } 1417 if bv == 0 { 1418 // Pick the world for unspecified regions. 1419 bv = 1 << b.groups[b.region.index("001")] 1420 } 1421 if _, ok := bvs[bv]; !ok { 1422 bvs[bv] = index(len(bvs)) 1423 } 1424 regionInclusion[r] = uint8(bvs[bv]) 1425 } 1426 } 1427 b.writeSlice("regionInclusion", regionInclusion) 1428 regionInclusionBits := make([]uint64, len(bvs)) 1429 for k, v := range bvs { 1430 regionInclusionBits[v] = uint64(k) 1431 } 1432 // Add bit vectors for increasingly large distances until a fixed point is reached. 
1433 regionInclusionNext := []uint8{} 1434 for i := 0; i < len(regionInclusionBits); i++ { 1435 bits := regionInclusionBits[i] 1436 next := bits 1437 for i := uint(0); i < uint(len(b.groups)); i++ { 1438 if bits&(1<<i) != 0 { 1439 next |= regionInclusionBits[i] 1440 } 1441 } 1442 if _, ok := bvs[next]; !ok { 1443 bvs[next] = index(len(bvs)) 1444 regionInclusionBits = append(regionInclusionBits, next) 1445 } 1446 regionInclusionNext = append(regionInclusionNext, uint8(bvs[next])) 1447 } 1448 b.writeSlice("regionInclusionBits", regionInclusionBits) 1449 b.writeSlice("regionInclusionNext", regionInclusionNext) 1450 } 1451 1452 type parentRel struct { 1453 lang uint16 1454 script uint16 1455 maxScript uint16 1456 toRegion uint16 1457 fromRegion []uint16 1458 } 1459 1460 func (b *builder) writeParents() { 1461 b.writeType(parentRel{}) 1462 1463 parents := []parentRel{} 1464 1465 // Construct parent overrides. 1466 n := 0 1467 for _, p := range b.data.Supplemental().ParentLocales.ParentLocale { 1468 // Skipping non-standard scripts to root is implemented using addTags. 1469 if p.Parent == "root" { 1470 continue 1471 } 1472 1473 sub := strings.Split(p.Parent, "_") 1474 parent := parentRel{lang: b.langIndex(sub[0])} 1475 if len(sub) == 2 { 1476 // TODO: check that all undefined scripts are indeed Latn in these 1477 // cases. 
1478 parent.maxScript = uint16(b.script.index("Latn")) 1479 parent.toRegion = uint16(b.region.index(sub[1])) 1480 } else { 1481 parent.script = uint16(b.script.index(sub[1])) 1482 parent.maxScript = parent.script 1483 parent.toRegion = uint16(b.region.index(sub[2])) 1484 } 1485 for _, c := range strings.Split(p.Locales, " ") { 1486 region := b.region.index(c[strings.LastIndex(c, "_")+1:]) 1487 parent.fromRegion = append(parent.fromRegion, uint16(region)) 1488 } 1489 parents = append(parents, parent) 1490 n += len(parent.fromRegion) 1491 } 1492 b.writeSliceAddSize("parents", n*2, parents) 1493 } 1494 1495 func main() { 1496 gen.Init() 1497 1498 gen.Repackage("gen_common.go", "common.go", "language") 1499 1500 w := gen.NewCodeWriter() 1501 defer w.WriteGoFile("tables.go", "language") 1502 1503 fmt.Fprintln(w, `import "golang.org/x/text/internal/tag"`) 1504 1505 b := newBuilder(w) 1506 gen.WriteCLDRVersion(w) 1507 1508 b.parseIndices() 1509 b.writeType(FromTo{}) 1510 b.writeLanguage() 1511 b.writeScript() 1512 b.writeRegion() 1513 b.writeVariant() 1514 // TODO: b.writeLocale() 1515 b.computeRegionGroups() 1516 b.writeLikelyData() 1517 b.writeRegionInclusionData() 1518 b.writeParents() 1519 }