github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/text/language/maketables.go (about) 1 // Copyright 2013 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // +build ignore 6 7 // Language tag table generator. 8 // Data read from the web. 9 10 package main 11 12 import ( 13 "bufio" 14 "flag" 15 "fmt" 16 "io" 17 "io/ioutil" 18 "log" 19 "math" 20 "reflect" 21 "regexp" 22 "sort" 23 "strconv" 24 "strings" 25 26 "golang.org/x/text/internal/gen" 27 "golang.org/x/text/internal/tag" 28 "golang.org/x/text/unicode/cldr" 29 ) 30 31 var ( 32 test = flag.Bool("test", 33 false, 34 "test existing tables; can be used to compare web data with package data.") 35 outputFile = flag.String("output", 36 "tables.go", 37 "output file for generated tables") 38 ) 39 40 var comment = []string{ 41 ` 42 lang holds an alphabetically sorted list of ISO-639 language identifiers. 43 All entries are 4 bytes. The index of the identifier (divided by 4) is the language tag. 44 For 2-byte language identifiers, the two successive bytes have the following meaning: 45 - if the first letter of the 2- and 3-letter ISO codes are the same: 46 the second and third letter of the 3-letter ISO code. 47 - otherwise: a 0 and a by 2 bits right-shifted index into altLangISO3. 48 For 3-byte language identifiers the 4th byte is 0.`, 49 ` 50 langNoIndex is a bit vector of all 3-letter language codes that are not used as an index 51 in lookup tables. The language ids for these language codes are derived directly 52 from the letters and are not consecutive.`, 53 ` 54 altLangISO3 holds an alphabetically sorted list of 3-letter language code alternatives 55 to 2-letter language codes that cannot be derived using the method described above. 56 Each 3-letter code is followed by its 1-byte langID.`, 57 ` 58 altLangIndex is used to convert indexes in altLangISO3 to langIDs.`, 59 ` 60 langAliasMap maps langIDs to their suggested replacements.`, 61 ` 62 script is an alphabetically sorted list of ISO 15924 codes. The index 63 of the script in the string, divided by 4, is the internal scriptID.`, 64 ` 65 isoRegionOffset needs to be added to the index of regionISO to obtain the regionID 66 for 2-letter ISO codes. (The first isoRegionOffset regionIDs are reserved for 67 the UN.M49 codes used for groups.)`, 68 ` 69 regionISO holds a list of alphabetically sorted 2-letter ISO region codes. 70 Each 2-letter codes is followed by two bytes with the following meaning: 71 - [A-Z}{2}: the first letter of the 2-letter code plus these two 72 letters form the 3-letter ISO code. 73 - 0, n: index into altRegionISO3.`, 74 ` 75 regionTypes defines the status of a region for various standards.`, 76 ` 77 m49 maps regionIDs to UN.M49 codes. The first isoRegionOffset entries are 78 codes indicating collections of regions.`, 79 ` 80 m49Index gives indexes into fromM49 based on the three most significant bits 81 of a 10-bit UN.M49 code. To search an UN.M49 code in fromM49, search in 82 fromM49[m49Index[msb39(code)]:m49Index[msb3(code)+1]] 83 for an entry where the first 7 bits match the 7 lsb of the UN.M49 code. 84 The region code is stored in the 9 lsb of the indexed value.`, 85 ` 86 fromM49 contains entries to map UN.M49 codes to regions. See m49Index for details.`, 87 ` 88 altRegionISO3 holds a list of 3-letter region codes that cannot be 89 mapped to 2-letter codes using the default algorithm. This is a short list.`, 90 ` 91 altRegionIDs holds a list of regionIDs the positions of which match those 92 of the 3-letter ISO codes in altRegionISO3.`, 93 ` 94 variantNumSpecialized is the number of specialized variants in variants.`, 95 ` 96 suppressScript is an index from langID to the dominant script for that language, 97 if it exists. If a script is given, it should be suppressed from the language tag.`, 98 ` 99 likelyLang is a lookup table, indexed by langID, for the most likely 100 scripts and regions given incomplete information. If more entries exist for a 101 given language, region and script are the index and size respectively 102 of the list in likelyLangList.`, 103 ` 104 likelyLangList holds lists info associated with likelyLang.`, 105 ` 106 likelyRegion is a lookup table, indexed by regionID, for the most likely 107 languages and scripts given incomplete information. If more entries exist 108 for a given regionID, lang and script are the index and size respectively 109 of the list in likelyRegionList. 110 TODO: exclude containers and user-definable regions from the list.`, 111 ` 112 likelyRegionList holds lists info associated with likelyRegion.`, 113 ` 114 likelyScript is a lookup table, indexed by scriptID, for the most likely 115 languages and regions given a script.`, 116 ` 117 matchLang holds pairs of langIDs of base languages that are typically 118 mutually intelligible. Each pair is associated with a confidence and 119 whether the intelligibility goes one or both ways.`, 120 ` 121 matchScript holds pairs of scriptIDs where readers of one script 122 can typically also read the other. Each is associated with a confidence.`, 123 ` 124 nRegionGroups is the number of region groups.`, 125 ` 126 regionInclusion maps region identifiers to sets of regions in regionInclusionBits, 127 where each set holds all groupings that are directly connected in a region 128 containment graph.`, 129 ` 130 regionInclusionBits is an array of bit vectors where every vector represents 131 a set of region groupings. These sets are used to compute the distance 132 between two regions for the purpose of language matching.`, 133 ` 134 regionInclusionNext marks, for each entry in regionInclusionBits, the set of 135 all groups that are reachable from the groups set in the respective entry.`, 136 } 137 138 // TODO: consider changing some of these structures to tries. This can reduce 139 // memory, but may increase the need for memory allocations. This could be 140 // mitigated if we can piggyback on language tags for common cases. 141 142 func failOnError(e error) { 143 if e != nil { 144 log.Panic(e) 145 } 146 } 147 148 type setType int 149 150 const ( 151 Indexed setType = 1 + iota // all elements must be of same size 152 Linear 153 ) 154 155 type stringSet struct { 156 s []string 157 sorted, frozen bool 158 159 // We often need to update values after the creation of an index is completed. 160 // We include a convenience map for keeping track of this. 161 update map[string]string 162 typ setType // used for checking. 163 } 164 165 func (ss *stringSet) clone() stringSet { 166 c := *ss 167 c.s = append([]string(nil), c.s...) 168 return c 169 } 170 171 func (ss *stringSet) setType(t setType) { 172 if ss.typ != t && ss.typ != 0 { 173 log.Panicf("type %d cannot be assigned as it was already %d", t, ss.typ) 174 } 175 } 176 177 // parse parses a whitespace-separated string and initializes ss with its 178 // components. 179 func (ss *stringSet) parse(s string) { 180 scan := bufio.NewScanner(strings.NewReader(s)) 181 scan.Split(bufio.ScanWords) 182 for scan.Scan() { 183 ss.add(scan.Text()) 184 } 185 } 186 187 func (ss *stringSet) assertChangeable() { 188 if ss.frozen { 189 log.Panic("attempt to modify a frozen stringSet") 190 } 191 } 192 193 func (ss *stringSet) add(s string) { 194 ss.assertChangeable() 195 ss.s = append(ss.s, s) 196 ss.sorted = ss.frozen 197 } 198 199 func (ss *stringSet) freeze() { 200 ss.compact() 201 ss.frozen = true 202 } 203 204 func (ss *stringSet) compact() { 205 if ss.sorted { 206 return 207 } 208 a := ss.s 209 sort.Strings(a) 210 k := 0 211 for i := 1; i < len(a); i++ { 212 if a[k] != a[i] { 213 a[k+1] = a[i] 214 k++ 215 } 216 } 217 ss.s = a[:k+1] 218 ss.sorted = ss.frozen 219 } 220 221 type funcSorter struct { 222 fn func(a, b string) bool 223 sort.StringSlice 224 } 225 226 func (s funcSorter) Less(i, j int) bool { 227 return s.fn(s.StringSlice[i], s.StringSlice[j]) 228 } 229 230 func (ss *stringSet) sortFunc(f func(a, b string) bool) { 231 ss.compact() 232 sort.Sort(funcSorter{f, sort.StringSlice(ss.s)}) 233 } 234 235 func (ss *stringSet) remove(s string) { 236 ss.assertChangeable() 237 if i, ok := ss.find(s); ok { 238 copy(ss.s[i:], ss.s[i+1:]) 239 ss.s = ss.s[:len(ss.s)-1] 240 } 241 } 242 243 func (ss *stringSet) replace(ol, nu string) { 244 ss.s[ss.index(ol)] = nu 245 ss.sorted = ss.frozen 246 } 247 248 func (ss *stringSet) index(s string) int { 249 ss.setType(Indexed) 250 i, ok := ss.find(s) 251 if !ok { 252 if i < len(ss.s) { 253 log.Panicf("find: item %q is not in list. Closest match is %q.", s, ss.s[i]) 254 } 255 log.Panicf("find: item %q is not in list", s) 256 257 } 258 return i 259 } 260 261 func (ss *stringSet) find(s string) (int, bool) { 262 ss.compact() 263 i := sort.SearchStrings(ss.s, s) 264 return i, i != len(ss.s) && ss.s[i] == s 265 } 266 267 func (ss *stringSet) slice() []string { 268 ss.compact() 269 return ss.s 270 } 271 272 func (ss *stringSet) updateLater(v, key string) { 273 if ss.update == nil { 274 ss.update = map[string]string{} 275 } 276 ss.update[v] = key 277 } 278 279 // join joins the string and ensures that all entries are of the same length. 280 func (ss *stringSet) join() string { 281 ss.setType(Indexed) 282 n := len(ss.s[0]) 283 for _, s := range ss.s { 284 if len(s) != n { 285 log.Panicf("join: not all entries are of the same length: %q", s) 286 } 287 } 288 ss.s = append(ss.s, strings.Repeat("\xff", n)) 289 return strings.Join(ss.s, "") 290 } 291 292 // ianaEntry holds information for an entry in the IANA Language Subtag Repository. 293 // All types use the same entry. 294 // See http://tools.ietf.org/html/bcp47#section-5.1 for a description of the various 295 // fields. 296 type ianaEntry struct { 297 typ string 298 description []string 299 scope string 300 added string 301 preferred string 302 deprecated string 303 suppressScript string 304 macro string 305 prefix []string 306 } 307 308 type builder struct { 309 w *gen.CodeWriter 310 hw io.Writer // MultiWriter for w and w.Hash 311 data *cldr.CLDR 312 supp *cldr.SupplementalData 313 314 // indices 315 locale stringSet // common locales 316 lang stringSet // canonical language ids (2 or 3 letter ISO codes) with data 317 langNoIndex stringSet // 3-letter ISO codes with no associated data 318 script stringSet // 4-letter ISO codes 319 region stringSet // 2-letter ISO or 3-digit UN M49 codes 320 variant stringSet // 4-8-alphanumeric variant code. 321 322 // Region codes that are groups with their corresponding group IDs. 323 groups map[int]index 324 325 // langInfo 326 registry map[string]*ianaEntry 327 } 328 329 type index uint 330 331 func newBuilder(w *gen.CodeWriter) *builder { 332 r := gen.OpenCLDRCoreZip() 333 defer r.Close() 334 d := &cldr.Decoder{} 335 data, err := d.DecodeZip(r) 336 failOnError(err) 337 b := builder{ 338 w: w, 339 hw: io.MultiWriter(w, w.Hash), 340 data: data, 341 supp: data.Supplemental(), 342 } 343 b.parseRegistry() 344 return &b 345 } 346 347 func (b *builder) parseRegistry() { 348 r := gen.OpenIANAFile("assignments/language-subtag-registry") 349 defer r.Close() 350 b.registry = make(map[string]*ianaEntry) 351 352 scan := bufio.NewScanner(r) 353 scan.Split(bufio.ScanWords) 354 var record *ianaEntry 355 for more := scan.Scan(); more; { 356 key := scan.Text() 357 more = scan.Scan() 358 value := scan.Text() 359 switch key { 360 case "Type:": 361 record = &ianaEntry{typ: value} 362 case "Subtag:", "Tag:": 363 if s := strings.SplitN(value, "..", 2); len(s) > 1 { 364 for a := s[0]; a <= s[1]; a = inc(a) { 365 b.addToRegistry(a, record) 366 } 367 } else { 368 b.addToRegistry(value, record) 369 } 370 case "Suppress-Script:": 371 record.suppressScript = value 372 case "Added:": 373 record.added = value 374 case "Deprecated:": 375 record.deprecated = value 376 case "Macrolanguage:": 377 record.macro = value 378 case "Preferred-Value:": 379 record.preferred = value 380 case "Prefix:": 381 record.prefix = append(record.prefix, value) 382 case "Scope:": 383 record.scope = value 384 case "Description:": 385 buf := []byte(value) 386 for more = scan.Scan(); more; more = scan.Scan() { 387 b := scan.Bytes() 388 if b[0] == '%' || b[len(b)-1] == ':' { 389 break 390 } 391 buf = append(buf, ' ') 392 buf = append(buf, b...) 393 } 394 record.description = append(record.description, string(buf)) 395 continue 396 default: 397 continue 398 } 399 more = scan.Scan() 400 } 401 if scan.Err() != nil { 402 log.Panic(scan.Err()) 403 } 404 } 405 406 func (b *builder) addToRegistry(key string, entry *ianaEntry) { 407 if info, ok := b.registry[key]; ok { 408 if info.typ != "language" || entry.typ != "extlang" { 409 log.Fatalf("parseRegistry: tag %q already exists", key) 410 } 411 } else { 412 b.registry[key] = entry 413 } 414 } 415 416 var commentIndex = make(map[string]string) 417 418 func init() { 419 for _, s := range comment { 420 key := strings.TrimSpace(strings.SplitN(s, " ", 2)[0]) 421 commentIndex[key] = s 422 } 423 } 424 425 func (b *builder) comment(name string) { 426 if s := commentIndex[name]; len(s) > 0 { 427 b.w.WriteComment(s) 428 } else { 429 fmt.Fprintln(b.w) 430 } 431 } 432 433 func (b *builder) pf(f string, x ...interface{}) { 434 fmt.Fprintf(b.hw, f, x...) 435 fmt.Fprint(b.hw, "\n") 436 } 437 438 func (b *builder) p(x ...interface{}) { 439 fmt.Fprintln(b.hw, x...) 440 } 441 442 func (b *builder) addSize(s int) { 443 b.w.Size += s 444 b.pf("// Size: %d bytes", s) 445 } 446 447 func (b *builder) writeConst(name string, x interface{}) { 448 b.comment(name) 449 b.w.WriteConst(name, x) 450 } 451 452 // writeConsts computes f(v) for all v in values and writes the results 453 // as constants named _v to a single constant block. 454 func (b *builder) writeConsts(f func(string) int, values ...string) { 455 b.pf("const (") 456 for _, v := range values { 457 b.pf("\t_%s = %v", v, f(v)) 458 } 459 b.pf(")") 460 } 461 462 // writeType writes the type of the given value, which must be a struct. 463 func (b *builder) writeType(value interface{}) { 464 b.comment(reflect.TypeOf(value).Name()) 465 b.w.WriteType(value) 466 } 467 468 func (b *builder) writeSlice(name string, ss interface{}) { 469 b.writeSliceAddSize(name, 0, ss) 470 } 471 472 func (b *builder) writeSliceAddSize(name string, extraSize int, ss interface{}) { 473 b.comment(name) 474 b.w.Size += extraSize 475 v := reflect.ValueOf(ss) 476 t := v.Type().Elem() 477 b.pf("// Size: %d bytes, %d elements", v.Len()*int(t.Size())+extraSize, v.Len()) 478 479 fmt.Fprintf(b.w, "var %s = ", name) 480 b.w.WriteArray(ss) 481 b.p() 482 } 483 484 type fromTo struct { 485 from, to uint16 486 } 487 488 func (b *builder) writeSortedMap(name string, ss *stringSet, index func(s string) uint16) { 489 ss.sortFunc(func(a, b string) bool { 490 return index(a) < index(b) 491 }) 492 m := []fromTo{} 493 for _, s := range ss.s { 494 m = append(m, fromTo{index(s), index(ss.update[s])}) 495 } 496 b.writeSlice(name, m) 497 } 498 499 const base = 'z' - 'a' + 1 500 501 func strToInt(s string) uint { 502 v := uint(0) 503 for i := 0; i < len(s); i++ { 504 v *= base 505 v += uint(s[i] - 'a') 506 } 507 return v 508 } 509 510 // converts the given integer to the original ASCII string passed to strToInt. 511 // len(s) must match the number of characters obtained. 512 func intToStr(v uint, s []byte) { 513 for i := len(s) - 1; i >= 0; i-- { 514 s[i] = byte(v%base) + 'a' 515 v /= base 516 } 517 } 518 519 func (b *builder) writeBitVector(name string, ss []string) { 520 vec := make([]uint8, int(math.Ceil(math.Pow(base, float64(len(ss[0])))/8))) 521 for _, s := range ss { 522 v := strToInt(s) 523 vec[v/8] |= 1 << (v % 8) 524 } 525 b.writeSlice(name, vec) 526 } 527 528 // TODO: convert this type into a list or two-stage trie. 529 func (b *builder) writeMapFunc(name string, m map[string]string, f func(string) uint16) { 530 b.comment(name) 531 v := reflect.ValueOf(m) 532 sz := v.Len() * (2 + int(v.Type().Key().Size())) 533 for _, k := range m { 534 sz += len(k) 535 } 536 b.addSize(sz) 537 keys := []string{} 538 b.pf(`var %s = map[string]uint16{`, name) 539 for k := range m { 540 keys = append(keys, k) 541 } 542 sort.Strings(keys) 543 for _, k := range keys { 544 b.pf("\t%q: %v,", k, f(m[k])) 545 } 546 b.p("}") 547 } 548 549 func (b *builder) writeMap(name string, m interface{}) { 550 b.comment(name) 551 v := reflect.ValueOf(m) 552 sz := v.Len() * (2 + int(v.Type().Key().Size()) + int(v.Type().Elem().Size())) 553 b.addSize(sz) 554 f := strings.FieldsFunc(fmt.Sprintf("%#v", m), func(r rune) bool { 555 return strings.IndexRune("{}, ", r) != -1 556 }) 557 sort.Strings(f[1:]) 558 b.pf(`var %s = %s{`, name, f[0]) 559 for _, kv := range f[1:] { 560 b.pf("\t%s,", kv) 561 } 562 b.p("}") 563 } 564 565 func (b *builder) langIndex(s string) uint16 { 566 if s == "und" { 567 return 0 568 } 569 if i, ok := b.lang.find(s); ok { 570 return uint16(i) 571 } 572 return uint16(strToInt(s)) + uint16(len(b.lang.s)) 573 } 574 575 // inc advances the string to its lexicographical successor. 576 func inc(s string) string { 577 const maxTagLength = 4 578 var buf [maxTagLength]byte 579 intToStr(strToInt(strings.ToLower(s))+1, buf[:len(s)]) 580 for i := 0; i < len(s); i++ { 581 if s[i] <= 'Z' { 582 buf[i] -= 'a' - 'A' 583 } 584 } 585 return string(buf[:len(s)]) 586 } 587 588 func (b *builder) parseIndices() { 589 meta := b.supp.Metadata 590 591 for k, v := range b.registry { 592 var ss *stringSet 593 switch v.typ { 594 case "language": 595 if len(k) == 2 || v.suppressScript != "" || v.scope == "special" { 596 b.lang.add(k) 597 continue 598 } else { 599 ss = &b.langNoIndex 600 } 601 case "region": 602 ss = &b.region 603 case "script": 604 ss = &b.script 605 case "variant": 606 ss = &b.variant 607 default: 608 continue 609 } 610 ss.add(k) 611 } 612 // Include any language for which there is data. 613 for _, lang := range b.data.Locales() { 614 if x := b.data.RawLDML(lang); false || 615 x.LocaleDisplayNames != nil || 616 x.Characters != nil || 617 x.Delimiters != nil || 618 x.Measurement != nil || 619 x.Dates != nil || 620 x.Numbers != nil || 621 x.Units != nil || 622 x.ListPatterns != nil || 623 x.Collations != nil || 624 x.Segmentations != nil || 625 x.Rbnf != nil || 626 x.Annotations != nil || 627 x.Metadata != nil { 628 629 from := strings.Split(lang, "_") 630 if lang := from[0]; lang != "root" { 631 b.lang.add(lang) 632 } 633 } 634 } 635 // Include languages in likely subtags. 636 for _, m := range b.supp.LikelySubtags.LikelySubtag { 637 from := strings.Split(m.From, "_") 638 b.lang.add(from[0]) 639 } 640 // Include ISO-639 alpha-3 bibliographic entries. 641 for _, a := range meta.Alias.LanguageAlias { 642 if a.Reason == "bibliographic" { 643 b.langNoIndex.add(a.Type) 644 } 645 } 646 // Include regions in territoryAlias (not all are in the IANA registry!) 647 for _, reg := range b.supp.Metadata.Alias.TerritoryAlias { 648 if len(reg.Type) == 2 { 649 b.region.add(reg.Type) 650 } 651 } 652 653 for _, s := range b.lang.s { 654 if len(s) == 3 { 655 b.langNoIndex.remove(s) 656 } 657 } 658 b.writeConst("numLanguages", len(b.lang.slice())+len(b.langNoIndex.slice())) 659 b.writeConst("numScripts", len(b.script.slice())) 660 b.writeConst("numRegions", len(b.region.slice())) 661 662 // Add dummy codes at the start of each list to represent "unspecified". 663 b.lang.add("---") 664 b.script.add("----") 665 b.region.add("---") 666 667 // common locales 668 b.locale.parse(meta.DefaultContent.Locales) 669 } 670 671 func (b *builder) computeRegionGroups() { 672 b.groups = make(map[int]index) 673 674 // Create group indices. 675 for i := 1; b.region.s[i][0] < 'A'; i++ { // Base M49 indices on regionID. 676 b.groups[i] = index(len(b.groups)) 677 } 678 for _, g := range b.supp.TerritoryContainment.Group { 679 group := b.region.index(g.Type) 680 if _, ok := b.groups[group]; !ok { 681 b.groups[group] = index(len(b.groups)) 682 } 683 } 684 if len(b.groups) > 32 { 685 log.Fatalf("only 32 groups supported, found %d", len(b.groups)) 686 } 687 b.writeConst("nRegionGroups", len(b.groups)) 688 } 689 690 var langConsts = []string{ 691 "af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es", 692 "et", "fa", "fi", "fil", "fr", "gu", "he", "hi", "hr", "hu", "hy", "id", "is", 693 "it", "ja", "ka", "kk", "km", "kn", "ko", "ky", "lo", "lt", "lv", "mk", "ml", 694 "mn", "mo", "mr", "ms", "mul", "my", "nb", "ne", "nl", "no", "pa", "pl", "pt", 695 "ro", "ru", "sh", "si", "sk", "sl", "sq", "sr", "sv", "sw", "ta", "te", "th", 696 "tl", "tn", "tr", "uk", "ur", "uz", "vi", "zh", "zu", 697 698 // constants for grandfathered tags (if not already defined) 699 "jbo", "ami", "bnn", "hak", "tlh", "lb", "nv", "pwn", "tao", "tay", "tsu", 700 "nn", "sfb", "vgt", "sgg", "cmn", "nan", "hsn", 701 } 702 703 // writeLanguage generates all tables needed for language canonicalization. 704 func (b *builder) writeLanguage() { 705 meta := b.supp.Metadata 706 707 b.writeConst("nonCanonicalUnd", b.lang.index("und")) 708 b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...) 709 b.writeConst("langPrivateStart", b.langIndex("qaa")) 710 b.writeConst("langPrivateEnd", b.langIndex("qtz")) 711 712 // Get language codes that need to be mapped (overlong 3-letter codes, 713 // deprecated 2-letter codes, legacy and grandfathered tags.) 714 langAliasMap := stringSet{} 715 aliasTypeMap := map[string]langAliasType{} 716 717 // altLangISO3 get the alternative ISO3 names that need to be mapped. 718 altLangISO3 := stringSet{} 719 // Add dummy start to avoid the use of index 0. 720 altLangISO3.add("---") 721 altLangISO3.updateLater("---", "aa") 722 723 lang := b.lang.clone() 724 for _, a := range meta.Alias.LanguageAlias { 725 if a.Replacement == "" { 726 a.Replacement = "und" 727 } 728 // TODO: support mapping to tags 729 repl := strings.SplitN(a.Replacement, "_", 2)[0] 730 if a.Reason == "overlong" { 731 if len(a.Replacement) == 2 && len(a.Type) == 3 { 732 lang.updateLater(a.Replacement, a.Type) 733 } 734 } else if len(a.Type) <= 3 { 735 switch a.Reason { 736 case "macrolanguage": 737 aliasTypeMap[a.Type] = langMacro 738 case "deprecated": 739 // handled elsewhere 740 continue 741 case "bibliographic", "legacy": 742 if a.Type == "no" { 743 continue 744 } 745 aliasTypeMap[a.Type] = langLegacy 746 default: 747 log.Fatalf("new %s alias: %s", a.Reason, a.Type) 748 } 749 langAliasMap.add(a.Type) 750 langAliasMap.updateLater(a.Type, repl) 751 } 752 } 753 // Manually add the mapping of "nb" (Norwegian) to its macro language. 754 // This can be removed if CLDR adopts this change. 755 langAliasMap.add("nb") 756 langAliasMap.updateLater("nb", "no") 757 aliasTypeMap["nb"] = langMacro 758 759 for k, v := range b.registry { 760 // Also add deprecated values for 3-letter ISO codes, which CLDR omits. 761 if v.typ == "language" && v.deprecated != "" && v.preferred != "" { 762 langAliasMap.add(k) 763 langAliasMap.updateLater(k, v.preferred) 764 aliasTypeMap[k] = langDeprecated 765 } 766 } 767 // Fix CLDR mappings. 768 lang.updateLater("tl", "tgl") 769 lang.updateLater("sh", "hbs") 770 lang.updateLater("mo", "mol") 771 lang.updateLater("no", "nor") 772 lang.updateLater("tw", "twi") 773 lang.updateLater("nb", "nob") 774 lang.updateLater("ak", "aka") 775 776 // Ensure that each 2-letter code is matched with a 3-letter code. 777 for _, v := range lang.s[1:] { 778 s, ok := lang.update[v] 779 if !ok { 780 if s, ok = lang.update[langAliasMap.update[v]]; !ok { 781 continue 782 } 783 lang.update[v] = s 784 } 785 if v[0] != s[0] { 786 altLangISO3.add(s) 787 altLangISO3.updateLater(s, v) 788 } 789 } 790 791 // Complete canonialized language tags. 792 lang.freeze() 793 for i, v := range lang.s { 794 // We can avoid these manual entries by using the IANI registry directly. 795 // Seems easier to update the list manually, as changes are rare. 796 // The panic in this loop will trigger if we miss an entry. 797 add := "" 798 if s, ok := lang.update[v]; ok { 799 if s[0] == v[0] { 800 add = s[1:] 801 } else { 802 add = string([]byte{0, byte(altLangISO3.index(s))}) 803 } 804 } else if len(v) == 3 { 805 add = "\x00" 806 } else { 807 log.Panicf("no data for long form of %q", v) 808 } 809 lang.s[i] += add 810 } 811 b.writeConst("lang", tag.Index(lang.join())) 812 813 b.writeConst("langNoIndexOffset", len(b.lang.s)) 814 815 // space of all valid 3-letter language identifiers. 816 b.writeBitVector("langNoIndex", b.langNoIndex.slice()) 817 818 altLangIndex := []uint16{} 819 for i, s := range altLangISO3.slice() { 820 altLangISO3.s[i] += string([]byte{byte(len(altLangIndex))}) 821 if i > 0 { 822 idx := b.lang.index(altLangISO3.update[s]) 823 altLangIndex = append(altLangIndex, uint16(idx)) 824 } 825 } 826 b.writeConst("altLangISO3", tag.Index(altLangISO3.join())) 827 b.writeSlice("altLangIndex", altLangIndex) 828 829 b.writeSortedMap("langAliasMap", &langAliasMap, b.langIndex) 830 types := make([]langAliasType, len(langAliasMap.s)) 831 for i, s := range langAliasMap.s { 832 types[i] = aliasTypeMap[s] 833 } 834 b.writeSlice("langAliasTypes", types) 835 } 836 837 var scriptConsts = []string{ 838 "Latn", "Hani", "Hans", "Hant", "Qaaa", "Qaai", "Qabx", "Zinh", "Zyyy", 839 "Zzzz", 840 } 841 842 func (b *builder) writeScript() { 843 b.writeConsts(b.script.index, scriptConsts...) 844 b.writeConst("script", tag.Index(b.script.join())) 845 846 supp := make([]uint8, len(b.lang.slice())) 847 for i, v := range b.lang.slice()[1:] { 848 if sc := b.registry[v].suppressScript; sc != "" { 849 supp[i+1] = uint8(b.script.index(sc)) 850 } 851 } 852 b.writeSlice("suppressScript", supp) 853 854 // There is only one deprecated script in CLDR. This value is hard-coded. 855 // We check here if the code must be updated. 856 for _, a := range b.supp.Metadata.Alias.ScriptAlias { 857 if a.Type != "Qaai" { 858 log.Panicf("unexpected deprecated stript %q", a.Type) 859 } 860 } 861 } 862 863 func parseM49(s string) int16 { 864 if len(s) == 0 { 865 return 0 866 } 867 v, err := strconv.ParseUint(s, 10, 10) 868 failOnError(err) 869 return int16(v) 870 } 871 872 var regionConsts = []string{ 873 "001", "419", "BR", "CA", "ES", "GB", "MD", "PT", "UK", "US", 874 "ZZ", "XA", "XC", "XK", // Unofficial tag for Kosovo. 875 } 876 877 func (b *builder) writeRegion() { 878 b.writeConsts(b.region.index, regionConsts...) 879 880 isoOffset := b.region.index("AA") 881 m49map := make([]int16, len(b.region.slice())) 882 fromM49map := make(map[int16]int) 883 altRegionISO3 := "" 884 altRegionIDs := []uint16{} 885 886 b.writeConst("isoRegionOffset", isoOffset) 887 888 // 2-letter region lookup and mapping to numeric codes. 889 regionISO := b.region.clone() 890 regionISO.s = regionISO.s[isoOffset:] 891 regionISO.sorted = false 892 893 regionTypes := make([]byte, len(b.region.s)) 894 895 // Is the region valid BCP 47? 896 for s, e := range b.registry { 897 if len(s) == 2 && s == strings.ToUpper(s) { 898 i := b.region.index(s) 899 for _, d := range e.description { 900 if strings.Contains(d, "Private use") { 901 regionTypes[i] = iso3166UserAssgined 902 } 903 } 904 regionTypes[i] |= bcp47Region 905 } 906 } 907 908 // Is the region a valid ccTLD? 909 r := gen.OpenIANAFile("domains/root/db") 910 defer r.Close() 911 912 buf, err := ioutil.ReadAll(r) 913 failOnError(err) 914 re := regexp.MustCompile(`"/domains/root/db/([a-z]{2}).html"`) 915 for _, m := range re.FindAllSubmatch(buf, -1) { 916 i := b.region.index(strings.ToUpper(string(m[1]))) 917 regionTypes[i] |= ccTLD 918 } 919 920 b.writeSlice("regionTypes", regionTypes) 921 922 iso3Set := make(map[string]int) 923 update := func(iso2, iso3 string) { 924 i := regionISO.index(iso2) 925 if j, ok := iso3Set[iso3]; !ok && iso3[0] == iso2[0] { 926 regionISO.s[i] += iso3[1:] 927 iso3Set[iso3] = -1 928 } else { 929 if ok && j >= 0 { 930 regionISO.s[i] += string([]byte{0, byte(j)}) 931 } else { 932 iso3Set[iso3] = len(altRegionISO3) 933 regionISO.s[i] += string([]byte{0, byte(len(altRegionISO3))}) 934 altRegionISO3 += iso3 935 altRegionIDs = append(altRegionIDs, uint16(isoOffset+i)) 936 } 937 } 938 } 939 for _, tc := range b.supp.CodeMappings.TerritoryCodes { 940 i := regionISO.index(tc.Type) + isoOffset 941 if d := m49map[i]; d != 0 { 942 log.Panicf("%s found as a duplicate UN.M49 code of %03d", tc.Numeric, d) 943 } 944 m49 := parseM49(tc.Numeric) 945 m49map[i] = m49 946 if r := fromM49map[m49]; r == 0 { 947 fromM49map[m49] = i 948 } else if r != i { 949 dep := b.registry[regionISO.s[r-isoOffset]].deprecated 950 if t := b.registry[tc.Type]; t != nil && dep != "" && (t.deprecated == "" || t.deprecated > dep) { 951 fromM49map[m49] = i 952 } 953 } 954 } 955 for _, ta := range b.supp.Metadata.Alias.TerritoryAlias { 956 if len(ta.Type) == 3 && ta.Type[0] <= '9' && len(ta.Replacement) == 2 { 957 from := parseM49(ta.Type) 958 if r := fromM49map[from]; r == 0 { 959 fromM49map[from] = regionISO.index(ta.Replacement) + isoOffset 960 } 961 } 962 } 963 for _, tc := range b.supp.CodeMappings.TerritoryCodes { 964 if len(tc.Alpha3) == 3 { 965 update(tc.Type, tc.Alpha3) 966 } 967 } 968 // This entries are not included in territoryCodes. Mostly 3-letter variants 969 // of deleted codes and an entry for QU. 970 for _, m := range []struct{ iso2, iso3 string }{ 971 {"CT", "CTE"}, 972 {"DY", "DHY"}, 973 {"HV", "HVO"}, 974 {"JT", "JTN"}, 975 {"MI", "MID"}, 976 {"NH", "NHB"}, 977 {"NQ", "ATN"}, 978 {"PC", "PCI"}, 979 {"PU", "PUS"}, 980 {"PZ", "PCZ"}, 981 {"RH", "RHO"}, 982 {"VD", "VDR"}, 983 {"WK", "WAK"}, 984 // These three-letter codes are used for others as well. 985 {"FQ", "ATF"}, 986 } { 987 update(m.iso2, m.iso3) 988 } 989 for i, s := range regionISO.s { 990 if len(s) != 4 { 991 regionISO.s[i] = s + " " 992 } 993 } 994 b.writeConst("regionISO", tag.Index(regionISO.join())) 995 b.writeConst("altRegionISO3", altRegionISO3) 996 b.writeSlice("altRegionIDs", altRegionIDs) 997 998 // Create list of deprecated regions. 999 // TODO: consider inserting SF -> FI. Not included by CLDR, but is the only 1000 // Transitionally-reserved mapping not included. 1001 regionOldMap := stringSet{} 1002 // Include regions in territoryAlias (not all are in the IANA registry!) 1003 for _, reg := range b.supp.Metadata.Alias.TerritoryAlias { 1004 if len(reg.Type) == 2 && reg.Reason == "deprecated" && len(reg.Replacement) == 2 { 1005 regionOldMap.add(reg.Type) 1006 regionOldMap.updateLater(reg.Type, reg.Replacement) 1007 i, _ := regionISO.find(reg.Type) 1008 j, _ := regionISO.find(reg.Replacement) 1009 if k := m49map[i+isoOffset]; k == 0 { 1010 m49map[i+isoOffset] = m49map[j+isoOffset] 1011 } 1012 } 1013 } 1014 b.writeSortedMap("regionOldMap", ®ionOldMap, func(s string) uint16 { 1015 return uint16(b.region.index(s)) 1016 }) 1017 // 3-digit region lookup, groupings. 1018 for i := 1; i < isoOffset; i++ { 1019 m := parseM49(b.region.s[i]) 1020 m49map[i] = m 1021 fromM49map[m] = i 1022 } 1023 b.writeSlice("m49", m49map) 1024 1025 const ( 1026 searchBits = 7 1027 regionBits = 9 1028 ) 1029 if len(m49map) >= 1<<regionBits { 1030 log.Fatalf("Maximum number of regions exceeded: %d > %d", len(m49map), 1<<regionBits) 1031 } 1032 m49Index := [9]int16{} 1033 fromM49 := []uint16{} 1034 m49 := []int{} 1035 for k, _ := range fromM49map { 1036 m49 = append(m49, int(k)) 1037 } 1038 sort.Ints(m49) 1039 for _, k := range m49[1:] { 1040 val := (k & (1<<searchBits - 1)) << regionBits 1041 fromM49 = append(fromM49, uint16(val|fromM49map[int16(k)])) 1042 m49Index[1:][k>>searchBits] = int16(len(fromM49)) 1043 } 1044 b.writeSlice("m49Index", m49Index) 1045 b.writeSlice("fromM49", fromM49) 1046 } 1047 1048 const ( 1049 // TODO: put these lists in regionTypes as user data? Could be used for 1050 // various optimizations and refinements and could be exposed in the API. 1051 iso3166Except = "AC CP DG EA EU FX IC SU TA UK" 1052 iso3166Trans = "AN BU CS NT TP YU ZR" // SF is not in our set of Regions. 1053 // DY and RH are actually not deleted, but indeterminately reserved. 1054 iso3166DelCLDR = "CT DD DY FQ HV JT MI NH NQ PC PU PZ RH VD WK YD" 1055 ) 1056 1057 const ( 1058 iso3166UserAssgined = 1 << iota 1059 ccTLD 1060 bcp47Region 1061 ) 1062 1063 func find(list []string, s string) int { 1064 for i, t := range list { 1065 if t == s { 1066 return i 1067 } 1068 } 1069 return -1 1070 } 1071 1072 // writeVariants generates per-variant information and creates a map from variant 1073 // name to index value. We assign index values such that sorting multiple 1074 // variants by index value will result in the correct order. 1075 // There are two types of variants: specialized and general. Specialized variants 1076 // are only applicable to certain language or language-script pairs. Generalized 1077 // variants apply to any language. Generalized variants always sort after 1078 // specialized variants. We will therefore always assign a higher index value 1079 // to a generalized variant than any other variant. Generalized variants are 1080 // sorted alphabetically among themselves. 1081 // Specialized variants may also sort after other specialized variants. Such 1082 // variants will be ordered after any of the variants they may follow. 1083 // We assume that if a variant x is followed by a variant y, then for any prefix 1084 // p of x, p-x is a prefix of y. This allows us to order tags based on the 1085 // maximum of the length of any of its prefixes. 1086 // TODO: it is possible to define a set of Prefix values on variants such that 1087 // a total order cannot be defined to the point that this algorithm breaks. 1088 // In other words, we cannot guarantee the same order of variants for the 1089 // future using the same algorithm or for non-compliant combinations of 1090 // variants. For this reason, consider using simple alphabetic sorting 1091 // of variants and ignore Prefix restrictions altogether. 1092 func (b *builder) writeVariant() { 1093 generalized := stringSet{} 1094 specialized := stringSet{} 1095 specializedExtend := stringSet{} 1096 // Collate the variants by type and check assumptions. 1097 for _, v := range b.variant.slice() { 1098 e := b.registry[v] 1099 if len(e.prefix) == 0 { 1100 generalized.add(v) 1101 continue 1102 } 1103 c := strings.Split(e.prefix[0], "-") 1104 hasScriptOrRegion := false 1105 if len(c) > 1 { 1106 _, hasScriptOrRegion = b.script.find(c[1]) 1107 if !hasScriptOrRegion { 1108 _, hasScriptOrRegion = b.region.find(c[1]) 1109 1110 } 1111 } 1112 if len(c) == 1 || len(c) == 2 && hasScriptOrRegion { 1113 // Variant is preceded by a language. 1114 specialized.add(v) 1115 continue 1116 } 1117 // Variant is preceded by another variant. 1118 specializedExtend.add(v) 1119 prefix := c[0] + "-" 1120 if hasScriptOrRegion { 1121 prefix += c[1] 1122 } 1123 for _, p := range e.prefix { 1124 // Verify that the prefix minus the last element is a prefix of the 1125 // predecessor element. 1126 i := strings.LastIndex(p, "-") 1127 pred := b.registry[p[i+1:]] 1128 if find(pred.prefix, p[:i]) < 0 { 1129 log.Fatalf("prefix %q for variant %q not consistent with predecessor spec", p, v) 1130 } 1131 // The sorting used below does not work in the general case. It works 1132 // if we assume that variants that may be followed by others only have 1133 // prefixes of the same length. Verify this. 1134 count := strings.Count(p[:i], "-") 1135 for _, q := range pred.prefix { 1136 if c := strings.Count(q, "-"); c != count { 1137 log.Fatalf("variant %q preceding %q has a prefix %q of size %d; want %d", p[i+1:], v, q, c, count) 1138 } 1139 } 1140 if !strings.HasPrefix(p, prefix) { 1141 log.Fatalf("prefix %q of variant %q should start with %q", p, v, prefix) 1142 } 1143 } 1144 } 1145 1146 // Sort extended variants. 1147 a := specializedExtend.s 1148 less := func(v, w string) bool { 1149 // Sort by the maximum number of elements. 1150 maxCount := func(s string) (max int) { 1151 for _, p := range b.registry[s].prefix { 1152 if c := strings.Count(p, "-"); c > max { 1153 max = c 1154 } 1155 } 1156 return 1157 } 1158 if cv, cw := maxCount(v), maxCount(w); cv != cw { 1159 return cv < cw 1160 } 1161 // Sort by name as tie breaker. 1162 return v < w 1163 } 1164 sort.Sort(funcSorter{less, sort.StringSlice(a)}) 1165 specializedExtend.frozen = true 1166 1167 // Create index from variant name to index. 1168 variantIndex := make(map[string]uint8) 1169 add := func(s []string) { 1170 for _, v := range s { 1171 variantIndex[v] = uint8(len(variantIndex)) 1172 } 1173 } 1174 add(specialized.slice()) 1175 add(specializedExtend.s) 1176 numSpecialized := len(variantIndex) 1177 add(generalized.slice()) 1178 if n := len(variantIndex); n > 255 { 1179 log.Fatalf("maximum number of variants exceeded: was %d; want <= 255", n) 1180 } 1181 b.writeMap("variantIndex", variantIndex) 1182 b.writeConst("variantNumSpecialized", numSpecialized) 1183 } 1184 1185 func (b *builder) writeLanguageInfo() { 1186 } 1187 1188 // writeLikelyData writes tables that are used both for finding parent relations and for 1189 // language matching. Each entry contains additional bits to indicate the status of the 1190 // data to know when it cannot be used for parent relations. 1191 func (b *builder) writeLikelyData() { 1192 const ( 1193 isList = 1 << iota 1194 scriptInFrom 1195 regionInFrom 1196 ) 1197 type ( // generated types 1198 likelyScriptRegion struct { 1199 region uint16 1200 script uint8 1201 flags uint8 1202 } 1203 likelyLangScript struct { 1204 lang uint16 1205 script uint8 1206 flags uint8 1207 } 1208 likelyLangRegion struct { 1209 lang uint16 1210 region uint16 1211 } 1212 // likelyTag is used for getting likely tags for group regions, where 1213 // the likely region might be a region contained in the group. 1214 likelyTag struct { 1215 lang uint16 1216 region uint16 1217 script uint8 1218 } 1219 ) 1220 var ( // generated variables 1221 likelyRegionGroup = make([]likelyTag, len(b.groups)) 1222 likelyLang = make([]likelyScriptRegion, len(b.lang.s)) 1223 likelyRegion = make([]likelyLangScript, len(b.region.s)) 1224 likelyScript = make([]likelyLangRegion, len(b.script.s)) 1225 likelyLangList = []likelyScriptRegion{} 1226 likelyRegionList = []likelyLangScript{} 1227 ) 1228 type fromTo struct { 1229 from, to []string 1230 } 1231 langToOther := map[int][]fromTo{} 1232 regionToOther := map[int][]fromTo{} 1233 for _, m := range b.supp.LikelySubtags.LikelySubtag { 1234 from := strings.Split(m.From, "_") 1235 to := strings.Split(m.To, "_") 1236 if len(to) != 3 { 1237 log.Fatalf("invalid number of subtags in %q: found %d, want 3", m.To, len(to)) 1238 } 1239 if len(from) > 3 { 1240 log.Fatalf("invalid number of subtags: found %d, want 1-3", len(from)) 1241 } 1242 if from[0] != to[0] && from[0] != "und" { 1243 log.Fatalf("unexpected language change in expansion: %s -> %s", from, to) 1244 } 1245 if len(from) == 3 { 1246 if from[2] != to[2] { 1247 log.Fatalf("unexpected region change in expansion: %s -> %s", from, to) 1248 } 1249 if from[0] != "und" { 1250 log.Fatalf("unexpected fully specified from tag: %s -> %s", from, to) 1251 } 1252 } 1253 if len(from) == 1 || from[0] != "und" { 1254 id := 0 1255 if from[0] != "und" { 1256 id = b.lang.index(from[0]) 1257 } 1258 langToOther[id] = append(langToOther[id], fromTo{from, to}) 1259 } else if len(from) == 2 && len(from[1]) == 4 { 1260 sid := b.script.index(from[1]) 1261 likelyScript[sid].lang = uint16(b.langIndex(to[0])) 1262 likelyScript[sid].region = uint16(b.region.index(to[2])) 1263 } else { 1264 r := b.region.index(from[len(from)-1]) 1265 if id, ok := b.groups[r]; ok { 1266 if from[0] != "und" { 1267 log.Fatalf("region changed unexpectedly: %s -> %s", from, to) 1268 } 1269 likelyRegionGroup[id].lang = uint16(b.langIndex(to[0])) 1270 likelyRegionGroup[id].script = uint8(b.script.index(to[1])) 1271 likelyRegionGroup[id].region = uint16(b.region.index(to[2])) 1272 } else { 1273 regionToOther[r] = append(regionToOther[r], fromTo{from, to}) 1274 } 1275 } 1276 } 1277 b.writeType(likelyLangRegion{}) 1278 b.writeSlice("likelyScript", likelyScript) 1279 1280 for id := range b.lang.s { 1281 list := langToOther[id] 1282 if len(list) == 1 { 1283 likelyLang[id].region = uint16(b.region.index(list[0].to[2])) 1284 likelyLang[id].script = uint8(b.script.index(list[0].to[1])) 1285 } else if len(list) > 1 { 1286 likelyLang[id].flags = isList 1287 likelyLang[id].region = uint16(len(likelyLangList)) 1288 likelyLang[id].script = uint8(len(list)) 1289 for _, x := range list { 1290 flags := uint8(0) 1291 if len(x.from) > 1 { 1292 if x.from[1] == x.to[2] { 1293 flags = regionInFrom 1294 } else { 1295 flags = scriptInFrom 1296 } 1297 } 1298 likelyLangList = append(likelyLangList, likelyScriptRegion{ 1299 region: uint16(b.region.index(x.to[2])), 1300 script: uint8(b.script.index(x.to[1])), 1301 flags: flags, 1302 }) 1303 } 1304 } 1305 } 1306 // TODO: merge suppressScript data with this table. 1307 b.writeType(likelyScriptRegion{}) 1308 b.writeSlice("likelyLang", likelyLang) 1309 b.writeSlice("likelyLangList", likelyLangList) 1310 1311 for id := range b.region.s { 1312 list := regionToOther[id] 1313 if len(list) == 1 { 1314 likelyRegion[id].lang = uint16(b.langIndex(list[0].to[0])) 1315 likelyRegion[id].script = uint8(b.script.index(list[0].to[1])) 1316 if len(list[0].from) > 2 { 1317 likelyRegion[id].flags = scriptInFrom 1318 } 1319 } else if len(list) > 1 { 1320 likelyRegion[id].flags = isList 1321 likelyRegion[id].lang = uint16(len(likelyRegionList)) 1322 likelyRegion[id].script = uint8(len(list)) 1323 for i, x := range list { 1324 if len(x.from) == 2 && i != 0 || i > 0 && len(x.from) != 3 { 1325 log.Fatalf("unspecified script must be first in list: %v at %d", x.from, i) 1326 } 1327 x := likelyLangScript{ 1328 lang: uint16(b.langIndex(x.to[0])), 1329 script: uint8(b.script.index(x.to[1])), 1330 } 1331 if len(list[0].from) > 2 { 1332 x.flags = scriptInFrom 1333 } 1334 likelyRegionList = append(likelyRegionList, x) 1335 } 1336 } 1337 } 1338 b.writeType(likelyLangScript{}) 1339 b.writeSlice("likelyRegion", likelyRegion) 1340 b.writeSlice("likelyRegionList", likelyRegionList) 1341 1342 b.writeType(likelyTag{}) 1343 b.writeSlice("likelyRegionGroup", likelyRegionGroup) 1344 } 1345 1346 type mutualIntelligibility struct { 1347 want, have uint16 1348 conf uint8 1349 oneway bool 1350 } 1351 1352 type scriptIntelligibility struct { 1353 lang uint16 // langID or 0 if * 1354 want, have uint8 1355 conf uint8 1356 } 1357 1358 type sortByConf []mutualIntelligibility 1359 1360 func (l sortByConf) Less(a, b int) bool { 1361 return l[a].conf > l[b].conf 1362 } 1363 1364 func (l sortByConf) Swap(a, b int) { 1365 l[a], l[b] = l[b], l[a] 1366 } 1367 1368 func (l sortByConf) Len() int { 1369 return len(l) 1370 } 1371 1372 // toConf converts a percentage value [0, 100] to a confidence class. 1373 func toConf(pct uint8) uint8 { 1374 switch { 1375 case pct == 100: 1376 return 3 // Exact 1377 case pct >= 90: 1378 return 2 // High 1379 case pct > 50: 1380 return 1 // Low 1381 default: 1382 return 0 // No 1383 } 1384 } 1385 1386 // writeMatchData writes tables with languages and scripts for which there is 1387 // mutual intelligibility. The data is based on CLDR's languageMatching data. 1388 // Note that we use a different algorithm than the one defined by CLDR and that 1389 // we slightly modify the data. For example, we convert scores to confidence levels. 1390 // We also drop all region-related data as we use a different algorithm to 1391 // determine region equivalence. 1392 func (b *builder) writeMatchData() { 1393 b.writeType(mutualIntelligibility{}) 1394 b.writeType(scriptIntelligibility{}) 1395 lm := b.supp.LanguageMatching.LanguageMatches 1396 cldr.MakeSlice(&lm).SelectAnyOf("type", "written") 1397 1398 matchLang := []mutualIntelligibility{} 1399 matchScript := []scriptIntelligibility{} 1400 // Convert the languageMatch entries in lists keyed by desired language. 1401 for _, m := range lm[0].LanguageMatch { 1402 // Different versions of CLDR use different separators. 1403 desired := strings.Replace(m.Desired, "-", "_", -1) 1404 supported := strings.Replace(m.Supported, "-", "_", -1) 1405 d := strings.Split(desired, "_") 1406 s := strings.Split(supported, "_") 1407 if len(d) != len(s) || len(d) > 2 { 1408 // Skip all entries with regions and work around CLDR bug. 1409 continue 1410 } 1411 pct, _ := strconv.ParseInt(m.Percent, 10, 8) 1412 if len(d) == 2 && d[0] == s[0] && len(d[1]) == 4 { 1413 // language-script pair. 1414 lang := uint16(0) 1415 if d[0] != "*" { 1416 lang = uint16(b.langIndex(d[0])) 1417 } 1418 matchScript = append(matchScript, scriptIntelligibility{ 1419 lang: lang, 1420 want: uint8(b.script.index(d[1])), 1421 have: uint8(b.script.index(s[1])), 1422 conf: toConf(uint8(pct)), 1423 }) 1424 if m.Oneway != "true" { 1425 matchScript = append(matchScript, scriptIntelligibility{ 1426 lang: lang, 1427 want: uint8(b.script.index(s[1])), 1428 have: uint8(b.script.index(d[1])), 1429 conf: toConf(uint8(pct)), 1430 }) 1431 } 1432 } else if len(d) == 1 && d[0] != "*" { 1433 if pct == 100 { 1434 // nb == no is already handled by macro mapping. Check there 1435 // really is only this case. 1436 if d[0] != "no" || s[0] != "nb" { 1437 log.Fatalf("unhandled equivalence %s == %s", s[0], d[0]) 1438 } 1439 continue 1440 } 1441 matchLang = append(matchLang, mutualIntelligibility{ 1442 want: uint16(b.langIndex(d[0])), 1443 have: uint16(b.langIndex(s[0])), 1444 conf: uint8(pct), 1445 oneway: m.Oneway == "true", 1446 }) 1447 } else { 1448 // TODO: Handle other mappings. 1449 a := []string{"*;*", "*_*;*_*", "es_MX;es_419"} 1450 s := strings.Join([]string{desired, supported}, ";") 1451 if i := sort.SearchStrings(a, s); i == len(a) || a[i] != s { 1452 log.Printf("%q not handled", s) 1453 } 1454 } 1455 } 1456 sort.Sort(sortByConf(matchLang)) 1457 // collapse percentage into confidence classes 1458 for i, m := range matchLang { 1459 matchLang[i].conf = toConf(m.conf) 1460 } 1461 b.writeSlice("matchLang", matchLang) 1462 b.writeSlice("matchScript", matchScript) 1463 } 1464 1465 func (b *builder) writeRegionInclusionData() { 1466 var ( 1467 // mm holds for each group the set of groups with a distance of 1. 1468 mm = make(map[int][]index) 1469 1470 // containment holds for each group the transitive closure of 1471 // containment of other groups. 1472 containment = make(map[index][]index) 1473 ) 1474 for _, g := range b.supp.TerritoryContainment.Group { 1475 group := b.region.index(g.Type) 1476 groupIdx := b.groups[group] 1477 for _, mem := range strings.Split(g.Contains, " ") { 1478 r := b.region.index(mem) 1479 mm[r] = append(mm[r], groupIdx) 1480 if g, ok := b.groups[r]; ok { 1481 mm[group] = append(mm[group], g) 1482 containment[groupIdx] = append(containment[groupIdx], g) 1483 } 1484 } 1485 } 1486 1487 regionContainment := make([]uint32, len(b.groups)) 1488 for _, g := range b.groups { 1489 l := containment[g] 1490 1491 // Compute the transitive closure of containment. 1492 for i := 0; i < len(l); i++ { 1493 l = append(l, containment[l[i]]...) 1494 } 1495 1496 // Compute the bitmask. 1497 regionContainment[g] = 1 << g 1498 for _, v := range l { 1499 regionContainment[g] |= 1 << v 1500 } 1501 // log.Printf("%d: %X", g, regionContainment[g]) 1502 } 1503 b.writeSlice("regionContainment", regionContainment) 1504 1505 regionInclusion := make([]uint8, len(b.region.s)) 1506 bvs := make(map[uint32]index) 1507 // Make the first bitvector positions correspond with the groups. 1508 for r, i := range b.groups { 1509 bv := uint32(1 << i) 1510 for _, g := range mm[r] { 1511 bv |= 1 << g 1512 } 1513 bvs[bv] = i 1514 regionInclusion[r] = uint8(bvs[bv]) 1515 } 1516 for r := 1; r < len(b.region.s); r++ { 1517 if _, ok := b.groups[r]; !ok { 1518 bv := uint32(0) 1519 for _, g := range mm[r] { 1520 bv |= 1 << g 1521 } 1522 if bv == 0 { 1523 // Pick the world for unspecified regions. 1524 bv = 1 << b.groups[b.region.index("001")] 1525 } 1526 if _, ok := bvs[bv]; !ok { 1527 bvs[bv] = index(len(bvs)) 1528 } 1529 regionInclusion[r] = uint8(bvs[bv]) 1530 } 1531 } 1532 b.writeSlice("regionInclusion", regionInclusion) 1533 regionInclusionBits := make([]uint32, len(bvs)) 1534 for k, v := range bvs { 1535 regionInclusionBits[v] = uint32(k) 1536 } 1537 // Add bit vectors for increasingly large distances until a fixed point is reached. 1538 regionInclusionNext := []uint8{} 1539 for i := 0; i < len(regionInclusionBits); i++ { 1540 bits := regionInclusionBits[i] 1541 next := bits 1542 for i := uint(0); i < uint(len(b.groups)); i++ { 1543 if bits&(1<<i) != 0 { 1544 next |= regionInclusionBits[i] 1545 } 1546 } 1547 if _, ok := bvs[next]; !ok { 1548 bvs[next] = index(len(bvs)) 1549 regionInclusionBits = append(regionInclusionBits, next) 1550 } 1551 regionInclusionNext = append(regionInclusionNext, uint8(bvs[next])) 1552 } 1553 b.writeSlice("regionInclusionBits", regionInclusionBits) 1554 b.writeSlice("regionInclusionNext", regionInclusionNext) 1555 } 1556 1557 type parentRel struct { 1558 lang uint16 1559 script uint8 1560 maxScript uint8 1561 toRegion uint16 1562 fromRegion []uint16 1563 } 1564 1565 func (b *builder) writeParents() { 1566 b.writeType(parentRel{}) 1567 1568 parents := []parentRel{} 1569 1570 // Construct parent overrides. 1571 n := 0 1572 for _, p := range b.data.Supplemental().ParentLocales.ParentLocale { 1573 // Skipping non-standard scripts to root is implemented using addTags. 1574 if p.Parent == "root" { 1575 continue 1576 } 1577 1578 sub := strings.Split(p.Parent, "_") 1579 parent := parentRel{lang: b.langIndex(sub[0])} 1580 if len(sub) == 2 { 1581 // TODO: check that all undefined scripts are indeed Latn in these 1582 // cases. 1583 parent.maxScript = uint8(b.script.index("Latn")) 1584 parent.toRegion = uint16(b.region.index(sub[1])) 1585 } else { 1586 parent.script = uint8(b.script.index(sub[1])) 1587 parent.maxScript = parent.script 1588 parent.toRegion = uint16(b.region.index(sub[2])) 1589 } 1590 for _, c := range strings.Split(p.Locales, " ") { 1591 region := b.region.index(c[strings.LastIndex(c, "_")+1:]) 1592 parent.fromRegion = append(parent.fromRegion, uint16(region)) 1593 } 1594 parents = append(parents, parent) 1595 n += len(parent.fromRegion) 1596 } 1597 b.writeSliceAddSize("parents", n*2, parents) 1598 } 1599 1600 func main() { 1601 gen.Init() 1602 1603 gen.Repackage("gen_common.go", "common.go", "language") 1604 1605 w := gen.NewCodeWriter() 1606 defer w.WriteGoFile("tables.go", "language") 1607 1608 fmt.Fprintln(w, `import "golang.org/x/text/internal/tag"`) 1609 1610 b := newBuilder(w) 1611 gen.WriteCLDRVersion(w) 1612 1613 b.parseIndices() 1614 b.writeType(fromTo{}) 1615 b.writeLanguage() 1616 b.writeScript() 1617 b.writeRegion() 1618 b.writeVariant() 1619 // TODO: b.writeLocale() 1620 b.computeRegionGroups() 1621 b.writeLikelyData() 1622 b.writeMatchData() 1623 b.writeRegionInclusionData() 1624 b.writeParents() 1625 }