github.com/insionng/yougam@v0.0.0-20170714101924-2bc18d833463/libraries/golang/text/language/maketables.go (about) 1 // Copyright 2013 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // +build ignore 6 7 // Language tag table generator. 8 // Data read from the web. 9 10 package main 11 12 import ( 13 "bufio" 14 "flag" 15 "fmt" 16 "io" 17 "io/ioutil" 18 "log" 19 "math" 20 "reflect" 21 "regexp" 22 "sort" 23 "strconv" 24 "strings" 25 26 "github.com/insionng/yougam/libraries/x/text/internal/gen" 27 "github.com/insionng/yougam/libraries/x/text/internal/tag" 28 "github.com/insionng/yougam/libraries/x/text/unicode/cldr" 29 ) 30 31 var ( 32 test = flag.Bool("test", 33 false, 34 "test existing tables; can be used to compare web data with package data.") 35 outputFile = flag.String("output", 36 "tables.go", 37 "output file for generated tables") 38 ) 39 40 var comment = []string{ 41 ` 42 lang holds an alphabetically sorted list of ISO-639 language identifiers. 43 All entries are 4 bytes. The index of the identifier (divided by 4) is the language tag. 44 For 2-byte language identifiers, the two successive bytes have the following meaning: 45 - if the first letter of the 2- and 3-letter ISO codes are the same: 46 the second and third letter of the 3-letter ISO code. 47 - otherwise: a 0 and a by 2 bits right-shifted index into altLangISO3. 48 For 3-byte language identifiers the 4th byte is 0.`, 49 ` 50 langNoIndex is a bit vector of all 3-letter language codes that are not used as an index 51 in lookup tables. The language ids for these language codes are derived directly 52 from the letters and are not consecutive.`, 53 ` 54 altLangISO3 holds an alphabetically sorted list of 3-letter language code alternatives 55 to 2-letter language codes that cannot be derived using the method described above. 56 Each 3-letter code is followed by its 1-byte langID.`, 57 ` 58 altLangIndex is used to convert indexes in altLangISO3 to langIDs.`, 59 ` 60 langAliasMap maps langIDs to their suggested replacements.`, 61 ` 62 script is an alphabetically sorted list of ISO 15924 codes. The index 63 of the script in the string, divided by 4, is the internal scriptID.`, 64 ` 65 isoRegionOffset needs to be added to the index of regionISO to obtain the regionID 66 for 2-letter ISO codes. (The first isoRegionOffset regionIDs are reserved for 67 the UN.M49 codes used for groups.)`, 68 ` 69 regionISO holds a list of alphabetically sorted 2-letter ISO region codes. 70 Each 2-letter codes is followed by two bytes with the following meaning: 71 - [A-Z}{2}: the first letter of the 2-letter code plus these two 72 letters form the 3-letter ISO code. 73 - 0, n: index into altRegionISO3.`, 74 ` 75 regionTypes defines the status of a region for various standards.`, 76 ` 77 m49 maps regionIDs to UN.M49 codes. The first isoRegionOffset entries are 78 codes indicating collections of regions.`, 79 ` 80 m49Index gives indexes into fromM49 based on the three most significant bits 81 of a 10-bit UN.M49 code. To search an UN.M49 code in fromM49, search in 82 fromM49[m49Index[msb39(code)]:m49Index[msb3(code)+1]] 83 for an entry where the first 7 bits match the 7 lsb of the UN.M49 code. 84 The region code is stored in the 9 lsb of the indexed value.`, 85 ` 86 fromM49 contains entries to map UN.M49 codes to regions. See m49Index for details.`, 87 ` 88 altRegionISO3 holds a list of 3-letter region codes that cannot be 89 mapped to 2-letter codes using the default algorithm. This is a short list.`, 90 ` 91 altRegionIDs holds a list of regionIDs the positions of which match those 92 of the 3-letter ISO codes in altRegionISO3.`, 93 ` 94 variantNumSpecialized is the number of specialized variants in variants.`, 95 ` 96 suppressScript is an index from langID to the dominant script for that language, 97 if it exists. If a script is given, it should be suppressed from the language tag.`, 98 ` 99 likelyLang is a lookup table, indexed by langID, for the most likely 100 scripts and regions given incomplete information. If more entries exist for a 101 given language, region and script are the index and size respectively 102 of the list in likelyLangList.`, 103 ` 104 likelyLangList holds lists info associated with likelyLang.`, 105 ` 106 likelyRegion is a lookup table, indexed by regionID, for the most likely 107 languages and scripts given incomplete information. If more entries exist 108 for a given regionID, lang and script are the index and size respectively 109 of the list in likelyRegionList. 110 TODO: exclude containers and user-definable regions from the list.`, 111 ` 112 likelyRegionList holds lists info associated with likelyRegion.`, 113 ` 114 likelyScript is a lookup table, indexed by scriptID, for the most likely 115 languages and regions given a script.`, 116 ` 117 matchLang holds pairs of langIDs of base languages that are typically 118 mutually intelligible. Each pair is associated with a confidence and 119 whether the intelligibility goes one or both ways.`, 120 ` 121 matchScript holds pairs of scriptIDs where readers of one script 122 can typically also read the other. Each is associated with a confidence.`, 123 ` 124 nRegionGroups is the number of region groups.`, 125 ` 126 regionInclusion maps region identifiers to sets of regions in regionInclusionBits, 127 where each set holds all groupings that are directly connected in a region 128 containment graph.`, 129 ` 130 regionInclusionBits is an array of bit vectors where every vector represents 131 a set of region groupings. These sets are used to compute the distance 132 between two regions for the purpose of language matching.`, 133 ` 134 regionInclusionNext marks, for each entry in regionInclusionBits, the set of 135 all groups that are reachable from the groups set in the respective entry.`, 136 } 137 138 // TODO: consider changing some of these structures to tries. This can reduce 139 // memory, but may increase the need for memory allocations. This could be 140 // mitigated if we can piggyback on language tags for common cases. 141 142 func failOnError(e error) { 143 if e != nil { 144 log.Panic(e) 145 } 146 } 147 148 type setType int 149 150 const ( 151 Indexed setType = 1 + iota // all elements must be of same size 152 Linear 153 ) 154 155 type stringSet struct { 156 s []string 157 sorted, frozen bool 158 159 // We often need to update values after the creation of an index is completed. 160 // We include a convenience map for keeping track of this. 161 update map[string]string 162 typ setType // used for checking. 163 } 164 165 func (ss *stringSet) clone() stringSet { 166 c := *ss 167 c.s = append([]string(nil), c.s...) 168 return c 169 } 170 171 func (ss *stringSet) setType(t setType) { 172 if ss.typ != t && ss.typ != 0 { 173 log.Panicf("type %d cannot be assigned as it was already %d", t, ss.typ) 174 } 175 } 176 177 // parse parses a whitespace-separated string and initializes ss with its 178 // components. 179 func (ss *stringSet) parse(s string) { 180 scan := bufio.NewScanner(strings.NewReader(s)) 181 scan.Split(bufio.ScanWords) 182 for scan.Scan() { 183 ss.add(scan.Text()) 184 } 185 } 186 187 func (ss *stringSet) assertChangeable() { 188 if ss.frozen { 189 log.Panic("attempt to modify a frozen stringSet") 190 } 191 } 192 193 func (ss *stringSet) add(s string) { 194 ss.assertChangeable() 195 ss.s = append(ss.s, s) 196 ss.sorted = ss.frozen 197 } 198 199 func (ss *stringSet) freeze() { 200 ss.compact() 201 ss.frozen = true 202 } 203 204 func (ss *stringSet) compact() { 205 if ss.sorted { 206 return 207 } 208 a := ss.s 209 sort.Strings(a) 210 k := 0 211 for i := 1; i < len(a); i++ { 212 if a[k] != a[i] { 213 a[k+1] = a[i] 214 k++ 215 } 216 } 217 ss.s = a[:k+1] 218 ss.sorted = ss.frozen 219 } 220 221 type funcSorter struct { 222 fn func(a, b string) bool 223 sort.StringSlice 224 } 225 226 func (s funcSorter) Less(i, j int) bool { 227 return s.fn(s.StringSlice[i], s.StringSlice[j]) 228 } 229 230 func (ss *stringSet) sortFunc(f func(a, b string) bool) { 231 ss.compact() 232 sort.Sort(funcSorter{f, sort.StringSlice(ss.s)}) 233 } 234 235 func (ss *stringSet) remove(s string) { 236 ss.assertChangeable() 237 if i, ok := ss.find(s); ok { 238 copy(ss.s[i:], ss.s[i+1:]) 239 ss.s = ss.s[:len(ss.s)-1] 240 } 241 } 242 243 func (ss *stringSet) replace(ol, nu string) { 244 ss.s[ss.index(ol)] = nu 245 ss.sorted = ss.frozen 246 } 247 248 func (ss *stringSet) index(s string) int { 249 ss.setType(Indexed) 250 i, ok := ss.find(s) 251 if !ok { 252 if i < len(ss.s) { 253 log.Panicf("find: item %q is not in list. Closest match is %q.", s, ss.s[i]) 254 } 255 log.Panicf("find: item %q is not in list", s) 256 257 } 258 return i 259 } 260 261 func (ss *stringSet) find(s string) (int, bool) { 262 ss.compact() 263 i := sort.SearchStrings(ss.s, s) 264 return i, i != len(ss.s) && ss.s[i] == s 265 } 266 267 func (ss *stringSet) slice() []string { 268 ss.compact() 269 return ss.s 270 } 271 272 func (ss *stringSet) updateLater(v, key string) { 273 if ss.update == nil { 274 ss.update = map[string]string{} 275 } 276 ss.update[v] = key 277 } 278 279 // join joins the string and ensures that all entries are of the same length. 280 func (ss *stringSet) join() string { 281 ss.setType(Indexed) 282 n := len(ss.s[0]) 283 for _, s := range ss.s { 284 if len(s) != n { 285 log.Panicf("join: not all entries are of the same length: %q", s) 286 } 287 } 288 ss.s = append(ss.s, strings.Repeat("\xff", n)) 289 return strings.Join(ss.s, "") 290 } 291 292 // ianaEntry holds information for an entry in the IANA Language Subtag Repository. 293 // All types use the same entry. 294 // See http://tools.ietf.org/html/bcp47#section-5.1 for a description of the various 295 // fields. 296 type ianaEntry struct { 297 typ string 298 description []string 299 scope string 300 added string 301 preferred string 302 deprecated string 303 suppressScript string 304 macro string 305 prefix []string 306 } 307 308 type builder struct { 309 w *gen.CodeWriter 310 hw io.Writer // MultiWriter for w and w.Hash 311 data *cldr.CLDR 312 supp *cldr.SupplementalData 313 314 // indices 315 locale stringSet // common locales 316 lang stringSet // canonical language ids (2 or 3 letter ISO codes) with data 317 langNoIndex stringSet // 3-letter ISO codes with no associated data 318 script stringSet // 4-letter ISO codes 319 region stringSet // 2-letter ISO or 3-digit UN M49 codes 320 variant stringSet // 4-8-alphanumeric variant code. 321 322 // Region codes that are groups with their corresponding group IDs. 323 groups map[int]index 324 325 // langInfo 326 registry map[string]*ianaEntry 327 } 328 329 type index uint 330 331 func newBuilder(w *gen.CodeWriter) *builder { 332 r := gen.OpenCLDRCoreZip() 333 defer r.Close() 334 d := &cldr.Decoder{} 335 data, err := d.DecodeZip(r) 336 failOnError(err) 337 b := builder{ 338 w: w, 339 hw: io.MultiWriter(w, w.Hash), 340 data: data, 341 supp: data.Supplemental(), 342 } 343 b.parseRegistry() 344 return &b 345 } 346 347 func (b *builder) parseRegistry() { 348 r := gen.OpenIANAFile("assignments/language-subtag-registry") 349 defer r.Close() 350 b.registry = make(map[string]*ianaEntry) 351 352 scan := bufio.NewScanner(r) 353 scan.Split(bufio.ScanWords) 354 var record *ianaEntry 355 for more := scan.Scan(); more; { 356 key := scan.Text() 357 more = scan.Scan() 358 value := scan.Text() 359 switch key { 360 case "Type:": 361 record = &ianaEntry{typ: value} 362 case "Subtag:", "Tag:": 363 if s := strings.SplitN(value, "..", 2); len(s) > 1 { 364 for a := s[0]; a <= s[1]; a = inc(a) { 365 b.addToRegistry(a, record) 366 } 367 } else { 368 b.addToRegistry(value, record) 369 } 370 case "Suppress-Script:": 371 record.suppressScript = value 372 case "Added:": 373 record.added = value 374 case "Deprecated:": 375 record.deprecated = value 376 case "Macrolanguage:": 377 record.macro = value 378 case "Preferred-Value:": 379 record.preferred = value 380 case "Prefix:": 381 record.prefix = append(record.prefix, value) 382 case "Scope:": 383 record.scope = value 384 case "Description:": 385 buf := []byte(value) 386 for more = scan.Scan(); more; more = scan.Scan() { 387 b := scan.Bytes() 388 if b[0] == '%' || b[len(b)-1] == ':' { 389 break 390 } 391 buf = append(buf, ' ') 392 buf = append(buf, b...) 393 } 394 record.description = append(record.description, string(buf)) 395 continue 396 default: 397 continue 398 } 399 more = scan.Scan() 400 } 401 if scan.Err() != nil { 402 log.Panic(scan.Err()) 403 } 404 } 405 406 func (b *builder) addToRegistry(key string, entry *ianaEntry) { 407 if info, ok := b.registry[key]; ok { 408 if info.typ != "language" || entry.typ != "extlang" { 409 log.Fatalf("parseRegistry: tag %q already exists", key) 410 } 411 } else { 412 b.registry[key] = entry 413 } 414 } 415 416 var commentIndex = make(map[string]string) 417 418 func init() { 419 for _, s := range comment { 420 key := strings.TrimSpace(strings.SplitN(s, " ", 2)[0]) 421 commentIndex[key] = s 422 } 423 } 424 425 func (b *builder) comment(name string) { 426 if s := commentIndex[name]; len(s) > 0 { 427 b.w.WriteComment(s) 428 } else { 429 fmt.Fprintln(b.w) 430 } 431 } 432 433 func (b *builder) pf(f string, x ...interface{}) { 434 fmt.Fprintf(b.hw, f, x...) 435 fmt.Fprint(b.hw, "\n") 436 } 437 438 func (b *builder) p(x ...interface{}) { 439 fmt.Fprintln(b.hw, x...) 440 } 441 442 func (b *builder) addSize(s int) { 443 b.w.Size += s 444 b.pf("// Size: %d bytes", s) 445 } 446 447 func (b *builder) writeConst(name string, x interface{}) { 448 b.comment(name) 449 b.w.WriteConst(name, x) 450 } 451 452 // writeConsts computes f(v) for all v in values and writes the results 453 // as constants named _v to a single constant block. 454 func (b *builder) writeConsts(f func(string) int, values ...string) { 455 b.pf("const (") 456 for _, v := range values { 457 b.pf("\t_%s = %v", v, f(v)) 458 } 459 b.pf(")") 460 } 461 462 // writeType writes the type of the given value, which must be a struct. 463 func (b *builder) writeType(value interface{}) { 464 b.comment(reflect.TypeOf(value).Name()) 465 b.w.WriteType(value) 466 } 467 468 func (b *builder) writeSlice(name string, ss interface{}) { 469 b.writeSliceAddSize(name, 0, ss) 470 } 471 472 func (b *builder) writeSliceAddSize(name string, extraSize int, ss interface{}) { 473 b.comment(name) 474 b.w.Size += extraSize 475 v := reflect.ValueOf(ss) 476 t := v.Type().Elem() 477 b.pf("// Size: %d bytes, %d elements", v.Len()*int(t.Size())+extraSize, v.Len()) 478 479 fmt.Fprintf(b.w, "var %s = ", name) 480 b.w.WriteArray(ss) 481 b.p() 482 } 483 484 type fromTo struct { 485 from, to uint16 486 } 487 488 func (b *builder) writeSortedMap(name string, ss *stringSet, index func(s string) uint16) { 489 ss.sortFunc(func(a, b string) bool { 490 return index(a) < index(b) 491 }) 492 m := []fromTo{} 493 for _, s := range ss.s { 494 m = append(m, fromTo{index(s), index(ss.update[s])}) 495 } 496 b.writeSlice(name, m) 497 } 498 499 const base = 'z' - 'a' + 1 500 501 func strToInt(s string) uint { 502 v := uint(0) 503 for i := 0; i < len(s); i++ { 504 v *= base 505 v += uint(s[i] - 'a') 506 } 507 return v 508 } 509 510 // converts the given integer to the original ASCII string passed to strToInt. 511 // len(s) must match the number of characters obtained. 512 func intToStr(v uint, s []byte) { 513 for i := len(s) - 1; i >= 0; i-- { 514 s[i] = byte(v%base) + 'a' 515 v /= base 516 } 517 } 518 519 func (b *builder) writeBitVector(name string, ss []string) { 520 vec := make([]uint8, int(math.Ceil(math.Pow(base, float64(len(ss[0])))/8))) 521 for _, s := range ss { 522 v := strToInt(s) 523 vec[v/8] |= 1 << (v % 8) 524 } 525 b.writeSlice(name, vec) 526 } 527 528 // TODO: convert this type into a list or two-stage trie. 529 func (b *builder) writeMapFunc(name string, m map[string]string, f func(string) uint16) { 530 b.comment(name) 531 v := reflect.ValueOf(m) 532 sz := v.Len() * (2 + int(v.Type().Key().Size())) 533 for _, k := range m { 534 sz += len(k) 535 } 536 b.addSize(sz) 537 keys := []string{} 538 b.pf(`var %s = map[string]uint16{`, name) 539 for k := range m { 540 keys = append(keys, k) 541 } 542 sort.Strings(keys) 543 for _, k := range keys { 544 b.pf("\t%q: %v,", k, f(m[k])) 545 } 546 b.p("}") 547 } 548 549 func (b *builder) writeMap(name string, m interface{}) { 550 b.comment(name) 551 v := reflect.ValueOf(m) 552 sz := v.Len() * (2 + int(v.Type().Key().Size()) + int(v.Type().Elem().Size())) 553 b.addSize(sz) 554 f := strings.FieldsFunc(fmt.Sprintf("%#v", m), func(r rune) bool { 555 return strings.IndexRune("{}, ", r) != -1 556 }) 557 sort.Strings(f[1:]) 558 b.pf(`var %s = %s{`, name, f[0]) 559 for _, kv := range f[1:] { 560 b.pf("\t%s,", kv) 561 } 562 b.p("}") 563 } 564 565 func (b *builder) langIndex(s string) uint16 { 566 if s == "und" { 567 return 0 568 } 569 if i, ok := b.lang.find(s); ok { 570 return uint16(i) 571 } 572 return uint16(strToInt(s)) + uint16(len(b.lang.s)) 573 } 574 575 // inc advances the string to its lexicographical successor. 576 func inc(s string) string { 577 const maxTagLength = 4 578 var buf [maxTagLength]byte 579 intToStr(strToInt(strings.ToLower(s))+1, buf[:len(s)]) 580 for i := 0; i < len(s); i++ { 581 if s[i] <= 'Z' { 582 buf[i] -= 'a' - 'A' 583 } 584 } 585 return string(buf[:len(s)]) 586 } 587 588 func (b *builder) parseIndices() { 589 meta := b.supp.Metadata 590 591 for k, v := range b.registry { 592 var ss *stringSet 593 switch v.typ { 594 case "language": 595 if len(k) == 2 || v.suppressScript != "" || v.scope == "special" { 596 b.lang.add(k) 597 continue 598 } else { 599 ss = &b.langNoIndex 600 } 601 case "region": 602 ss = &b.region 603 case "script": 604 ss = &b.script 605 case "variant": 606 ss = &b.variant 607 default: 608 continue 609 } 610 ss.add(k) 611 } 612 // Include any language for which there is data. 613 for _, lang := range b.data.Locales() { 614 if x := b.data.RawLDML(lang); false || 615 x.LocaleDisplayNames != nil || 616 x.Characters != nil || 617 x.Delimiters != nil || 618 x.Measurement != nil || 619 x.Dates != nil || 620 x.Numbers != nil || 621 x.Units != nil || 622 x.ListPatterns != nil || 623 x.Collations != nil || 624 x.Segmentations != nil || 625 x.Rbnf != nil || 626 x.Annotations != nil || 627 x.Metadata != nil { 628 629 from := strings.Split(lang, "_") 630 if lang := from[0]; lang != "root" { 631 b.lang.add(lang) 632 } 633 } 634 } 635 // Include locales for plural rules, which uses a different structure. 636 for _, plurals := range b.data.Supplemental().Plurals { 637 for _, rules := range plurals.PluralRules { 638 for _, lang := range strings.Split(rules.Locales, " ") { 639 if lang = strings.Split(lang, "_")[0]; lang != "root" { 640 b.lang.add(lang) 641 } 642 } 643 } 644 } 645 // Include languages in likely subtags. 646 for _, m := range b.supp.LikelySubtags.LikelySubtag { 647 from := strings.Split(m.From, "_") 648 b.lang.add(from[0]) 649 } 650 // Include ISO-639 alpha-3 bibliographic entries. 651 for _, a := range meta.Alias.LanguageAlias { 652 if a.Reason == "bibliographic" { 653 b.langNoIndex.add(a.Type) 654 } 655 } 656 // Include regions in territoryAlias (not all are in the IANA registry!) 657 for _, reg := range b.supp.Metadata.Alias.TerritoryAlias { 658 if len(reg.Type) == 2 { 659 b.region.add(reg.Type) 660 } 661 } 662 663 for _, s := range b.lang.s { 664 if len(s) == 3 { 665 b.langNoIndex.remove(s) 666 } 667 } 668 b.writeConst("numLanguages", len(b.lang.slice())+len(b.langNoIndex.slice())) 669 b.writeConst("numScripts", len(b.script.slice())) 670 b.writeConst("numRegions", len(b.region.slice())) 671 672 // Add dummy codes at the start of each list to represent "unspecified". 673 b.lang.add("---") 674 b.script.add("----") 675 b.region.add("---") 676 677 // common locales 678 b.locale.parse(meta.DefaultContent.Locales) 679 } 680 681 func (b *builder) computeRegionGroups() { 682 b.groups = make(map[int]index) 683 684 // Create group indices. 685 for i := 1; b.region.s[i][0] < 'A'; i++ { // Base M49 indices on regionID. 686 b.groups[i] = index(len(b.groups)) 687 } 688 for _, g := range b.supp.TerritoryContainment.Group { 689 group := b.region.index(g.Type) 690 if _, ok := b.groups[group]; !ok { 691 b.groups[group] = index(len(b.groups)) 692 } 693 } 694 if len(b.groups) > 32 { 695 log.Fatalf("only 32 groups supported, found %d", len(b.groups)) 696 } 697 b.writeConst("nRegionGroups", len(b.groups)) 698 } 699 700 var langConsts = []string{ 701 "af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es", 702 "et", "fa", "fi", "fil", "fr", "gu", "he", "hi", "hr", "hu", "hy", "id", "is", 703 "it", "ja", "ka", "kk", "km", "kn", "ko", "ky", "lo", "lt", "lv", "mk", "ml", 704 "mn", "mo", "mr", "ms", "mul", "my", "nb", "ne", "nl", "no", "pa", "pl", "pt", 705 "ro", "ru", "sh", "si", "sk", "sl", "sq", "sr", "sv", "sw", "ta", "te", "th", 706 "tl", "tn", "tr", "uk", "ur", "uz", "vi", "zh", "zu", 707 708 // constants for grandfathered tags (if not already defined) 709 "jbo", "ami", "bnn", "hak", "tlh", "lb", "nv", "pwn", "tao", "tay", "tsu", 710 "nn", "sfb", "vgt", "sgg", "cmn", "nan", "hsn", 711 } 712 713 // writeLanguage generates all tables needed for language canonicalization. 714 func (b *builder) writeLanguage() { 715 meta := b.supp.Metadata 716 717 b.writeConst("nonCanonicalUnd", b.lang.index("und")) 718 b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...) 719 b.writeConst("langPrivateStart", b.langIndex("qaa")) 720 b.writeConst("langPrivateEnd", b.langIndex("qtz")) 721 722 // Get language codes that need to be mapped (overlong 3-letter codes, 723 // deprecated 2-letter codes, legacy and grandfathered tags.) 724 langAliasMap := stringSet{} 725 aliasTypeMap := map[string]langAliasType{} 726 727 // altLangISO3 get the alternative ISO3 names that need to be mapped. 728 altLangISO3 := stringSet{} 729 // Add dummy start to avoid the use of index 0. 730 altLangISO3.add("---") 731 altLangISO3.updateLater("---", "aa") 732 733 lang := b.lang.clone() 734 for _, a := range meta.Alias.LanguageAlias { 735 if a.Replacement == "" { 736 a.Replacement = "und" 737 } 738 // TODO: support mapping to tags 739 repl := strings.SplitN(a.Replacement, "_", 2)[0] 740 if a.Reason == "overlong" { 741 if len(a.Replacement) == 2 && len(a.Type) == 3 { 742 lang.updateLater(a.Replacement, a.Type) 743 } 744 } else if len(a.Type) <= 3 { 745 switch a.Reason { 746 case "macrolanguage": 747 aliasTypeMap[a.Type] = langMacro 748 case "deprecated": 749 // handled elsewhere 750 continue 751 case "bibliographic", "legacy": 752 if a.Type == "no" { 753 continue 754 } 755 aliasTypeMap[a.Type] = langLegacy 756 default: 757 log.Fatalf("new %s alias: %s", a.Reason, a.Type) 758 } 759 langAliasMap.add(a.Type) 760 langAliasMap.updateLater(a.Type, repl) 761 } 762 } 763 // Manually add the mapping of "nb" (Norwegian) to its macro language. 764 // This can be removed if CLDR adopts this change. 765 langAliasMap.add("nb") 766 langAliasMap.updateLater("nb", "no") 767 aliasTypeMap["nb"] = langMacro 768 769 for k, v := range b.registry { 770 // Also add deprecated values for 3-letter ISO codes, which CLDR omits. 771 if v.typ == "language" && v.deprecated != "" && v.preferred != "" { 772 langAliasMap.add(k) 773 langAliasMap.updateLater(k, v.preferred) 774 aliasTypeMap[k] = langDeprecated 775 } 776 } 777 // Fix CLDR mappings. 778 lang.updateLater("tl", "tgl") 779 lang.updateLater("sh", "hbs") 780 lang.updateLater("mo", "mol") 781 lang.updateLater("no", "nor") 782 lang.updateLater("tw", "twi") 783 lang.updateLater("nb", "nob") 784 lang.updateLater("ak", "aka") 785 786 // Ensure that each 2-letter code is matched with a 3-letter code. 787 for _, v := range lang.s[1:] { 788 s, ok := lang.update[v] 789 if !ok { 790 if s, ok = lang.update[langAliasMap.update[v]]; !ok { 791 continue 792 } 793 lang.update[v] = s 794 } 795 if v[0] != s[0] { 796 altLangISO3.add(s) 797 altLangISO3.updateLater(s, v) 798 } 799 } 800 801 // Complete canonialized language tags. 802 lang.freeze() 803 for i, v := range lang.s { 804 // We can avoid these manual entries by using the IANI registry directly. 805 // Seems easier to update the list manually, as changes are rare. 806 // The panic in this loop will trigger if we miss an entry. 807 add := "" 808 if s, ok := lang.update[v]; ok { 809 if s[0] == v[0] { 810 add = s[1:] 811 } else { 812 add = string([]byte{0, byte(altLangISO3.index(s))}) 813 } 814 } else if len(v) == 3 { 815 add = "\x00" 816 } else { 817 log.Panicf("no data for long form of %q", v) 818 } 819 lang.s[i] += add 820 } 821 b.writeConst("lang", tag.Index(lang.join())) 822 823 b.writeConst("langNoIndexOffset", len(b.lang.s)) 824 825 // space of all valid 3-letter language identifiers. 826 b.writeBitVector("langNoIndex", b.langNoIndex.slice()) 827 828 altLangIndex := []uint16{} 829 for i, s := range altLangISO3.slice() { 830 altLangISO3.s[i] += string([]byte{byte(len(altLangIndex))}) 831 if i > 0 { 832 idx := b.lang.index(altLangISO3.update[s]) 833 altLangIndex = append(altLangIndex, uint16(idx)) 834 } 835 } 836 b.writeConst("altLangISO3", tag.Index(altLangISO3.join())) 837 b.writeSlice("altLangIndex", altLangIndex) 838 839 b.writeSortedMap("langAliasMap", &langAliasMap, b.langIndex) 840 types := make([]langAliasType, len(langAliasMap.s)) 841 for i, s := range langAliasMap.s { 842 types[i] = aliasTypeMap[s] 843 } 844 b.writeSlice("langAliasTypes", types) 845 } 846 847 var scriptConsts = []string{ 848 "Latn", "Hani", "Hans", "Hant", "Qaaa", "Qaai", "Qabx", "Zinh", "Zyyy", 849 "Zzzz", 850 } 851 852 func (b *builder) writeScript() { 853 b.writeConsts(b.script.index, scriptConsts...) 854 b.writeConst("script", tag.Index(b.script.join())) 855 856 supp := make([]uint8, len(b.lang.slice())) 857 for i, v := range b.lang.slice()[1:] { 858 if sc := b.registry[v].suppressScript; sc != "" { 859 supp[i+1] = uint8(b.script.index(sc)) 860 } 861 } 862 b.writeSlice("suppressScript", supp) 863 864 // There is only one deprecated script in CLDR. This value is hard-coded. 865 // We check here if the code must be updated. 866 for _, a := range b.supp.Metadata.Alias.ScriptAlias { 867 if a.Type != "Qaai" { 868 log.Panicf("unexpected deprecated stript %q", a.Type) 869 } 870 } 871 } 872 873 func parseM49(s string) int16 { 874 if len(s) == 0 { 875 return 0 876 } 877 v, err := strconv.ParseUint(s, 10, 10) 878 failOnError(err) 879 return int16(v) 880 } 881 882 var regionConsts = []string{ 883 "001", "419", "BR", "CA", "ES", "GB", "MD", "PT", "UK", "US", 884 "ZZ", "XA", "XC", "XK", // Unofficial tag for Kosovo. 885 } 886 887 func (b *builder) writeRegion() { 888 b.writeConsts(b.region.index, regionConsts...) 889 890 isoOffset := b.region.index("AA") 891 m49map := make([]int16, len(b.region.slice())) 892 fromM49map := make(map[int16]int) 893 altRegionISO3 := "" 894 altRegionIDs := []uint16{} 895 896 b.writeConst("isoRegionOffset", isoOffset) 897 898 // 2-letter region lookup and mapping to numeric codes. 899 regionISO := b.region.clone() 900 regionISO.s = regionISO.s[isoOffset:] 901 regionISO.sorted = false 902 903 regionTypes := make([]byte, len(b.region.s)) 904 905 // Is the region valid BCP 47? 906 for s, e := range b.registry { 907 if len(s) == 2 && s == strings.ToUpper(s) { 908 i := b.region.index(s) 909 for _, d := range e.description { 910 if strings.Contains(d, "Private use") { 911 regionTypes[i] = iso3166UserAssgined 912 } 913 } 914 regionTypes[i] |= bcp47Region 915 } 916 } 917 918 // Is the region a valid ccTLD? 919 r := gen.OpenIANAFile("domains/root/db") 920 defer r.Close() 921 922 buf, err := ioutil.ReadAll(r) 923 failOnError(err) 924 re := regexp.MustCompile(`"/domains/root/db/([a-z]{2}).html"`) 925 for _, m := range re.FindAllSubmatch(buf, -1) { 926 i := b.region.index(strings.ToUpper(string(m[1]))) 927 regionTypes[i] |= ccTLD 928 } 929 930 b.writeSlice("regionTypes", regionTypes) 931 932 iso3Set := make(map[string]int) 933 update := func(iso2, iso3 string) { 934 i := regionISO.index(iso2) 935 if j, ok := iso3Set[iso3]; !ok && iso3[0] == iso2[0] { 936 regionISO.s[i] += iso3[1:] 937 iso3Set[iso3] = -1 938 } else { 939 if ok && j >= 0 { 940 regionISO.s[i] += string([]byte{0, byte(j)}) 941 } else { 942 iso3Set[iso3] = len(altRegionISO3) 943 regionISO.s[i] += string([]byte{0, byte(len(altRegionISO3))}) 944 altRegionISO3 += iso3 945 altRegionIDs = append(altRegionIDs, uint16(isoOffset+i)) 946 } 947 } 948 } 949 for _, tc := range b.supp.CodeMappings.TerritoryCodes { 950 i := regionISO.index(tc.Type) + isoOffset 951 if d := m49map[i]; d != 0 { 952 log.Panicf("%s found as a duplicate UN.M49 code of %03d", tc.Numeric, d) 953 } 954 m49 := parseM49(tc.Numeric) 955 m49map[i] = m49 956 if r := fromM49map[m49]; r == 0 { 957 fromM49map[m49] = i 958 } else if r != i { 959 dep := b.registry[regionISO.s[r-isoOffset]].deprecated 960 if t := b.registry[tc.Type]; t != nil && dep != "" && (t.deprecated == "" || t.deprecated > dep) { 961 fromM49map[m49] = i 962 } 963 } 964 } 965 for _, ta := range b.supp.Metadata.Alias.TerritoryAlias { 966 if len(ta.Type) == 3 && ta.Type[0] <= '9' && len(ta.Replacement) == 2 { 967 from := parseM49(ta.Type) 968 if r := fromM49map[from]; r == 0 { 969 fromM49map[from] = regionISO.index(ta.Replacement) + isoOffset 970 } 971 } 972 } 973 for _, tc := range b.supp.CodeMappings.TerritoryCodes { 974 if len(tc.Alpha3) == 3 { 975 update(tc.Type, tc.Alpha3) 976 } 977 } 978 // This entries are not included in territoryCodes. Mostly 3-letter variants 979 // of deleted codes and an entry for QU. 980 for _, m := range []struct{ iso2, iso3 string }{ 981 {"CT", "CTE"}, 982 {"DY", "DHY"}, 983 {"HV", "HVO"}, 984 {"JT", "JTN"}, 985 {"MI", "MID"}, 986 {"NH", "NHB"}, 987 {"NQ", "ATN"}, 988 {"PC", "PCI"}, 989 {"PU", "PUS"}, 990 {"PZ", "PCZ"}, 991 {"RH", "RHO"}, 992 {"VD", "VDR"}, 993 {"WK", "WAK"}, 994 // These three-letter codes are used for others as well. 995 {"FQ", "ATF"}, 996 } { 997 update(m.iso2, m.iso3) 998 } 999 for i, s := range regionISO.s { 1000 if len(s) != 4 { 1001 regionISO.s[i] = s + " " 1002 } 1003 } 1004 b.writeConst("regionISO", tag.Index(regionISO.join())) 1005 b.writeConst("altRegionISO3", altRegionISO3) 1006 b.writeSlice("altRegionIDs", altRegionIDs) 1007 1008 // Create list of deprecated regions. 1009 // TODO: consider inserting SF -> FI. Not included by CLDR, but is the only 1010 // Transitionally-reserved mapping not included. 1011 regionOldMap := stringSet{} 1012 // Include regions in territoryAlias (not all are in the IANA registry!) 1013 for _, reg := range b.supp.Metadata.Alias.TerritoryAlias { 1014 if len(reg.Type) == 2 && reg.Reason == "deprecated" && len(reg.Replacement) == 2 { 1015 regionOldMap.add(reg.Type) 1016 regionOldMap.updateLater(reg.Type, reg.Replacement) 1017 i, _ := regionISO.find(reg.Type) 1018 j, _ := regionISO.find(reg.Replacement) 1019 if k := m49map[i+isoOffset]; k == 0 { 1020 m49map[i+isoOffset] = m49map[j+isoOffset] 1021 } 1022 } 1023 } 1024 b.writeSortedMap("regionOldMap", ®ionOldMap, func(s string) uint16 { 1025 return uint16(b.region.index(s)) 1026 }) 1027 // 3-digit region lookup, groupings. 1028 for i := 1; i < isoOffset; i++ { 1029 m := parseM49(b.region.s[i]) 1030 m49map[i] = m 1031 fromM49map[m] = i 1032 } 1033 b.writeSlice("m49", m49map) 1034 1035 const ( 1036 searchBits = 7 1037 regionBits = 9 1038 ) 1039 if len(m49map) >= 1<<regionBits { 1040 log.Fatalf("Maximum number of regions exceeded: %d > %d", len(m49map), 1<<regionBits) 1041 } 1042 m49Index := [9]int16{} 1043 fromM49 := []uint16{} 1044 m49 := []int{} 1045 for k, _ := range fromM49map { 1046 m49 = append(m49, int(k)) 1047 } 1048 sort.Ints(m49) 1049 for _, k := range m49[1:] { 1050 val := (k & (1<<searchBits - 1)) << regionBits 1051 fromM49 = append(fromM49, uint16(val|fromM49map[int16(k)])) 1052 m49Index[1:][k>>searchBits] = int16(len(fromM49)) 1053 } 1054 b.writeSlice("m49Index", m49Index) 1055 b.writeSlice("fromM49", fromM49) 1056 } 1057 1058 const ( 1059 // TODO: put these lists in regionTypes as user data? Could be used for 1060 // various optimizations and refinements and could be exposed in the API. 1061 iso3166Except = "AC CP DG EA EU FX IC SU TA UK" 1062 iso3166Trans = "AN BU CS NT TP YU ZR" // SF is not in our set of Regions. 1063 // DY and RH are actually not deleted, but indeterminately reserved. 1064 iso3166DelCLDR = "CT DD DY FQ HV JT MI NH NQ PC PU PZ RH VD WK YD" 1065 ) 1066 1067 const ( 1068 iso3166UserAssgined = 1 << iota 1069 ccTLD 1070 bcp47Region 1071 ) 1072 1073 func find(list []string, s string) int { 1074 for i, t := range list { 1075 if t == s { 1076 return i 1077 } 1078 } 1079 return -1 1080 } 1081 1082 // writeVariants generates per-variant information and creates a map from variant 1083 // name to index value. We assign index values such that sorting multiple 1084 // variants by index value will result in the correct order. 1085 // There are two types of variants: specialized and general. Specialized variants 1086 // are only applicable to certain language or language-script pairs. Generalized 1087 // variants apply to any language. Generalized variants always sort after 1088 // specialized variants. We will therefore always assign a higher index value 1089 // to a generalized variant than any other variant. Generalized variants are 1090 // sorted alphabetically among themselves. 1091 // Specialized variants may also sort after other specialized variants. Such 1092 // variants will be ordered after any of the variants they may follow. 1093 // We assume that if a variant x is followed by a variant y, then for any prefix 1094 // p of x, p-x is a prefix of y. This allows us to order tags based on the 1095 // maximum of the length of any of its prefixes. 1096 // TODO: it is possible to define a set of Prefix values on variants such that 1097 // a total order cannot be defined to the point that this algorithm breaks. 1098 // In other words, we cannot guarantee the same order of variants for the 1099 // future using the same algorithm or for non-compliant combinations of 1100 // variants. For this reason, consider using simple alphabetic sorting 1101 // of variants and ignore Prefix restrictions altogether. 1102 func (b *builder) writeVariant() { 1103 generalized := stringSet{} 1104 specialized := stringSet{} 1105 specializedExtend := stringSet{} 1106 // Collate the variants by type and check assumptions. 1107 for _, v := range b.variant.slice() { 1108 e := b.registry[v] 1109 if len(e.prefix) == 0 { 1110 generalized.add(v) 1111 continue 1112 } 1113 c := strings.Split(e.prefix[0], "-") 1114 hasScriptOrRegion := false 1115 if len(c) > 1 { 1116 _, hasScriptOrRegion = b.script.find(c[1]) 1117 if !hasScriptOrRegion { 1118 _, hasScriptOrRegion = b.region.find(c[1]) 1119 1120 } 1121 } 1122 if len(c) == 1 || len(c) == 2 && hasScriptOrRegion { 1123 // Variant is preceded by a language. 1124 specialized.add(v) 1125 continue 1126 } 1127 // Variant is preceded by another variant. 1128 specializedExtend.add(v) 1129 prefix := c[0] + "-" 1130 if hasScriptOrRegion { 1131 prefix += c[1] 1132 } 1133 for _, p := range e.prefix { 1134 // Verify that the prefix minus the last element is a prefix of the 1135 // predecessor element. 1136 i := strings.LastIndex(p, "-") 1137 pred := b.registry[p[i+1:]] 1138 if find(pred.prefix, p[:i]) < 0 { 1139 log.Fatalf("prefix %q for variant %q not consistent with predecessor spec", p, v) 1140 } 1141 // The sorting used below does not work in the general case. It works 1142 // if we assume that variants that may be followed by others only have 1143 // prefixes of the same length. Verify this. 1144 count := strings.Count(p[:i], "-") 1145 for _, q := range pred.prefix { 1146 if c := strings.Count(q, "-"); c != count { 1147 log.Fatalf("variant %q preceding %q has a prefix %q of size %d; want %d", p[i+1:], v, q, c, count) 1148 } 1149 } 1150 if !strings.HasPrefix(p, prefix) { 1151 log.Fatalf("prefix %q of variant %q should start with %q", p, v, prefix) 1152 } 1153 } 1154 } 1155 1156 // Sort extended variants. 1157 a := specializedExtend.s 1158 less := func(v, w string) bool { 1159 // Sort by the maximum number of elements. 1160 maxCount := func(s string) (max int) { 1161 for _, p := range b.registry[s].prefix { 1162 if c := strings.Count(p, "-"); c > max { 1163 max = c 1164 } 1165 } 1166 return 1167 } 1168 if cv, cw := maxCount(v), maxCount(w); cv != cw { 1169 return cv < cw 1170 } 1171 // Sort by name as tie breaker. 1172 return v < w 1173 } 1174 sort.Sort(funcSorter{less, sort.StringSlice(a)}) 1175 specializedExtend.frozen = true 1176 1177 // Create index from variant name to index. 1178 variantIndex := make(map[string]uint8) 1179 add := func(s []string) { 1180 for _, v := range s { 1181 variantIndex[v] = uint8(len(variantIndex)) 1182 } 1183 } 1184 add(specialized.slice()) 1185 add(specializedExtend.s) 1186 numSpecialized := len(variantIndex) 1187 add(generalized.slice()) 1188 if n := len(variantIndex); n > 255 { 1189 log.Fatalf("maximum number of variants exceeded: was %d; want <= 255", n) 1190 } 1191 b.writeMap("variantIndex", variantIndex) 1192 b.writeConst("variantNumSpecialized", numSpecialized) 1193 } 1194 1195 func (b *builder) writeLanguageInfo() { 1196 } 1197 1198 // writeLikelyData writes tables that are used both for finding parent relations and for 1199 // language matching. Each entry contains additional bits to indicate the status of the 1200 // data to know when it cannot be used for parent relations. 1201 func (b *builder) writeLikelyData() { 1202 const ( 1203 isList = 1 << iota 1204 scriptInFrom 1205 regionInFrom 1206 ) 1207 type ( // generated types 1208 likelyScriptRegion struct { 1209 region uint16 1210 script uint8 1211 flags uint8 1212 } 1213 likelyLangScript struct { 1214 lang uint16 1215 script uint8 1216 flags uint8 1217 } 1218 likelyLangRegion struct { 1219 lang uint16 1220 region uint16 1221 } 1222 // likelyTag is used for getting likely tags for group regions, where 1223 // the likely region might be a region contained in the group. 1224 likelyTag struct { 1225 lang uint16 1226 region uint16 1227 script uint8 1228 } 1229 ) 1230 var ( // generated variables 1231 likelyRegionGroup = make([]likelyTag, len(b.groups)) 1232 likelyLang = make([]likelyScriptRegion, len(b.lang.s)) 1233 likelyRegion = make([]likelyLangScript, len(b.region.s)) 1234 likelyScript = make([]likelyLangRegion, len(b.script.s)) 1235 likelyLangList = []likelyScriptRegion{} 1236 likelyRegionList = []likelyLangScript{} 1237 ) 1238 type fromTo struct { 1239 from, to []string 1240 } 1241 langToOther := map[int][]fromTo{} 1242 regionToOther := map[int][]fromTo{} 1243 for _, m := range b.supp.LikelySubtags.LikelySubtag { 1244 from := strings.Split(m.From, "_") 1245 to := strings.Split(m.To, "_") 1246 if len(to) != 3 { 1247 log.Fatalf("invalid number of subtags in %q: found %d, want 3", m.To, len(to)) 1248 } 1249 if len(from) > 3 { 1250 log.Fatalf("invalid number of subtags: found %d, want 1-3", len(from)) 1251 } 1252 if from[0] != to[0] && from[0] != "und" { 1253 log.Fatalf("unexpected language change in expansion: %s -> %s", from, to) 1254 } 1255 if len(from) == 3 { 1256 if from[2] != to[2] { 1257 log.Fatalf("unexpected region change in expansion: %s -> %s", from, to) 1258 } 1259 if from[0] != "und" { 1260 log.Fatalf("unexpected fully specified from tag: %s -> %s", from, to) 1261 } 1262 } 1263 if len(from) == 1 || from[0] != "und" { 1264 id := 0 1265 if from[0] != "und" { 1266 id = b.lang.index(from[0]) 1267 } 1268 langToOther[id] = append(langToOther[id], fromTo{from, to}) 1269 } else if len(from) == 2 && len(from[1]) == 4 { 1270 sid := b.script.index(from[1]) 1271 likelyScript[sid].lang = uint16(b.langIndex(to[0])) 1272 likelyScript[sid].region = uint16(b.region.index(to[2])) 1273 } else { 1274 r := b.region.index(from[len(from)-1]) 1275 if id, ok := b.groups[r]; ok { 1276 if from[0] != "und" { 1277 log.Fatalf("region changed unexpectedly: %s -> %s", from, to) 1278 } 1279 likelyRegionGroup[id].lang = uint16(b.langIndex(to[0])) 1280 likelyRegionGroup[id].script = uint8(b.script.index(to[1])) 1281 likelyRegionGroup[id].region = uint16(b.region.index(to[2])) 1282 } else { 1283 regionToOther[r] = append(regionToOther[r], fromTo{from, to}) 1284 } 1285 } 1286 } 1287 b.writeType(likelyLangRegion{}) 1288 b.writeSlice("likelyScript", likelyScript) 1289 1290 for id := range b.lang.s { 1291 list := langToOther[id] 1292 if len(list) == 1 { 1293 likelyLang[id].region = uint16(b.region.index(list[0].to[2])) 1294 likelyLang[id].script = uint8(b.script.index(list[0].to[1])) 1295 } else if len(list) > 1 { 1296 likelyLang[id].flags = isList 1297 likelyLang[id].region = uint16(len(likelyLangList)) 1298 likelyLang[id].script = uint8(len(list)) 1299 for _, x := range list { 1300 flags := uint8(0) 1301 if len(x.from) > 1 { 1302 if x.from[1] == x.to[2] { 1303 flags = regionInFrom 1304 } else { 1305 flags = scriptInFrom 1306 } 1307 } 1308 likelyLangList = append(likelyLangList, likelyScriptRegion{ 1309 region: uint16(b.region.index(x.to[2])), 1310 script: uint8(b.script.index(x.to[1])), 1311 flags: flags, 1312 }) 1313 } 1314 } 1315 } 1316 // TODO: merge suppressScript data with this table. 1317 b.writeType(likelyScriptRegion{}) 1318 b.writeSlice("likelyLang", likelyLang) 1319 b.writeSlice("likelyLangList", likelyLangList) 1320 1321 for id := range b.region.s { 1322 list := regionToOther[id] 1323 if len(list) == 1 { 1324 likelyRegion[id].lang = uint16(b.langIndex(list[0].to[0])) 1325 likelyRegion[id].script = uint8(b.script.index(list[0].to[1])) 1326 if len(list[0].from) > 2 { 1327 likelyRegion[id].flags = scriptInFrom 1328 } 1329 } else if len(list) > 1 { 1330 likelyRegion[id].flags = isList 1331 likelyRegion[id].lang = uint16(len(likelyRegionList)) 1332 likelyRegion[id].script = uint8(len(list)) 1333 for i, x := range list { 1334 if len(x.from) == 2 && i != 0 || i > 0 && len(x.from) != 3 { 1335 log.Fatalf("unspecified script must be first in list: %v at %d", x.from, i) 1336 } 1337 x := likelyLangScript{ 1338 lang: uint16(b.langIndex(x.to[0])), 1339 script: uint8(b.script.index(x.to[1])), 1340 } 1341 if len(list[0].from) > 2 { 1342 x.flags = scriptInFrom 1343 } 1344 likelyRegionList = append(likelyRegionList, x) 1345 } 1346 } 1347 } 1348 b.writeType(likelyLangScript{}) 1349 b.writeSlice("likelyRegion", likelyRegion) 1350 b.writeSlice("likelyRegionList", likelyRegionList) 1351 1352 b.writeType(likelyTag{}) 1353 b.writeSlice("likelyRegionGroup", likelyRegionGroup) 1354 } 1355 1356 type mutualIntelligibility struct { 1357 want, have uint16 1358 conf uint8 1359 oneway bool 1360 } 1361 1362 type scriptIntelligibility struct { 1363 lang uint16 // langID or 0 if * 1364 want, have uint8 1365 conf uint8 1366 } 1367 1368 type sortByConf []mutualIntelligibility 1369 1370 func (l sortByConf) Less(a, b int) bool { 1371 return l[a].conf > l[b].conf 1372 } 1373 1374 func (l sortByConf) Swap(a, b int) { 1375 l[a], l[b] = l[b], l[a] 1376 } 1377 1378 func (l sortByConf) Len() int { 1379 return len(l) 1380 } 1381 1382 // toConf converts a percentage value [0, 100] to a confidence class. 1383 func toConf(pct uint8) uint8 { 1384 switch { 1385 case pct == 100: 1386 return 3 // Exact 1387 case pct >= 90: 1388 return 2 // High 1389 case pct > 50: 1390 return 1 // Low 1391 default: 1392 return 0 // No 1393 } 1394 } 1395 1396 // writeMatchData writes tables with languages and scripts for which there is 1397 // mutual intelligibility. The data is based on CLDR's languageMatching data. 1398 // Note that we use a different algorithm than the one defined by CLDR and that 1399 // we slightly modify the data. For example, we convert scores to confidence levels. 1400 // We also drop all region-related data as we use a different algorithm to 1401 // determine region equivalence. 1402 func (b *builder) writeMatchData() { 1403 b.writeType(mutualIntelligibility{}) 1404 b.writeType(scriptIntelligibility{}) 1405 lm := b.supp.LanguageMatching.LanguageMatches 1406 cldr.MakeSlice(&lm).SelectAnyOf("type", "written") 1407 1408 matchLang := []mutualIntelligibility{} 1409 matchScript := []scriptIntelligibility{} 1410 // Convert the languageMatch entries in lists keyed by desired language. 1411 for _, m := range lm[0].LanguageMatch { 1412 // Different versions of CLDR use different separators. 1413 desired := strings.Replace(m.Desired, "-", "_", -1) 1414 supported := strings.Replace(m.Supported, "-", "_", -1) 1415 d := strings.Split(desired, "_") 1416 s := strings.Split(supported, "_") 1417 if len(d) != len(s) || len(d) > 2 { 1418 // Skip all entries with regions and work around CLDR bug. 1419 continue 1420 } 1421 pct, _ := strconv.ParseInt(m.Percent, 10, 8) 1422 if len(d) == 2 && d[0] == s[0] && len(d[1]) == 4 { 1423 // language-script pair. 1424 lang := uint16(0) 1425 if d[0] != "*" { 1426 lang = uint16(b.langIndex(d[0])) 1427 } 1428 matchScript = append(matchScript, scriptIntelligibility{ 1429 lang: lang, 1430 want: uint8(b.script.index(d[1])), 1431 have: uint8(b.script.index(s[1])), 1432 conf: toConf(uint8(pct)), 1433 }) 1434 if m.Oneway != "true" { 1435 matchScript = append(matchScript, scriptIntelligibility{ 1436 lang: lang, 1437 want: uint8(b.script.index(s[1])), 1438 have: uint8(b.script.index(d[1])), 1439 conf: toConf(uint8(pct)), 1440 }) 1441 } 1442 } else if len(d) == 1 && d[0] != "*" { 1443 if pct == 100 { 1444 // nb == no is already handled by macro mapping. Check there 1445 // really is only this case. 1446 if d[0] != "no" || s[0] != "nb" { 1447 log.Fatalf("unhandled equivalence %s == %s", s[0], d[0]) 1448 } 1449 continue 1450 } 1451 matchLang = append(matchLang, mutualIntelligibility{ 1452 want: uint16(b.langIndex(d[0])), 1453 have: uint16(b.langIndex(s[0])), 1454 conf: uint8(pct), 1455 oneway: m.Oneway == "true", 1456 }) 1457 } else { 1458 // TODO: Handle other mappings. 1459 a := []string{"*;*", "*_*;*_*", "es_MX;es_419"} 1460 s := strings.Join([]string{desired, supported}, ";") 1461 if i := sort.SearchStrings(a, s); i == len(a) || a[i] != s { 1462 log.Printf("%q not handled", s) 1463 } 1464 } 1465 } 1466 sort.Stable(sortByConf(matchLang)) 1467 // collapse percentage into confidence classes 1468 for i, m := range matchLang { 1469 matchLang[i].conf = toConf(m.conf) 1470 } 1471 b.writeSlice("matchLang", matchLang) 1472 b.writeSlice("matchScript", matchScript) 1473 } 1474 1475 func (b *builder) writeRegionInclusionData() { 1476 var ( 1477 // mm holds for each group the set of groups with a distance of 1. 1478 mm = make(map[int][]index) 1479 1480 // containment holds for each group the transitive closure of 1481 // containment of other groups. 1482 containment = make(map[index][]index) 1483 ) 1484 for _, g := range b.supp.TerritoryContainment.Group { 1485 group := b.region.index(g.Type) 1486 groupIdx := b.groups[group] 1487 for _, mem := range strings.Split(g.Contains, " ") { 1488 r := b.region.index(mem) 1489 mm[r] = append(mm[r], groupIdx) 1490 if g, ok := b.groups[r]; ok { 1491 mm[group] = append(mm[group], g) 1492 containment[groupIdx] = append(containment[groupIdx], g) 1493 } 1494 } 1495 } 1496 1497 regionContainment := make([]uint32, len(b.groups)) 1498 for _, g := range b.groups { 1499 l := containment[g] 1500 1501 // Compute the transitive closure of containment. 1502 for i := 0; i < len(l); i++ { 1503 l = append(l, containment[l[i]]...) 1504 } 1505 1506 // Compute the bitmask. 1507 regionContainment[g] = 1 << g 1508 for _, v := range l { 1509 regionContainment[g] |= 1 << v 1510 } 1511 // log.Printf("%d: %X", g, regionContainment[g]) 1512 } 1513 b.writeSlice("regionContainment", regionContainment) 1514 1515 regionInclusion := make([]uint8, len(b.region.s)) 1516 bvs := make(map[uint32]index) 1517 // Make the first bitvector positions correspond with the groups. 1518 for r, i := range b.groups { 1519 bv := uint32(1 << i) 1520 for _, g := range mm[r] { 1521 bv |= 1 << g 1522 } 1523 bvs[bv] = i 1524 regionInclusion[r] = uint8(bvs[bv]) 1525 } 1526 for r := 1; r < len(b.region.s); r++ { 1527 if _, ok := b.groups[r]; !ok { 1528 bv := uint32(0) 1529 for _, g := range mm[r] { 1530 bv |= 1 << g 1531 } 1532 if bv == 0 { 1533 // Pick the world for unspecified regions. 1534 bv = 1 << b.groups[b.region.index("001")] 1535 } 1536 if _, ok := bvs[bv]; !ok { 1537 bvs[bv] = index(len(bvs)) 1538 } 1539 regionInclusion[r] = uint8(bvs[bv]) 1540 } 1541 } 1542 b.writeSlice("regionInclusion", regionInclusion) 1543 regionInclusionBits := make([]uint32, len(bvs)) 1544 for k, v := range bvs { 1545 regionInclusionBits[v] = uint32(k) 1546 } 1547 // Add bit vectors for increasingly large distances until a fixed point is reached. 1548 regionInclusionNext := []uint8{} 1549 for i := 0; i < len(regionInclusionBits); i++ { 1550 bits := regionInclusionBits[i] 1551 next := bits 1552 for i := uint(0); i < uint(len(b.groups)); i++ { 1553 if bits&(1<<i) != 0 { 1554 next |= regionInclusionBits[i] 1555 } 1556 } 1557 if _, ok := bvs[next]; !ok { 1558 bvs[next] = index(len(bvs)) 1559 regionInclusionBits = append(regionInclusionBits, next) 1560 } 1561 regionInclusionNext = append(regionInclusionNext, uint8(bvs[next])) 1562 } 1563 b.writeSlice("regionInclusionBits", regionInclusionBits) 1564 b.writeSlice("regionInclusionNext", regionInclusionNext) 1565 } 1566 1567 type parentRel struct { 1568 lang uint16 1569 script uint8 1570 maxScript uint8 1571 toRegion uint16 1572 fromRegion []uint16 1573 } 1574 1575 func (b *builder) writeParents() { 1576 b.writeType(parentRel{}) 1577 1578 parents := []parentRel{} 1579 1580 // Construct parent overrides. 1581 n := 0 1582 for _, p := range b.data.Supplemental().ParentLocales.ParentLocale { 1583 // Skipping non-standard scripts to root is implemented using addTags. 1584 if p.Parent == "root" { 1585 continue 1586 } 1587 1588 sub := strings.Split(p.Parent, "_") 1589 parent := parentRel{lang: b.langIndex(sub[0])} 1590 if len(sub) == 2 { 1591 // TODO: check that all undefined scripts are indeed Latn in these 1592 // cases. 1593 parent.maxScript = uint8(b.script.index("Latn")) 1594 parent.toRegion = uint16(b.region.index(sub[1])) 1595 } else { 1596 parent.script = uint8(b.script.index(sub[1])) 1597 parent.maxScript = parent.script 1598 parent.toRegion = uint16(b.region.index(sub[2])) 1599 } 1600 for _, c := range strings.Split(p.Locales, " ") { 1601 region := b.region.index(c[strings.LastIndex(c, "_")+1:]) 1602 parent.fromRegion = append(parent.fromRegion, uint16(region)) 1603 } 1604 parents = append(parents, parent) 1605 n += len(parent.fromRegion) 1606 } 1607 b.writeSliceAddSize("parents", n*2, parents) 1608 } 1609 1610 func main() { 1611 gen.Init() 1612 1613 gen.Repackage("gen_common.go", "common.go", "language") 1614 1615 w := gen.NewCodeWriter() 1616 defer w.WriteGoFile("tables.go", "language") 1617 1618 fmt.Fprintln(w, `import "github.com/insionng/yougam/libraries/x/text/internal/tag"`) 1619 1620 b := newBuilder(w) 1621 gen.WriteCLDRVersion(w) 1622 1623 b.parseIndices() 1624 b.writeType(fromTo{}) 1625 b.writeLanguage() 1626 b.writeScript() 1627 b.writeRegion() 1628 b.writeVariant() 1629 // TODO: b.writeLocale() 1630 b.computeRegionGroups() 1631 b.writeLikelyData() 1632 b.writeMatchData() 1633 b.writeRegionInclusionData() 1634 b.writeParents() 1635 }