github.com/insionng/yougam@v0.0.0-20170714101924-2bc18d833463/libraries/golang/text/language/maketables.go (about)

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build ignore
     6  
     7  // Language tag table generator.
     8  // Data read from the web.
     9  
    10  package main
    11  
    12  import (
    13  	"bufio"
    14  	"flag"
    15  	"fmt"
    16  	"io"
    17  	"io/ioutil"
    18  	"log"
    19  	"math"
    20  	"reflect"
    21  	"regexp"
    22  	"sort"
    23  	"strconv"
    24  	"strings"
    25  
    26  	"github.com/insionng/yougam/libraries/x/text/internal/gen"
    27  	"github.com/insionng/yougam/libraries/x/text/internal/tag"
    28  	"github.com/insionng/yougam/libraries/x/text/unicode/cldr"
    29  )
    30  
// Command-line flags of the generator.
var (
	// test selects, per its usage string, a mode in which the existing
	// tables are tested instead of being regenerated.
	test = flag.Bool("test",
		false,
		"test existing tables; can be used to compare web data with package data.")
	// outputFile names the Go file that receives the generated tables
	// (default "tables.go").
	outputFile = flag.String("output",
		"tables.go",
		"output file for generated tables")
)
    39  
// comment holds the documentation text for generated identifiers. Every
// entry starts with a newline followed by the name of the identifier it
// documents; init indexes the entries by that first word in commentIndex,
// and builder.comment writes the matching entry in front of the generated
// declaration.
//
// The strings are emitted verbatim, so they must not be edited casually.
// NOTE(review): a few entries contain what look like typos ("[A-Z}{2}",
// "msb39(code)", "Each 2-letter codes"); fixing them changes the comments in
// the generated file — confirm before touching.
var comment = []string{
	`
lang holds an alphabetically sorted list of ISO-639 language identifiers.
All entries are 4 bytes. The index of the identifier (divided by 4) is the language tag.
For 2-byte language identifiers, the two successive bytes have the following meaning:
    - if the first letter of the 2- and 3-letter ISO codes are the same:
      the second and third letter of the 3-letter ISO code.
    - otherwise: a 0 and a by 2 bits right-shifted index into altLangISO3.
For 3-byte language identifiers the 4th byte is 0.`,
	`
langNoIndex is a bit vector of all 3-letter language codes that are not used as an index
in lookup tables. The language ids for these language codes are derived directly
from the letters and are not consecutive.`,
	`
altLangISO3 holds an alphabetically sorted list of 3-letter language code alternatives
to 2-letter language codes that cannot be derived using the method described above.
Each 3-letter code is followed by its 1-byte langID.`,
	`
altLangIndex is used to convert indexes in altLangISO3 to langIDs.`,
	`
langAliasMap maps langIDs to their suggested replacements.`,
	`
script is an alphabetically sorted list of ISO 15924 codes. The index
of the script in the string, divided by 4, is the internal scriptID.`,
	`
isoRegionOffset needs to be added to the index of regionISO to obtain the regionID
for 2-letter ISO codes. (The first isoRegionOffset regionIDs are reserved for
the UN.M49 codes used for groups.)`,
	`
regionISO holds a list of alphabetically sorted 2-letter ISO region codes.
Each 2-letter codes is followed by two bytes with the following meaning:
    - [A-Z}{2}: the first letter of the 2-letter code plus these two 
                letters form the 3-letter ISO code.
    - 0, n:     index into altRegionISO3.`,
	`
regionTypes defines the status of a region for various standards.`,
	`
m49 maps regionIDs to UN.M49 codes. The first isoRegionOffset entries are
codes indicating collections of regions.`,
	`
m49Index gives indexes into fromM49 based on the three most significant bits
of a 10-bit UN.M49 code. To search an UN.M49 code in fromM49, search in
   fromM49[m49Index[msb39(code)]:m49Index[msb3(code)+1]]
for an entry where the first 7 bits match the 7 lsb of the UN.M49 code.
The region code is stored in the 9 lsb of the indexed value.`,
	`
fromM49 contains entries to map UN.M49 codes to regions. See m49Index for details.`,
	`
altRegionISO3 holds a list of 3-letter region codes that cannot be
mapped to 2-letter codes using the default algorithm. This is a short list.`,
	`
altRegionIDs holds a list of regionIDs the positions of which match those
of the 3-letter ISO codes in altRegionISO3.`,
	`
variantNumSpecialized is the number of specialized variants in variants.`,
	`
suppressScript is an index from langID to the dominant script for that language,
if it exists.  If a script is given, it should be suppressed from the language tag.`,
	`
likelyLang is a lookup table, indexed by langID, for the most likely
scripts and regions given incomplete information. If more entries exist for a
given language, region and script are the index and size respectively
of the list in likelyLangList.`,
	`
likelyLangList holds lists info associated with likelyLang.`,
	`
likelyRegion is a lookup table, indexed by regionID, for the most likely
languages and scripts given incomplete information. If more entries exist
for a given regionID, lang and script are the index and size respectively
of the list in likelyRegionList.
TODO: exclude containers and user-definable regions from the list.`,
	`
likelyRegionList holds lists info associated with likelyRegion.`,
	`
likelyScript is a lookup table, indexed by scriptID, for the most likely
languages and regions given a script.`,
	`
matchLang holds pairs of langIDs of base languages that are typically
mutually intelligible. Each pair is associated with a confidence and
whether the intelligibility goes one or both ways.`,
	`
matchScript holds pairs of scriptIDs where readers of one script
can typically also read the other. Each is associated with a confidence.`,
	`
nRegionGroups is the number of region groups.`,
	`
regionInclusion maps region identifiers to sets of regions in regionInclusionBits,
where each set holds all groupings that are directly connected in a region
containment graph.`,
	`
regionInclusionBits is an array of bit vectors where every vector represents
a set of region groupings.  These sets are used to compute the distance
between two regions for the purpose of language matching.`,
	`
regionInclusionNext marks, for each entry in regionInclusionBits, the set of
all groups that are reachable from the groups set in the respective entry.`,
}
   137  
   138  // TODO: consider changing some of these structures to tries. This can reduce
   139  // memory, but may increase the need for memory allocations. This could be
   140  // mitigated if we can piggyback on language tags for common cases.
   141  
   142  func failOnError(e error) {
   143  	if e != nil {
   144  		log.Panic(e)
   145  	}
   146  }
   147  
   148  type setType int
   149  
   150  const (
   151  	Indexed setType = 1 + iota // all elements must be of same size
   152  	Linear
   153  )
   154  
   155  type stringSet struct {
   156  	s              []string
   157  	sorted, frozen bool
   158  
   159  	// We often need to update values after the creation of an index is completed.
   160  	// We include a convenience map for keeping track of this.
   161  	update map[string]string
   162  	typ    setType // used for checking.
   163  }
   164  
   165  func (ss *stringSet) clone() stringSet {
   166  	c := *ss
   167  	c.s = append([]string(nil), c.s...)
   168  	return c
   169  }
   170  
   171  func (ss *stringSet) setType(t setType) {
   172  	if ss.typ != t && ss.typ != 0 {
   173  		log.Panicf("type %d cannot be assigned as it was already %d", t, ss.typ)
   174  	}
   175  }
   176  
   177  // parse parses a whitespace-separated string and initializes ss with its
   178  // components.
   179  func (ss *stringSet) parse(s string) {
   180  	scan := bufio.NewScanner(strings.NewReader(s))
   181  	scan.Split(bufio.ScanWords)
   182  	for scan.Scan() {
   183  		ss.add(scan.Text())
   184  	}
   185  }
   186  
   187  func (ss *stringSet) assertChangeable() {
   188  	if ss.frozen {
   189  		log.Panic("attempt to modify a frozen stringSet")
   190  	}
   191  }
   192  
   193  func (ss *stringSet) add(s string) {
   194  	ss.assertChangeable()
   195  	ss.s = append(ss.s, s)
   196  	ss.sorted = ss.frozen
   197  }
   198  
   199  func (ss *stringSet) freeze() {
   200  	ss.compact()
   201  	ss.frozen = true
   202  }
   203  
   204  func (ss *stringSet) compact() {
   205  	if ss.sorted {
   206  		return
   207  	}
   208  	a := ss.s
   209  	sort.Strings(a)
   210  	k := 0
   211  	for i := 1; i < len(a); i++ {
   212  		if a[k] != a[i] {
   213  			a[k+1] = a[i]
   214  			k++
   215  		}
   216  	}
   217  	ss.s = a[:k+1]
   218  	ss.sorted = ss.frozen
   219  }
   220  
   221  type funcSorter struct {
   222  	fn func(a, b string) bool
   223  	sort.StringSlice
   224  }
   225  
   226  func (s funcSorter) Less(i, j int) bool {
   227  	return s.fn(s.StringSlice[i], s.StringSlice[j])
   228  }
   229  
   230  func (ss *stringSet) sortFunc(f func(a, b string) bool) {
   231  	ss.compact()
   232  	sort.Sort(funcSorter{f, sort.StringSlice(ss.s)})
   233  }
   234  
   235  func (ss *stringSet) remove(s string) {
   236  	ss.assertChangeable()
   237  	if i, ok := ss.find(s); ok {
   238  		copy(ss.s[i:], ss.s[i+1:])
   239  		ss.s = ss.s[:len(ss.s)-1]
   240  	}
   241  }
   242  
   243  func (ss *stringSet) replace(ol, nu string) {
   244  	ss.s[ss.index(ol)] = nu
   245  	ss.sorted = ss.frozen
   246  }
   247  
   248  func (ss *stringSet) index(s string) int {
   249  	ss.setType(Indexed)
   250  	i, ok := ss.find(s)
   251  	if !ok {
   252  		if i < len(ss.s) {
   253  			log.Panicf("find: item %q is not in list. Closest match is %q.", s, ss.s[i])
   254  		}
   255  		log.Panicf("find: item %q is not in list", s)
   256  
   257  	}
   258  	return i
   259  }
   260  
   261  func (ss *stringSet) find(s string) (int, bool) {
   262  	ss.compact()
   263  	i := sort.SearchStrings(ss.s, s)
   264  	return i, i != len(ss.s) && ss.s[i] == s
   265  }
   266  
   267  func (ss *stringSet) slice() []string {
   268  	ss.compact()
   269  	return ss.s
   270  }
   271  
   272  func (ss *stringSet) updateLater(v, key string) {
   273  	if ss.update == nil {
   274  		ss.update = map[string]string{}
   275  	}
   276  	ss.update[v] = key
   277  }
   278  
   279  // join joins the string and ensures that all entries are of the same length.
   280  func (ss *stringSet) join() string {
   281  	ss.setType(Indexed)
   282  	n := len(ss.s[0])
   283  	for _, s := range ss.s {
   284  		if len(s) != n {
   285  			log.Panicf("join: not all entries are of the same length: %q", s)
   286  		}
   287  	}
   288  	ss.s = append(ss.s, strings.Repeat("\xff", n))
   289  	return strings.Join(ss.s, "")
   290  }
   291  
// ianaEntry holds information for an entry in the IANA Language Subtag Repository.
// All types use the same entry.
// See http://tools.ietf.org/html/bcp47#section-5.1 for a description of the various
// fields.
//
// parseRegistry fills each field from the registry field named in its comment.
type ianaEntry struct {
	// typ is the value of "Type:".
	typ            string
	// description collects the values of the "Description:" fields, one
	// element per field, with the words joined by single spaces.
	description    []string
	// scope is the value of "Scope:".
	scope          string
	// added is the value of "Added:".
	added          string
	// preferred is the value of "Preferred-Value:".
	preferred      string
	// deprecated is the value of "Deprecated:".
	deprecated     string
	// suppressScript is the value of "Suppress-Script:".
	suppressScript string
	// macro is the value of "Macrolanguage:".
	macro          string
	// prefix collects the values of all "Prefix:" fields.
	prefix         []string
}
   307  
// builder holds the input data, the collected indices and the output
// writers used while generating the tables. It is created by newBuilder.
type builder struct {
	// w receives the generated code.
	w    *gen.CodeWriter
	hw   io.Writer // MultiWriter for w and w.Hash
	// data is the decoded CLDR archive; supp is data.Supplemental().
	data *cldr.CLDR
	supp *cldr.SupplementalData

	// indices
	locale      stringSet // common locales
	lang        stringSet // canonical language ids (2 or 3 letter ISO codes) with data
	langNoIndex stringSet // 3-letter ISO codes with no associated data
	script      stringSet // 4-letter ISO codes
	region      stringSet // 2-letter ISO or 3-digit UN M49 codes
	variant     stringSet // 4-8-alphanumeric variant code.

	// Region codes that are groups with their corresponding group IDs.
	groups map[int]index

	// langInfo
	// registry maps a subtag or tag to its IANA registry record; it is
	// filled by parseRegistry.
	registry map[string]*ianaEntry
}

// index is the type of the group IDs stored in builder.groups.
type index uint
   330  
   331  func newBuilder(w *gen.CodeWriter) *builder {
   332  	r := gen.OpenCLDRCoreZip()
   333  	defer r.Close()
   334  	d := &cldr.Decoder{}
   335  	data, err := d.DecodeZip(r)
   336  	failOnError(err)
   337  	b := builder{
   338  		w:    w,
   339  		hw:   io.MultiWriter(w, w.Hash),
   340  		data: data,
   341  		supp: data.Supplemental(),
   342  	}
   343  	b.parseRegistry()
   344  	return &b
   345  }
   346  
// parseRegistry reads the IANA language subtag registry and fills b.registry
// with one ianaEntry per subtag or tag.
//
// The file is consumed word by word. Each iteration treats the current word
// as a field name (key) and the following word as its value. For a recognized
// key the value is consumed and the scanner is advanced to the next key; for
// an unrecognized word the loop moves on by a single word, so that the word
// just read as "value" is examined as a key on the next iteration.
//
// NOTE(review): the field cases dereference record, which is only set by a
// preceding "Type:" field — presumably the registry format guarantees that
// order; confirm.
func (b *builder) parseRegistry() {
	r := gen.OpenIANAFile("assignments/language-subtag-registry")
	defer r.Close()
	b.registry = make(map[string]*ianaEntry)

	scan := bufio.NewScanner(r)
	scan.Split(bufio.ScanWords)
	var record *ianaEntry
	for more := scan.Scan(); more; {
		key := scan.Text()
		more = scan.Scan()
		value := scan.Text()
		switch key {
		case "Type:":
			// A "Type:" field starts a new record.
			record = &ianaEntry{typ: value}
		case "Subtag:", "Tag:":
			// A value of the form "x..y" denotes a range; every code from x
			// up to and including y is registered with the same record.
			if s := strings.SplitN(value, "..", 2); len(s) > 1 {
				for a := s[0]; a <= s[1]; a = inc(a) {
					b.addToRegistry(a, record)
				}
			} else {
				b.addToRegistry(value, record)
			}
		case "Suppress-Script:":
			record.suppressScript = value
		case "Added:":
			record.added = value
		case "Deprecated:":
			record.deprecated = value
		case "Macrolanguage:":
			record.macro = value
		case "Preferred-Value:":
			record.preferred = value
		case "Prefix:":
			record.prefix = append(record.prefix, value)
		case "Scope:":
			record.scope = value
		case "Description:":
			// A description spans several words. Collect words until one
			// starts with '%' or ends with ':', which is taken to be the
			// record separator or the next field name respectively.
			buf := []byte(value)
			for more = scan.Scan(); more; more = scan.Scan() {
				b := scan.Bytes()
				if b[0] == '%' || b[len(b)-1] == ':' {
					break
				}
				buf = append(buf, ' ')
				buf = append(buf, b...)
			}
			record.description = append(record.description, string(buf))
			// The scanner is already positioned at the next key.
			continue
		default:
			// Not a field we care about: re-examine the value as a key.
			continue
		}
		// Advance from the consumed value to the next key.
		more = scan.Scan()
	}
	if scan.Err() != nil {
		log.Panic(scan.Err())
	}
}
   405  
   406  func (b *builder) addToRegistry(key string, entry *ianaEntry) {
   407  	if info, ok := b.registry[key]; ok {
   408  		if info.typ != "language" || entry.typ != "extlang" {
   409  			log.Fatalf("parseRegistry: tag %q already exists", key)
   410  		}
   411  	} else {
   412  		b.registry[key] = entry
   413  	}
   414  }
   415  
   416  var commentIndex = make(map[string]string)
   417  
   418  func init() {
   419  	for _, s := range comment {
   420  		key := strings.TrimSpace(strings.SplitN(s, " ", 2)[0])
   421  		commentIndex[key] = s
   422  	}
   423  }
   424  
   425  func (b *builder) comment(name string) {
   426  	if s := commentIndex[name]; len(s) > 0 {
   427  		b.w.WriteComment(s)
   428  	} else {
   429  		fmt.Fprintln(b.w)
   430  	}
   431  }
   432  
   433  func (b *builder) pf(f string, x ...interface{}) {
   434  	fmt.Fprintf(b.hw, f, x...)
   435  	fmt.Fprint(b.hw, "\n")
   436  }
   437  
   438  func (b *builder) p(x ...interface{}) {
   439  	fmt.Fprintln(b.hw, x...)
   440  }
   441  
// addSize adds s to the size total kept by the code writer and emits a
// "// Size: s bytes" comment line to the output.
func (b *builder) addSize(s int) {
	b.w.Size += s
	b.pf("// Size: %d bytes", s)
}
   446  
// writeConst writes the documentation registered for name (an empty line if
// there is none, see comment) and then has the code writer emit x as a
// constant called name.
func (b *builder) writeConst(name string, x interface{}) {
	b.comment(name)
	b.w.WriteConst(name, x)
}
   451  
   452  // writeConsts computes f(v) for all v in values and writes the results
   453  // as constants named _v to a single constant block.
   454  func (b *builder) writeConsts(f func(string) int, values ...string) {
   455  	b.pf("const (")
   456  	for _, v := range values {
   457  		b.pf("\t_%s = %v", v, f(v))
   458  	}
   459  	b.pf(")")
   460  }
   461  
// writeType writes the type of the given value, which must be a struct.
// The documentation is looked up under the name of value's type (see
// comment); the type definition itself is emitted by the code writer.
func (b *builder) writeType(value interface{}) {
	b.comment(reflect.TypeOf(value).Name())
	b.w.WriteType(value)
}
   467  
// writeSlice writes ss as a variable called name. It is writeSliceAddSize
// with no extra size.
func (b *builder) writeSlice(name string, ss interface{}) {
	b.writeSliceAddSize(name, 0, ss)
}
   471  
   472  func (b *builder) writeSliceAddSize(name string, extraSize int, ss interface{}) {
   473  	b.comment(name)
   474  	b.w.Size += extraSize
   475  	v := reflect.ValueOf(ss)
   476  	t := v.Type().Elem()
   477  	b.pf("// Size: %d bytes, %d elements", v.Len()*int(t.Size())+extraSize, v.Len())
   478  
   479  	fmt.Fprintf(b.w, "var %s = ", name)
   480  	b.w.WriteArray(ss)
   481  	b.p()
   482  }
   483  
// fromTo is an element of the table written by writeSortedMap: from is the
// index of an entry and to the index of the value recorded for it in the
// set's update map.
type fromTo struct {
	from, to uint16
}
   487  
   488  func (b *builder) writeSortedMap(name string, ss *stringSet, index func(s string) uint16) {
   489  	ss.sortFunc(func(a, b string) bool {
   490  		return index(a) < index(b)
   491  	})
   492  	m := []fromTo{}
   493  	for _, s := range ss.s {
   494  		m = append(m, fromTo{index(s), index(ss.update[s])})
   495  	}
   496  	b.writeSlice(name, m)
   497  }
   498  
   499  const base = 'z' - 'a' + 1
   500  
   501  func strToInt(s string) uint {
   502  	v := uint(0)
   503  	for i := 0; i < len(s); i++ {
   504  		v *= base
   505  		v += uint(s[i] - 'a')
   506  	}
   507  	return v
   508  }
   509  
   510  // converts the given integer to the original ASCII string passed to strToInt.
   511  // len(s) must match the number of characters obtained.
   512  func intToStr(v uint, s []byte) {
   513  	for i := len(s) - 1; i >= 0; i-- {
   514  		s[i] = byte(v%base) + 'a'
   515  		v /= base
   516  	}
   517  }
   518  
   519  func (b *builder) writeBitVector(name string, ss []string) {
   520  	vec := make([]uint8, int(math.Ceil(math.Pow(base, float64(len(ss[0])))/8)))
   521  	for _, s := range ss {
   522  		v := strToInt(s)
   523  		vec[v/8] |= 1 << (v % 8)
   524  	}
   525  	b.writeSlice(name, vec)
   526  }
   527  
   528  // TODO: convert this type into a list or two-stage trie.
   529  func (b *builder) writeMapFunc(name string, m map[string]string, f func(string) uint16) {
   530  	b.comment(name)
   531  	v := reflect.ValueOf(m)
   532  	sz := v.Len() * (2 + int(v.Type().Key().Size()))
   533  	for _, k := range m {
   534  		sz += len(k)
   535  	}
   536  	b.addSize(sz)
   537  	keys := []string{}
   538  	b.pf(`var %s = map[string]uint16{`, name)
   539  	for k := range m {
   540  		keys = append(keys, k)
   541  	}
   542  	sort.Strings(keys)
   543  	for _, k := range keys {
   544  		b.pf("\t%q: %v,", k, f(m[k]))
   545  	}
   546  	b.p("}")
   547  }
   548  
   549  func (b *builder) writeMap(name string, m interface{}) {
   550  	b.comment(name)
   551  	v := reflect.ValueOf(m)
   552  	sz := v.Len() * (2 + int(v.Type().Key().Size()) + int(v.Type().Elem().Size()))
   553  	b.addSize(sz)
   554  	f := strings.FieldsFunc(fmt.Sprintf("%#v", m), func(r rune) bool {
   555  		return strings.IndexRune("{}, ", r) != -1
   556  	})
   557  	sort.Strings(f[1:])
   558  	b.pf(`var %s = %s{`, name, f[0])
   559  	for _, kv := range f[1:] {
   560  		b.pf("\t%s,", kv)
   561  	}
   562  	b.p("}")
   563  }
   564  
   565  func (b *builder) langIndex(s string) uint16 {
   566  	if s == "und" {
   567  		return 0
   568  	}
   569  	if i, ok := b.lang.find(s); ok {
   570  		return uint16(i)
   571  	}
   572  	return uint16(strToInt(s)) + uint16(len(b.lang.s))
   573  }
   574  
   575  // inc advances the string to its lexicographical successor.
   576  func inc(s string) string {
   577  	const maxTagLength = 4
   578  	var buf [maxTagLength]byte
   579  	intToStr(strToInt(strings.ToLower(s))+1, buf[:len(s)])
   580  	for i := 0; i < len(s); i++ {
   581  		if s[i] <= 'Z' {
   582  			buf[i] -= 'a' - 'A'
   583  		}
   584  	}
   585  	return string(buf[:len(s)])
   586  }
   587  
// parseIndices fills the language, script, region, variant and locale sets
// of b from the IANA registry and the CLDR data and writes the numLanguages,
// numScripts and numRegions constants.
//
// The order of the steps matters: the counts are written before the dummy
// "unspecified" codes are added to the sets.
func (b *builder) parseIndices() {
	meta := b.supp.Metadata

	// Distribute the registry entries over the sets according to their type.
	for k, v := range b.registry {
		var ss *stringSet
		switch v.typ {
		case "language":
			// 2-letter codes, languages with a suppress-script and special
			// codes are indexed; all other languages go to langNoIndex.
			if len(k) == 2 || v.suppressScript != "" || v.scope == "special" {
				b.lang.add(k)
				continue
			} else {
				ss = &b.langNoIndex
			}
		case "region":
			ss = &b.region
		case "script":
			ss = &b.script
		case "variant":
			ss = &b.variant
		default:
			continue
		}
		ss.add(k)
	}
	// Include any language for which there is data.
	for _, lang := range b.data.Locales() {
		// The leading "false ||" only serves to align the conditions.
		if x := b.data.RawLDML(lang); false ||
			x.LocaleDisplayNames != nil ||
			x.Characters != nil ||
			x.Delimiters != nil ||
			x.Measurement != nil ||
			x.Dates != nil ||
			x.Numbers != nil ||
			x.Units != nil ||
			x.ListPatterns != nil ||
			x.Collations != nil ||
			x.Segmentations != nil ||
			x.Rbnf != nil ||
			x.Annotations != nil ||
			x.Metadata != nil {

			// Only the part of the locale name before the first "_" is used.
			from := strings.Split(lang, "_")
			if lang := from[0]; lang != "root" {
				b.lang.add(lang)
			}
		}
	}
	// Include locales for plural rules, which uses a different structure.
	for _, plurals := range b.data.Supplemental().Plurals {
		for _, rules := range plurals.PluralRules {
			for _, lang := range strings.Split(rules.Locales, " ") {
				if lang = strings.Split(lang, "_")[0]; lang != "root" {
					b.lang.add(lang)
				}
			}
		}
	}
	// Include languages in likely subtags.
	for _, m := range b.supp.LikelySubtags.LikelySubtag {
		from := strings.Split(m.From, "_")
		b.lang.add(from[0])
	}
	// Include ISO-639 alpha-3 bibliographic entries.
	for _, a := range meta.Alias.LanguageAlias {
		if a.Reason == "bibliographic" {
			b.langNoIndex.add(a.Type)
		}
	}
	// Include regions in territoryAlias (not all are in the IANA registry!)
	for _, reg := range b.supp.Metadata.Alias.TerritoryAlias {
		if len(reg.Type) == 2 {
			b.region.add(reg.Type)
		}
	}

	// A 3-letter code that ended up in the indexed set must not be listed
	// in langNoIndex as well.
	for _, s := range b.lang.s {
		if len(s) == 3 {
			b.langNoIndex.remove(s)
		}
	}
	b.writeConst("numLanguages", len(b.lang.slice())+len(b.langNoIndex.slice()))
	b.writeConst("numScripts", len(b.script.slice()))
	b.writeConst("numRegions", len(b.region.slice()))

	// Add dummy codes at the start of each list to represent "unspecified".
	b.lang.add("---")
	b.script.add("----")
	b.region.add("---")

	// common locales
	b.locale.parse(meta.DefaultContent.Locales)
}
   680  
   681  func (b *builder) computeRegionGroups() {
   682  	b.groups = make(map[int]index)
   683  
   684  	// Create group indices.
   685  	for i := 1; b.region.s[i][0] < 'A'; i++ { // Base M49 indices on regionID.
   686  		b.groups[i] = index(len(b.groups))
   687  	}
   688  	for _, g := range b.supp.TerritoryContainment.Group {
   689  		group := b.region.index(g.Type)
   690  		if _, ok := b.groups[group]; !ok {
   691  			b.groups[group] = index(len(b.groups))
   692  		}
   693  	}
   694  	if len(b.groups) > 32 {
   695  		log.Fatalf("only 32 groups supported, found %d", len(b.groups))
   696  	}
   697  	b.writeConst("nRegionGroups", len(b.groups))
   698  }
   699  
// langConsts lists the language codes for which writeLanguage emits a named
// constant (the code prefixed with an underscore) holding its language index.
var langConsts = []string{
	"af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
	"et", "fa", "fi", "fil", "fr", "gu", "he", "hi", "hr", "hu", "hy", "id", "is",
	"it", "ja", "ka", "kk", "km", "kn", "ko", "ky", "lo", "lt", "lv", "mk", "ml",
	"mn", "mo", "mr", "ms", "mul", "my", "nb", "ne", "nl", "no", "pa", "pl", "pt",
	"ro", "ru", "sh", "si", "sk", "sl", "sq", "sr", "sv", "sw", "ta", "te", "th",
	"tl", "tn", "tr", "uk", "ur", "uz", "vi", "zh", "zu",

	// constants for grandfathered tags (if not already defined)
	"jbo", "ami", "bnn", "hak", "tlh", "lb", "nv", "pwn", "tao", "tay", "tsu",
	"nn", "sfb", "vgt", "sgg", "cmn", "nan", "hsn",
}
   712  
// writeLanguage generates all tables needed for language canonicalization.
//
// It writes, in this order: the language constants, the lang table,
// langNoIndexOffset, the langNoIndex bit vector, altLangISO3, altLangIndex,
// langAliasMap and langAliasTypes.
func (b *builder) writeLanguage() {
	meta := b.supp.Metadata

	b.writeConst("nonCanonicalUnd", b.lang.index("und"))
	b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...)
	b.writeConst("langPrivateStart", b.langIndex("qaa"))
	b.writeConst("langPrivateEnd", b.langIndex("qtz"))

	// Get language codes that need to be mapped (overlong 3-letter codes,
	// deprecated 2-letter codes, legacy and grandfathered tags.)
	langAliasMap := stringSet{}
	aliasTypeMap := map[string]langAliasType{}

	// altLangISO3 get the alternative ISO3 names that need to be mapped.
	altLangISO3 := stringSet{}
	// Add dummy start to avoid the use of index 0.
	altLangISO3.add("---")
	altLangISO3.updateLater("---", "aa")

	// Work on a copy of the language set; its update map will associate
	// 2-letter codes with their 3-letter form.
	lang := b.lang.clone()
	for _, a := range meta.Alias.LanguageAlias {
		if a.Replacement == "" {
			a.Replacement = "und"
		}
		// TODO: support mapping to tags
		repl := strings.SplitN(a.Replacement, "_", 2)[0]
		if a.Reason == "overlong" {
			// An overlong 3-letter code is the long form of its 2-letter
			// replacement.
			if len(a.Replacement) == 2 && len(a.Type) == 3 {
				lang.updateLater(a.Replacement, a.Type)
			}
		} else if len(a.Type) <= 3 {
			switch a.Reason {
			case "macrolanguage":
				aliasTypeMap[a.Type] = langMacro
			case "deprecated":
				// handled elsewhere
				continue
			case "bibliographic", "legacy":
				if a.Type == "no" {
					continue
				}
				aliasTypeMap[a.Type] = langLegacy
			default:
				// An unknown reason means this code needs to be updated.
				log.Fatalf("new %s alias: %s", a.Reason, a.Type)
			}
			langAliasMap.add(a.Type)
			langAliasMap.updateLater(a.Type, repl)
		}
	}
	// Manually add the mapping of "nb" (Norwegian) to its macro language.
	// This can be removed if CLDR adopts this change.
	langAliasMap.add("nb")
	langAliasMap.updateLater("nb", "no")
	aliasTypeMap["nb"] = langMacro

	for k, v := range b.registry {
		// Also add deprecated values for 3-letter ISO codes, which CLDR omits.
		if v.typ == "language" && v.deprecated != "" && v.preferred != "" {
			langAliasMap.add(k)
			langAliasMap.updateLater(k, v.preferred)
			aliasTypeMap[k] = langDeprecated
		}
	}
	// Fix CLDR mappings.
	lang.updateLater("tl", "tgl")
	lang.updateLater("sh", "hbs")
	lang.updateLater("mo", "mol")
	lang.updateLater("no", "nor")
	lang.updateLater("tw", "twi")
	lang.updateLater("nb", "nob")
	lang.updateLater("ak", "aka")

	// Ensure that each 2-letter code is matched with a 3-letter code.
	// The first element is skipped.
	for _, v := range lang.s[1:] {
		s, ok := lang.update[v]
		if !ok {
			// No long form of its own: fall back to the long form of the
			// code that v is an alias for, if any.
			if s, ok = lang.update[langAliasMap.update[v]]; !ok {
				continue
			}
			lang.update[v] = s
		}
		// Long forms that start with a different letter cannot be encoded
		// in place and are stored in altLangISO3 instead.
		if v[0] != s[0] {
			altLangISO3.add(s)
			altLangISO3.updateLater(s, v)
		}
	}

	// Complete canonicalized language tags.
	lang.freeze()
	for i, v := range lang.s {
		// We can avoid these manual entries by using the IANA registry directly.
		// Seems easier to update the list manually, as changes are rare.
		// The panic in this loop will trigger if we miss an entry.
		add := ""
		if s, ok := lang.update[v]; ok {
			if s[0] == v[0] {
				// Same first letter: store the remaining two letters.
				add = s[1:]
			} else {
				// Otherwise store a 0 byte and the index into altLangISO3.
				add = string([]byte{0, byte(altLangISO3.index(s))})
			}
		} else if len(v) == 3 {
			// 3-letter codes are padded with a 0 byte.
			add = "\x00"
		} else {
			log.Panicf("no data for long form of %q", v)
		}
		lang.s[i] += add
	}
	b.writeConst("lang", tag.Index(lang.join()))

	b.writeConst("langNoIndexOffset", len(b.lang.s))

	// space of all valid 3-letter language identifiers.
	b.writeBitVector("langNoIndex", b.langNoIndex.slice())

	// Append to each altLangISO3 entry the number of altLangIndex entries
	// collected so far, and record the language index of its 2-letter code.
	// The dummy entry at position 0 gets no altLangIndex entry.
	altLangIndex := []uint16{}
	for i, s := range altLangISO3.slice() {
		altLangISO3.s[i] += string([]byte{byte(len(altLangIndex))})
		if i > 0 {
			idx := b.lang.index(altLangISO3.update[s])
			altLangIndex = append(altLangIndex, uint16(idx))
		}
	}
	b.writeConst("altLangISO3", tag.Index(altLangISO3.join()))
	b.writeSlice("altLangIndex", altLangIndex)

	// writeSortedMap reorders langAliasMap.s; the types table is written in
	// that same order.
	b.writeSortedMap("langAliasMap", &langAliasMap, b.langIndex)
	types := make([]langAliasType, len(langAliasMap.s))
	for i, s := range langAliasMap.s {
		types[i] = aliasTypeMap[s]
	}
	b.writeSlice("langAliasTypes", types)
}
   846  
// scriptConsts lists the script codes for which writeScript emits a named
// constant (the code prefixed with an underscore) holding its script index.
var scriptConsts = []string{
	"Latn", "Hani", "Hans", "Hant", "Qaaa", "Qaai", "Qabx", "Zinh", "Zyyy",
	"Zzzz",
}
   851  
   852  func (b *builder) writeScript() {
   853  	b.writeConsts(b.script.index, scriptConsts...)
   854  	b.writeConst("script", tag.Index(b.script.join()))
   855  
   856  	supp := make([]uint8, len(b.lang.slice()))
   857  	for i, v := range b.lang.slice()[1:] {
   858  		if sc := b.registry[v].suppressScript; sc != "" {
   859  			supp[i+1] = uint8(b.script.index(sc))
   860  		}
   861  	}
   862  	b.writeSlice("suppressScript", supp)
   863  
   864  	// There is only one deprecated script in CLDR. This value is hard-coded.
   865  	// We check here if the code must be updated.
   866  	for _, a := range b.supp.Metadata.Alias.ScriptAlias {
   867  		if a.Type != "Qaai" {
   868  			log.Panicf("unexpected deprecated stript %q", a.Type)
   869  		}
   870  	}
   871  }
   872  
   873  func parseM49(s string) int16 {
   874  	if len(s) == 0 {
   875  		return 0
   876  	}
   877  	v, err := strconv.ParseUint(s, 10, 10)
   878  	failOnError(err)
   879  	return int16(v)
   880  }
   881  
// regionConsts lists the region codes for which writeRegion emits a named
// constant (the code prefixed with an underscore) holding its region index.
var regionConsts = []string{
	"001", "419", "BR", "CA", "ES", "GB", "MD", "PT", "UK", "US",
	"ZZ", "XA", "XC", "XK", // Unofficial tag for Kosovo.
}
   886  
// writeRegion emits all region-related tables, in this order:
// the constants for regionConsts, isoRegionOffset, regionTypes, regionISO,
// altRegionISO3, altRegionIDs, regionOldMap, m49, m49Index and fromM49.
//
// The statements are order-sensitive: m49map and fromM49map are filled in
// several stages (territory codes, territory aliases, deprecated regions,
// and finally the entries that precede "AA" in the region list), and each
// stage only fills gaps or overrides entries left by the earlier ones.
func (b *builder) writeRegion() {
	b.writeConsts(b.region.index, regionConsts...)

	// isoOffset is the position of "AA" in the region list; entries from
	// this position on are handled as 2-letter codes below, entries before
	// it are parsed as numeric codes near the end of this function.
	isoOffset := b.region.index("AA")
	// m49map maps a region index to its numeric code (0 = none recorded).
	m49map := make([]int16, len(b.region.slice()))
	// fromM49map is the reverse mapping: numeric code to region index.
	fromM49map := make(map[int16]int)
	// altRegionISO3 accumulates 3-letter codes that cannot be encoded
	// inline in regionISO; altRegionIDs records the matching region IDs.
	altRegionISO3 := ""
	altRegionIDs := []uint16{}

	b.writeConst("isoRegionOffset", isoOffset)

	// 2-letter region lookup and mapping to numeric codes.
	regionISO := b.region.clone()
	regionISO.s = regionISO.s[isoOffset:]
	regionISO.sorted = false

	// regionTypes holds one byte of flag bits (iso3166UserAssgined, ccTLD,
	// bcp47Region) per region index.
	regionTypes := make([]byte, len(b.region.s))

	// Is the region valid BCP 47?
	for s, e := range b.registry {
		// Only upper-case two-character registry keys are considered here.
		if len(s) == 2 && s == strings.ToUpper(s) {
			i := b.region.index(s)
			for _, d := range e.description {
				if strings.Contains(d, "Private use") {
					regionTypes[i] = iso3166UserAssgined
				}
			}
			regionTypes[i] |= bcp47Region
		}
	}

	// Is the region a valid ccTLD?
	r := gen.OpenIANAFile("domains/root/db")
	defer r.Close()

	buf, err := ioutil.ReadAll(r)
	failOnError(err)
	// Every link to a two-letter entry of the root zone database page marks
	// the upper-cased code as a ccTLD.
	re := regexp.MustCompile(`"/domains/root/db/([a-z]{2}).html"`)
	for _, m := range re.FindAllSubmatch(buf, -1) {
		i := b.region.index(strings.ToUpper(string(m[1])))
		regionTypes[i] |= ccTLD
	}

	b.writeSlice("regionTypes", regionTypes)

	// iso3Set remembers how each 3-letter code was stored: -1 when it was
	// encoded inline, otherwise its byte offset inside altRegionISO3.
	iso3Set := make(map[string]int)
	// update appends two bytes to the regionISO entry of iso2 describing
	// iso3:
	//   - the 2nd and 3rd letter of iso3 when iso3 has not been seen before
	//     and shares its first letter with iso2;
	//   - otherwise a 0 byte followed by an offset into altRegionISO3,
	//     reusing an existing offset when iso3 is already stored there.
	// NOTE(review): the offset is narrowed to a single byte without a range
	// check — confirm altRegionISO3 always stays short enough.
	update := func(iso2, iso3 string) {
		i := regionISO.index(iso2)
		if j, ok := iso3Set[iso3]; !ok && iso3[0] == iso2[0] {
			regionISO.s[i] += iso3[1:]
			iso3Set[iso3] = -1
		} else {
			if ok && j >= 0 {
				regionISO.s[i] += string([]byte{0, byte(j)})
			} else {
				iso3Set[iso3] = len(altRegionISO3)
				regionISO.s[i] += string([]byte{0, byte(len(altRegionISO3))})
				altRegionISO3 += iso3
				altRegionIDs = append(altRegionIDs, uint16(isoOffset+i))
			}
		}
	}
	// Stage 1: numeric codes taken from the CLDR territory code mappings.
	for _, tc := range b.supp.CodeMappings.TerritoryCodes {
		i := regionISO.index(tc.Type) + isoOffset
		if d := m49map[i]; d != 0 {
			log.Panicf("%s found as a duplicate UN.M49 code of %03d", tc.Numeric, d)
		}
		m49 := parseM49(tc.Numeric)
		m49map[i] = m49
		if r := fromM49map[m49]; r == 0 {
			fromM49map[m49] = i
		} else if r != i {
			// Two regions share this numeric code. The reverse mapping is
			// redirected to the current region only when the previously
			// stored region is deprecated and the current one is either not
			// deprecated or has a greater deprecation value.
			dep := b.registry[regionISO.s[r-isoOffset]].deprecated
			if t := b.registry[tc.Type]; t != nil && dep != "" && (t.deprecated == "" || t.deprecated > dep) {
				fromM49map[m49] = i
			}
		}
	}
	// Stage 2: numeric aliases (3-character type starting with a digit)
	// that are replaced by a 2-letter region fill in missing reverse
	// mappings.
	for _, ta := range b.supp.Metadata.Alias.TerritoryAlias {
		if len(ta.Type) == 3 && ta.Type[0] <= '9' && len(ta.Replacement) == 2 {
			from := parseM49(ta.Type)
			if r := fromM49map[from]; r == 0 {
				fromM49map[from] = regionISO.index(ta.Replacement) + isoOffset
			}
		}
	}
	// Record the 3-letter code of every territory that has one.
	for _, tc := range b.supp.CodeMappings.TerritoryCodes {
		if len(tc.Alpha3) == 3 {
			update(tc.Type, tc.Alpha3)
		}
	}
	// These entries are not included in territoryCodes. Mostly 3-letter
	// variants of deleted codes and an entry for QU.
	for _, m := range []struct{ iso2, iso3 string }{
		{"CT", "CTE"},
		{"DY", "DHY"},
		{"HV", "HVO"},
		{"JT", "JTN"},
		{"MI", "MID"},
		{"NH", "NHB"},
		{"NQ", "ATN"},
		{"PC", "PCI"},
		{"PU", "PUS"},
		{"PZ", "PCZ"},
		{"RH", "RHO"},
		{"VD", "VDR"},
		{"WK", "WAK"},
		// These three-letter codes are used for others as well.
		{"FQ", "ATF"},
	} {
		update(m.iso2, m.iso3)
	}
	// Pad every entry that did not receive two extra bytes with two spaces.
	for i, s := range regionISO.s {
		if len(s) != 4 {
			regionISO.s[i] = s + "  "
		}
	}
	b.writeConst("regionISO", tag.Index(regionISO.join()))
	b.writeConst("altRegionISO3", altRegionISO3)
	b.writeSlice("altRegionIDs", altRegionIDs)

	// Create list of deprecated regions.
	// TODO: consider inserting SF -> FI. Not included by CLDR, but is the only
	// Transitionally-reserved mapping not included.
	regionOldMap := stringSet{}
	// Include regions in territoryAlias (not all are in the IANA registry!)
	for _, reg := range b.supp.Metadata.Alias.TerritoryAlias {
		if len(reg.Type) == 2 && reg.Reason == "deprecated" && len(reg.Replacement) == 2 {
			regionOldMap.add(reg.Type)
			regionOldMap.updateLater(reg.Type, reg.Replacement)
			i, _ := regionISO.find(reg.Type)
			j, _ := regionISO.find(reg.Replacement)
			// Stage 3: a deprecated region without a numeric code of its
			// own takes the code of its replacement.
			if k := m49map[i+isoOffset]; k == 0 {
				m49map[i+isoOffset] = m49map[j+isoOffset]
			}
		}
	}
	b.writeSortedMap("regionOldMap", &regionOldMap, func(s string) uint16 {
		return uint16(b.region.index(s))
	})
	// 3-digit region lookup, groupings.
	// Stage 4: the entries before isoOffset (index 0 excluded) are parsed as
	// numeric codes; they overwrite any reverse mapping stored earlier for
	// the same number.
	for i := 1; i < isoOffset; i++ {
		m := parseM49(b.region.s[i])
		m49map[i] = m
		fromM49map[m] = i
	}
	b.writeSlice("m49", m49map)

	const (
		searchBits = 7
		regionBits = 9
	)
	if len(m49map) >= 1<<regionBits {
		log.Fatalf("Maximum number of regions exceeded: %d > %d", len(m49map), 1<<regionBits)
	}
	m49Index := [9]int16{}
	fromM49 := []uint16{}
	m49 := []int{}
	for k, _ := range fromM49map {
		m49 = append(m49, int(k))
	}
	sort.Ints(m49)
	// The smallest key is skipped (presumably the 0 "no code" key — TODO
	// confirm). Each remaining key contributes one uint16 to fromM49: its
	// low searchBits bits shifted above a regionBits-wide field holding the
	// region index. Because the keys are visited in ascending order,
	// m49Index[1+(k>>searchBits)] ends up as the number of fromM49 entries
	// appended up to and including the last key of that bucket.
	for _, k := range m49[1:] {
		val := (k & (1<<searchBits - 1)) << regionBits
		fromM49 = append(fromM49, uint16(val|fromM49map[int16(k)]))
		m49Index[1:][k>>searchBits] = int16(len(fromM49))
	}
	b.writeSlice("m49Index", m49Index)
	b.writeSlice("fromM49", fromM49)
}
  1057  
// Space-separated lists of 2-letter region codes grouped by their status.
// NOTE(review): judging by the names these correspond to the ISO 3166
// "exceptionally reserved", "transitionally reserved" and "deleted" (as
// tracked by CLDR) categories — confirm against the standard before relying
// on this.
const (
	// TODO: put these lists in regionTypes as user data? Could be used for
	// various optimizations and refinements and could be exposed in the API.
	iso3166Except = "AC CP DG EA EU FX IC SU TA UK"
	iso3166Trans  = "AN BU CS NT TP YU ZR" // SF is not in our set of Regions.
	// DY and RH are actually not deleted, but indeterminately reserved.
	iso3166DelCLDR = "CT DD DY FQ HV JT MI NH NQ PC PU PZ RH VD WK YD"
)
  1066  
// Bit flags stored per region in the regionTypes table built by
// writeRegion.
const (
	// iso3166UserAssgined is set for 2-letter registry entries whose
	// description contains "Private use". (The identifier's spelling is
	// kept as is because it is referenced elsewhere.)
	iso3166UserAssgined = 1 << iota
	// ccTLD is set for regions whose lower-cased code is linked from the
	// IANA root zone database page.
	ccTLD
	// bcp47Region is set for every upper-case 2-letter key of the registry.
	bcp47Region
)
  1072  
  1073  func find(list []string, s string) int {
  1074  	for i, t := range list {
  1075  		if t == s {
  1076  			return i
  1077  		}
  1078  	}
  1079  	return -1
  1080  }
  1081  
  1082  // writeVariants generates per-variant information and creates a map from variant
  1083  // name to index value. We assign index values such that sorting multiple
  1084  // variants by index value will result in the correct order.
  1085  // There are two types of variants: specialized and general. Specialized variants
  1086  // are only applicable to certain language or language-script pairs. Generalized
  1087  // variants apply to any language. Generalized variants always sort after
  1088  // specialized variants.  We will therefore always assign a higher index value
  1089  // to a generalized variant than any other variant. Generalized variants are
  1090  // sorted alphabetically among themselves.
  1091  // Specialized variants may also sort after other specialized variants. Such
  1092  // variants will be ordered after any of the variants they may follow.
  1093  // We assume that if a variant x is followed by a variant y, then for any prefix
  1094  // p of x, p-x is a prefix of y. This allows us to order tags based on the
  1095  // maximum of the length of any of its prefixes.
  1096  // TODO: it is possible to define a set of Prefix values on variants such that
  1097  // a total order cannot be defined to the point that this algorithm breaks.
  1098  // In other words, we cannot guarantee the same order of variants for the
  1099  // future using the same algorithm or for non-compliant combinations of
  1100  // variants. For this reason, consider using simple alphabetic sorting
  1101  // of variants and ignore Prefix restrictions altogether.
  1102  func (b *builder) writeVariant() {
  1103  	generalized := stringSet{}
  1104  	specialized := stringSet{}
  1105  	specializedExtend := stringSet{}
  1106  	// Collate the variants by type and check assumptions.
  1107  	for _, v := range b.variant.slice() {
  1108  		e := b.registry[v]
  1109  		if len(e.prefix) == 0 {
  1110  			generalized.add(v)
  1111  			continue
  1112  		}
  1113  		c := strings.Split(e.prefix[0], "-")
  1114  		hasScriptOrRegion := false
  1115  		if len(c) > 1 {
  1116  			_, hasScriptOrRegion = b.script.find(c[1])
  1117  			if !hasScriptOrRegion {
  1118  				_, hasScriptOrRegion = b.region.find(c[1])
  1119  
  1120  			}
  1121  		}
  1122  		if len(c) == 1 || len(c) == 2 && hasScriptOrRegion {
  1123  			// Variant is preceded by a language.
  1124  			specialized.add(v)
  1125  			continue
  1126  		}
  1127  		// Variant is preceded by another variant.
  1128  		specializedExtend.add(v)
  1129  		prefix := c[0] + "-"
  1130  		if hasScriptOrRegion {
  1131  			prefix += c[1]
  1132  		}
  1133  		for _, p := range e.prefix {
  1134  			// Verify that the prefix minus the last element is a prefix of the
  1135  			// predecessor element.
  1136  			i := strings.LastIndex(p, "-")
  1137  			pred := b.registry[p[i+1:]]
  1138  			if find(pred.prefix, p[:i]) < 0 {
  1139  				log.Fatalf("prefix %q for variant %q not consistent with predecessor spec", p, v)
  1140  			}
  1141  			// The sorting used below does not work in the general case. It works
  1142  			// if we assume that variants that may be followed by others only have
  1143  			// prefixes of the same length. Verify this.
  1144  			count := strings.Count(p[:i], "-")
  1145  			for _, q := range pred.prefix {
  1146  				if c := strings.Count(q, "-"); c != count {
  1147  					log.Fatalf("variant %q preceding %q has a prefix %q of size %d; want %d", p[i+1:], v, q, c, count)
  1148  				}
  1149  			}
  1150  			if !strings.HasPrefix(p, prefix) {
  1151  				log.Fatalf("prefix %q of variant %q should start with %q", p, v, prefix)
  1152  			}
  1153  		}
  1154  	}
  1155  
  1156  	// Sort extended variants.
  1157  	a := specializedExtend.s
  1158  	less := func(v, w string) bool {
  1159  		// Sort by the maximum number of elements.
  1160  		maxCount := func(s string) (max int) {
  1161  			for _, p := range b.registry[s].prefix {
  1162  				if c := strings.Count(p, "-"); c > max {
  1163  					max = c
  1164  				}
  1165  			}
  1166  			return
  1167  		}
  1168  		if cv, cw := maxCount(v), maxCount(w); cv != cw {
  1169  			return cv < cw
  1170  		}
  1171  		// Sort by name as tie breaker.
  1172  		return v < w
  1173  	}
  1174  	sort.Sort(funcSorter{less, sort.StringSlice(a)})
  1175  	specializedExtend.frozen = true
  1176  
  1177  	// Create index from variant name to index.
  1178  	variantIndex := make(map[string]uint8)
  1179  	add := func(s []string) {
  1180  		for _, v := range s {
  1181  			variantIndex[v] = uint8(len(variantIndex))
  1182  		}
  1183  	}
  1184  	add(specialized.slice())
  1185  	add(specializedExtend.s)
  1186  	numSpecialized := len(variantIndex)
  1187  	add(generalized.slice())
  1188  	if n := len(variantIndex); n > 255 {
  1189  		log.Fatalf("maximum number of variants exceeded: was %d; want <= 255", n)
  1190  	}
  1191  	b.writeMap("variantIndex", variantIndex)
  1192  	b.writeConst("variantNumSpecialized", numSpecialized)
  1193  }
  1194  
// writeLanguageInfo is currently a no-op: its body is empty and it writes
// nothing.
func (b *builder) writeLanguageInfo() {
}
  1197  
// writeLikelyData writes tables that are used both for finding parent relations and for
// language matching.  Each entry contains additional bits to indicate the status of the
// data to know when it cannot be used for parent relations.
//
// The input is the CLDR likelySubtags list. Every mapping "from -> to" is
// routed, depending on which subtags "from" specifies, into one of:
//   - likelyLang / likelyLangList: keyed by language (index 0 is used for
//     a bare "und"),
//   - likelyScript: keyed by script, for "und_<Script>",
//   - likelyRegionGroup: keyed by group ID, for regions found in b.groups,
//   - likelyRegion / likelyRegionList: keyed by any other region.
//
// The local struct types are passed to b.writeType, so their names and
// field layout are part of the generated output.
func (b *builder) writeLikelyData() {
	const (
		// isList marks an entry whose other fields hold an offset into, and
		// a count of entries in, the corresponding *List table.
		isList = 1 << iota
		// scriptInFrom and regionInFrom record which extra subtag the
		// "from" side of a mapping specified.
		scriptInFrom
		regionInFrom
	)
	type ( // generated types
		likelyScriptRegion struct {
			region uint16
			script uint8
			flags  uint8
		}
		likelyLangScript struct {
			lang   uint16
			script uint8
			flags  uint8
		}
		likelyLangRegion struct {
			lang   uint16
			region uint16
		}
		// likelyTag is used for getting likely tags for group regions, where
		// the likely region might be a region contained in the group.
		likelyTag struct {
			lang   uint16
			region uint16
			script uint8
		}
	)
	var ( // generated variables
		likelyRegionGroup = make([]likelyTag, len(b.groups))
		likelyLang        = make([]likelyScriptRegion, len(b.lang.s))
		likelyRegion      = make([]likelyLangScript, len(b.region.s))
		likelyScript      = make([]likelyLangRegion, len(b.script.s))
		likelyLangList    = []likelyScriptRegion{}
		likelyRegionList  = []likelyLangScript{}
	)
	// fromTo holds the subtags of both sides of one likelySubtag mapping.
	type fromTo struct {
		from, to []string
	}
	// Mappings are first collected per language / per region, because one
	// key may have several mappings that must be emitted as a list.
	langToOther := map[int][]fromTo{}
	regionToOther := map[int][]fromTo{}
	for _, m := range b.supp.LikelySubtags.LikelySubtag {
		from := strings.Split(m.From, "_")
		to := strings.Split(m.To, "_")
		// Sanity checks on the shape of the data: "to" is always fully
		// specified, and "from" never changes language or region.
		if len(to) != 3 {
			log.Fatalf("invalid number of subtags in %q: found %d, want 3", m.To, len(to))
		}
		if len(from) > 3 {
			log.Fatalf("invalid number of subtags: found %d, want 1-3", len(from))
		}
		if from[0] != to[0] && from[0] != "und" {
			log.Fatalf("unexpected language change in expansion: %s -> %s", from, to)
		}
		if len(from) == 3 {
			if from[2] != to[2] {
				log.Fatalf("unexpected region change in expansion: %s -> %s", from, to)
			}
			if from[0] != "und" {
				log.Fatalf("unexpected fully specified from tag: %s -> %s", from, to)
			}
		}
		if len(from) == 1 || from[0] != "und" {
			// Keyed by language; a bare "und" is stored under id 0.
			id := 0
			if from[0] != "und" {
				id = b.lang.index(from[0])
			}
			langToOther[id] = append(langToOther[id], fromTo{from, to})
		} else if len(from) == 2 && len(from[1]) == 4 {
			// "und_<Script>": a 4-letter second subtag is a script.
			sid := b.script.index(from[1])
			likelyScript[sid].lang = uint16(b.langIndex(to[0]))
			likelyScript[sid].region = uint16(b.region.index(to[2]))
		} else {
			// "und_<Region>" or "und_<Script>_<Region>": keyed by the last
			// subtag, which is the region.
			r := b.region.index(from[len(from)-1])
			if id, ok := b.groups[r]; ok {
				if from[0] != "und" {
					log.Fatalf("region changed unexpectedly: %s -> %s", from, to)
				}
				likelyRegionGroup[id].lang = uint16(b.langIndex(to[0]))
				likelyRegionGroup[id].script = uint8(b.script.index(to[1]))
				likelyRegionGroup[id].region = uint16(b.region.index(to[2]))
			} else {
				regionToOther[r] = append(regionToOther[r], fromTo{from, to})
			}
		}
	}
	b.writeType(likelyLangRegion{})
	b.writeSlice("likelyScript", likelyScript)

	for id := range b.lang.s {
		list := langToOther[id]
		if len(list) == 1 {
			// A single mapping is stored inline.
			likelyLang[id].region = uint16(b.region.index(list[0].to[2]))
			likelyLang[id].script = uint8(b.script.index(list[0].to[1]))
		} else if len(list) > 1 {
			// Several mappings: region holds the offset into likelyLangList
			// and script holds the number of entries.
			likelyLang[id].flags = isList
			likelyLang[id].region = uint16(len(likelyLangList))
			likelyLang[id].script = uint8(len(list))
			for _, x := range list {
				flags := uint8(0)
				if len(x.from) > 1 {
					// The second "from" subtag is a region when it equals
					// the resulting region; otherwise it is taken to be a
					// script.
					if x.from[1] == x.to[2] {
						flags = regionInFrom
					} else {
						flags = scriptInFrom
					}
				}
				likelyLangList = append(likelyLangList, likelyScriptRegion{
					region: uint16(b.region.index(x.to[2])),
					script: uint8(b.script.index(x.to[1])),
					flags:  flags,
				})
			}
		}
	}
	// TODO: merge suppressScript data with this table.
	b.writeType(likelyScriptRegion{})
	b.writeSlice("likelyLang", likelyLang)
	b.writeSlice("likelyLangList", likelyLangList)

	for id := range b.region.s {
		list := regionToOther[id]
		if len(list) == 1 {
			// A single mapping is stored inline.
			likelyRegion[id].lang = uint16(b.langIndex(list[0].to[0]))
			likelyRegion[id].script = uint8(b.script.index(list[0].to[1]))
			if len(list[0].from) > 2 {
				likelyRegion[id].flags = scriptInFrom
			}
		} else if len(list) > 1 {
			// Several mappings: lang holds the offset into likelyRegionList
			// and script holds the number of entries.
			likelyRegion[id].flags = isList
			likelyRegion[id].lang = uint16(len(likelyRegionList))
			likelyRegion[id].script = uint8(len(list))
			for i, x := range list {
				// Only the first entry may omit the script; every later
				// entry must have exactly three "from" subtags.
				if len(x.from) == 2 && i != 0 || i > 0 && len(x.from) != 3 {
					log.Fatalf("unspecified script must be first in list: %v at %d", x.from, i)
				}
				x := likelyLangScript{
					lang:   uint16(b.langIndex(x.to[0])),
					script: uint8(b.script.index(x.to[1])),
				}
				// NOTE(review): this tests the first list element rather
				// than the current one (whose fromTo value is shadowed by
				// the new x above), so all entries of a list get the same
				// flag — confirm this is intended.
				if len(list[0].from) > 2 {
					x.flags = scriptInFrom
				}
				likelyRegionList = append(likelyRegionList, x)
			}
		}
	}
	b.writeType(likelyLangScript{})
	b.writeSlice("likelyRegion", likelyRegion)
	b.writeSlice("likelyRegionList", likelyRegionList)

	b.writeType(likelyTag{})
	b.writeSlice("likelyRegionGroup", likelyRegionGroup)
}
  1355  
// mutualIntelligibility is one entry of the generated matchLang table. The
// type itself is also emitted into the output through b.writeType, so its
// field layout is part of the generated code.
type mutualIntelligibility struct {
	// want and have are the language indices of the desired and the
	// supported language.
	want, have uint16
	// conf holds the raw percentage while entries are collected and sorted;
	// it is replaced by a confidence class (see toConf) before writing.
	conf uint8
	// oneway is true when the CLDR entry is marked oneway="true".
	oneway bool
}
  1361  
// scriptIntelligibility is one entry of the generated matchScript table.
// The type itself is also emitted into the output through b.writeType, so
// its field layout is part of the generated code.
type scriptIntelligibility struct {
	lang uint16 // langID or 0 if *
	// want and have are the script indices of the desired and the
	// supported script.
	want, have uint8
	// conf is the confidence class computed by toConf.
	conf uint8
}
  1367  
  1368  type sortByConf []mutualIntelligibility
  1369  
  1370  func (l sortByConf) Less(a, b int) bool {
  1371  	return l[a].conf > l[b].conf
  1372  }
  1373  
  1374  func (l sortByConf) Swap(a, b int) {
  1375  	l[a], l[b] = l[b], l[a]
  1376  }
  1377  
  1378  func (l sortByConf) Len() int {
  1379  	return len(l)
  1380  }
  1381  
  1382  // toConf converts a percentage value [0, 100] to a confidence class.
  1383  func toConf(pct uint8) uint8 {
  1384  	switch {
  1385  	case pct == 100:
  1386  		return 3 // Exact
  1387  	case pct >= 90:
  1388  		return 2 // High
  1389  	case pct > 50:
  1390  		return 1 // Low
  1391  	default:
  1392  		return 0 // No
  1393  	}
  1394  }
  1395  
  1396  // writeMatchData writes tables with languages and scripts for which there is
  1397  // mutual intelligibility. The data is based on CLDR's languageMatching data.
  1398  // Note that we use a different algorithm than the one defined by CLDR and that
  1399  // we slightly modify the data. For example, we convert scores to confidence levels.
  1400  // We also drop all region-related data as we use a different algorithm to
  1401  // determine region equivalence.
  1402  func (b *builder) writeMatchData() {
  1403  	b.writeType(mutualIntelligibility{})
  1404  	b.writeType(scriptIntelligibility{})
  1405  	lm := b.supp.LanguageMatching.LanguageMatches
  1406  	cldr.MakeSlice(&lm).SelectAnyOf("type", "written")
  1407  
  1408  	matchLang := []mutualIntelligibility{}
  1409  	matchScript := []scriptIntelligibility{}
  1410  	// Convert the languageMatch entries in lists keyed by desired language.
  1411  	for _, m := range lm[0].LanguageMatch {
  1412  		// Different versions of CLDR use different separators.
  1413  		desired := strings.Replace(m.Desired, "-", "_", -1)
  1414  		supported := strings.Replace(m.Supported, "-", "_", -1)
  1415  		d := strings.Split(desired, "_")
  1416  		s := strings.Split(supported, "_")
  1417  		if len(d) != len(s) || len(d) > 2 {
  1418  			// Skip all entries with regions and work around CLDR bug.
  1419  			continue
  1420  		}
  1421  		pct, _ := strconv.ParseInt(m.Percent, 10, 8)
  1422  		if len(d) == 2 && d[0] == s[0] && len(d[1]) == 4 {
  1423  			// language-script pair.
  1424  			lang := uint16(0)
  1425  			if d[0] != "*" {
  1426  				lang = uint16(b.langIndex(d[0]))
  1427  			}
  1428  			matchScript = append(matchScript, scriptIntelligibility{
  1429  				lang: lang,
  1430  				want: uint8(b.script.index(d[1])),
  1431  				have: uint8(b.script.index(s[1])),
  1432  				conf: toConf(uint8(pct)),
  1433  			})
  1434  			if m.Oneway != "true" {
  1435  				matchScript = append(matchScript, scriptIntelligibility{
  1436  					lang: lang,
  1437  					want: uint8(b.script.index(s[1])),
  1438  					have: uint8(b.script.index(d[1])),
  1439  					conf: toConf(uint8(pct)),
  1440  				})
  1441  			}
  1442  		} else if len(d) == 1 && d[0] != "*" {
  1443  			if pct == 100 {
  1444  				// nb == no is already handled by macro mapping. Check there
  1445  				// really is only this case.
  1446  				if d[0] != "no" || s[0] != "nb" {
  1447  					log.Fatalf("unhandled equivalence %s == %s", s[0], d[0])
  1448  				}
  1449  				continue
  1450  			}
  1451  			matchLang = append(matchLang, mutualIntelligibility{
  1452  				want:   uint16(b.langIndex(d[0])),
  1453  				have:   uint16(b.langIndex(s[0])),
  1454  				conf:   uint8(pct),
  1455  				oneway: m.Oneway == "true",
  1456  			})
  1457  		} else {
  1458  			// TODO: Handle other mappings.
  1459  			a := []string{"*;*", "*_*;*_*", "es_MX;es_419"}
  1460  			s := strings.Join([]string{desired, supported}, ";")
  1461  			if i := sort.SearchStrings(a, s); i == len(a) || a[i] != s {
  1462  				log.Printf("%q not handled", s)
  1463  			}
  1464  		}
  1465  	}
  1466  	sort.Stable(sortByConf(matchLang))
  1467  	// collapse percentage into confidence classes
  1468  	for i, m := range matchLang {
  1469  		matchLang[i].conf = toConf(m.conf)
  1470  	}
  1471  	b.writeSlice("matchLang", matchLang)
  1472  	b.writeSlice("matchScript", matchScript)
  1473  }
  1474  
// writeRegionInclusionData emits the tables regionContainment,
// regionInclusion, regionInclusionBits and regionInclusionNext, all derived
// from the CLDR territoryContainment groups and b.groups (which maps the
// region index of a group to its group index).
//
// Bit i of every bit vector built here stands for the group with index i.
// The statement order matters: bit-vector IDs are handed out by insertion
// order into bvs.
func (b *builder) writeRegionInclusionData() {
	var (
		// mm holds for each group the set of groups with a distance of 1.
		mm = make(map[int][]index)

		// containment holds for each group the transitive closure of
		// containment of other groups.
		containment = make(map[index][]index)
	)
	for _, g := range b.supp.TerritoryContainment.Group {
		group := b.region.index(g.Type)
		groupIdx := b.groups[group]
		for _, mem := range strings.Split(g.Contains, " ") {
			r := b.region.index(mem)
			// Every member records the group that directly contains it.
			mm[r] = append(mm[r], groupIdx)
			// A member that is itself a group is additionally recorded as
			// a neighbor of, and as directly contained in, the outer group.
			if g, ok := b.groups[r]; ok {
				mm[group] = append(mm[group], g)
				containment[groupIdx] = append(containment[groupIdx], g)
			}
		}
	}

	regionContainment := make([]uint32, len(b.groups))
	for _, g := range b.groups {
		l := containment[g]

		// Compute the transitive closure of containment.
		// l grows while it is being traversed; len(l) is re-evaluated on
		// every iteration so appended groups are expanded as well.
		for i := 0; i < len(l); i++ {
			l = append(l, containment[l[i]]...)
		}

		// Compute the bitmask.
		// The mask contains the group's own bit plus one bit per
		// (transitively) contained group.
		regionContainment[g] = 1 << g
		for _, v := range l {
			regionContainment[g] |= 1 << v
		}
		// log.Printf("%d: %X", g, regionContainment[g])
	}
	b.writeSlice("regionContainment", regionContainment)

	// regionInclusion maps a region index to the ID of its bit vector; bvs
	// maps each distinct bit vector to that ID.
	regionInclusion := make([]uint8, len(b.region.s))
	bvs := make(map[uint32]index)
	// Make the first bitvector positions correspond with the groups.
	for r, i := range b.groups {
		bv := uint32(1 << i)
		for _, g := range mm[r] {
			bv |= 1 << g
		}
		bvs[bv] = i
		regionInclusion[r] = uint8(bvs[bv])
	}
	// All remaining regions (index 0 is skipped) share bit vectors: a new
	// ID is only allocated for a bit vector not seen before.
	for r := 1; r < len(b.region.s); r++ {
		if _, ok := b.groups[r]; !ok {
			bv := uint32(0)
			for _, g := range mm[r] {
				bv |= 1 << g
			}
			if bv == 0 {
				// Pick the world for unspecified regions.
				bv = 1 << b.groups[b.region.index("001")]
			}
			if _, ok := bvs[bv]; !ok {
				bvs[bv] = index(len(bvs))
			}
			regionInclusion[r] = uint8(bvs[bv])
		}
	}
	b.writeSlice("regionInclusion", regionInclusion)
	// regionInclusionBits is the inverse of bvs: bit-vector ID -> bits.
	regionInclusionBits := make([]uint32, len(bvs))
	for k, v := range bvs {
		regionInclusionBits[v] = uint32(k)
	}
	// Add bit vectors for increasingly large distances until a fixed point is reached.
	// regionInclusionNext[i] is the ID of the bit vector obtained from
	// vector i by OR-ing in the vectors of all groups whose bit is set in
	// it. Newly created vectors are appended and processed by this same
	// loop, which ends once no new vector appears.
	regionInclusionNext := []uint8{}
	for i := 0; i < len(regionInclusionBits); i++ {
		bits := regionInclusionBits[i]
		next := bits
		// Note that this inner i shadows the outer loop variable and
		// iterates over group indices.
		for i := uint(0); i < uint(len(b.groups)); i++ {
			if bits&(1<<i) != 0 {
				next |= regionInclusionBits[i]
			}
		}
		if _, ok := bvs[next]; !ok {
			bvs[next] = index(len(bvs))
			regionInclusionBits = append(regionInclusionBits, next)
		}
		regionInclusionNext = append(regionInclusionNext, uint8(bvs[next]))
	}
	b.writeSlice("regionInclusionBits", regionInclusionBits)
	b.writeSlice("regionInclusionNext", regionInclusionNext)
}
  1566  
// parentRel is one entry of the generated parents table, built by
// writeParents from a CLDR parentLocale element. The type itself is also
// emitted into the output through b.writeType, so its field layout is part
// of the generated code.
type parentRel struct {
	// lang is the language index of the parent locale.
	lang uint16
	// script is the script index of the parent when the parent specifies a
	// script, and 0 otherwise.
	script uint8
	// maxScript equals script when the parent specifies one; otherwise it
	// is set to the index of "Latn".
	maxScript uint8
	// toRegion is the region index of the parent locale.
	toRegion uint16
	// fromRegion lists the region indices of the locales that have this
	// parent.
	fromRegion []uint16
}
  1574  
  1575  func (b *builder) writeParents() {
  1576  	b.writeType(parentRel{})
  1577  
  1578  	parents := []parentRel{}
  1579  
  1580  	// Construct parent overrides.
  1581  	n := 0
  1582  	for _, p := range b.data.Supplemental().ParentLocales.ParentLocale {
  1583  		// Skipping non-standard scripts to root is implemented using addTags.
  1584  		if p.Parent == "root" {
  1585  			continue
  1586  		}
  1587  
  1588  		sub := strings.Split(p.Parent, "_")
  1589  		parent := parentRel{lang: b.langIndex(sub[0])}
  1590  		if len(sub) == 2 {
  1591  			// TODO: check that all undefined scripts are indeed Latn in these
  1592  			// cases.
  1593  			parent.maxScript = uint8(b.script.index("Latn"))
  1594  			parent.toRegion = uint16(b.region.index(sub[1]))
  1595  		} else {
  1596  			parent.script = uint8(b.script.index(sub[1]))
  1597  			parent.maxScript = parent.script
  1598  			parent.toRegion = uint16(b.region.index(sub[2]))
  1599  		}
  1600  		for _, c := range strings.Split(p.Locales, " ") {
  1601  			region := b.region.index(c[strings.LastIndex(c, "_")+1:])
  1602  			parent.fromRegion = append(parent.fromRegion, uint16(region))
  1603  		}
  1604  		parents = append(parents, parent)
  1605  		n += len(parent.fromRegion)
  1606  	}
  1607  	b.writeSliceAddSize("parents", n*2, parents)
  1608  }
  1609  
// main drives the table generation. It repackages gen_common.go as
// common.go for package "language", then builds all tables through a code
// writer whose contents are written to tables.go (package "language") by
// the deferred WriteGoFile call once main returns.
//
// The write* calls run in a fixed order; computeRegionGroups is invoked
// before the writers that read b.groups (writeLikelyData and
// writeRegionInclusionData).
//
// NOTE(review): the output file name is hard-coded here rather than taken
// from the -output flag declared at the top of the file — confirm whether
// that is intended.
func main() {
	gen.Init()

	gen.Repackage("gen_common.go", "common.go", "language")

	w := gen.NewCodeWriter()
	defer w.WriteGoFile("tables.go", "language")

	// The generated file needs the internal tag package.
	fmt.Fprintln(w, `import "github.com/insionng/yougam/libraries/x/text/internal/tag"`)

	b := newBuilder(w)
	gen.WriteCLDRVersion(w)

	b.parseIndices()
	b.writeType(fromTo{})
	b.writeLanguage()
	b.writeScript()
	b.writeRegion()
	b.writeVariant()
	// TODO: b.writeLocale()
	b.computeRegionGroups()
	b.writeLikelyData()
	b.writeMatchData()
	b.writeRegionInclusionData()
	b.writeParents()
}