github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/text/language/maketables.go (about)

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build ignore
     6  
     7  // Language tag table generator.
     8  // Data read from the web.
     9  
    10  package main
    11  
    12  import (
    13  	"bufio"
    14  	"flag"
    15  	"fmt"
    16  	"io"
    17  	"io/ioutil"
    18  	"log"
    19  	"math"
    20  	"reflect"
    21  	"regexp"
    22  	"sort"
    23  	"strconv"
    24  	"strings"
    25  
    26  	"golang.org/x/text/internal/gen"
    27  	"golang.org/x/text/internal/tag"
    28  	"golang.org/x/text/unicode/cldr"
    29  )
    30  
// Command-line flags of the generator.
var (
	// test is the -test flag; see its usage string for the intended mode.
	test = flag.Bool("test",
		false,
		"test existing tables; can be used to compare web data with package data.")
	// outputFile is the -output flag: the file the generated tables are
	// written to (default "tables.go").
	outputFile = flag.String("output",
		"tables.go",
		"output file for generated tables")
)
    39  
// comment holds the documentation texts that builder.comment writes in front
// of generated declarations. Every entry starts with a newline followed by the
// name of the declaration it documents: init uses that first word (up to the
// first space) as the key in commentIndex, so it must match the name passed
// to builder.comment exactly.
var comment = []string{
	`
lang holds an alphabetically sorted list of ISO-639 language identifiers.
All entries are 4 bytes. The index of the identifier (divided by 4) is the language tag.
For 2-byte language identifiers, the two successive bytes have the following meaning:
    - if the first letter of the 2- and 3-letter ISO codes are the same:
      the second and third letter of the 3-letter ISO code.
    - otherwise: a 0 and a by 2 bits right-shifted index into altLangISO3.
For 3-byte language identifiers the 4th byte is 0.`,
	`
langNoIndex is a bit vector of all 3-letter language codes that are not used as an index
in lookup tables. The language ids for these language codes are derived directly
from the letters and are not consecutive.`,
	`
altLangISO3 holds an alphabetically sorted list of 3-letter language code alternatives
to 2-letter language codes that cannot be derived using the method described above.
Each 3-letter code is followed by its 1-byte langID.`,
	`
altLangIndex is used to convert indexes in altLangISO3 to langIDs.`,
	`
langAliasMap maps langIDs to their suggested replacements.`,
	`
script is an alphabetically sorted list of ISO 15924 codes. The index
of the script in the string, divided by 4, is the internal scriptID.`,
	`
isoRegionOffset needs to be added to the index of regionISO to obtain the regionID
for 2-letter ISO codes. (The first isoRegionOffset regionIDs are reserved for
the UN.M49 codes used for groups.)`,
	`
regionISO holds a list of alphabetically sorted 2-letter ISO region codes.
Each 2-letter codes is followed by two bytes with the following meaning:
    - [A-Z}{2}: the first letter of the 2-letter code plus these two 
                letters form the 3-letter ISO code.
    - 0, n:     index into altRegionISO3.`,
	`
regionTypes defines the status of a region for various standards.`,
	`
m49 maps regionIDs to UN.M49 codes. The first isoRegionOffset entries are
codes indicating collections of regions.`,
	`
m49Index gives indexes into fromM49 based on the three most significant bits
of a 10-bit UN.M49 code. To search an UN.M49 code in fromM49, search in
   fromM49[m49Index[msb39(code)]:m49Index[msb3(code)+1]]
for an entry where the first 7 bits match the 7 lsb of the UN.M49 code.
The region code is stored in the 9 lsb of the indexed value.`,
	`
fromM49 contains entries to map UN.M49 codes to regions. See m49Index for details.`,
	`
altRegionISO3 holds a list of 3-letter region codes that cannot be
mapped to 2-letter codes using the default algorithm. This is a short list.`,
	`
altRegionIDs holds a list of regionIDs the positions of which match those
of the 3-letter ISO codes in altRegionISO3.`,
	`
variantNumSpecialized is the number of specialized variants in variants.`,
	`
suppressScript is an index from langID to the dominant script for that language,
if it exists.  If a script is given, it should be suppressed from the language tag.`,
	`
likelyLang is a lookup table, indexed by langID, for the most likely
scripts and regions given incomplete information. If more entries exist for a
given language, region and script are the index and size respectively
of the list in likelyLangList.`,
	`
likelyLangList holds lists info associated with likelyLang.`,
	`
likelyRegion is a lookup table, indexed by regionID, for the most likely
languages and scripts given incomplete information. If more entries exist
for a given regionID, lang and script are the index and size respectively
of the list in likelyRegionList.
TODO: exclude containers and user-definable regions from the list.`,
	`
likelyRegionList holds lists info associated with likelyRegion.`,
	`
likelyScript is a lookup table, indexed by scriptID, for the most likely
languages and regions given a script.`,
	`
matchLang holds pairs of langIDs of base languages that are typically
mutually intelligible. Each pair is associated with a confidence and
whether the intelligibility goes one or both ways.`,
	`
matchScript holds pairs of scriptIDs where readers of one script
can typically also read the other. Each is associated with a confidence.`,
	`
nRegionGroups is the number of region groups.`,
	`
regionInclusion maps region identifiers to sets of regions in regionInclusionBits,
where each set holds all groupings that are directly connected in a region
containment graph.`,
	`
regionInclusionBits is an array of bit vectors where every vector represents
a set of region groupings.  These sets are used to compute the distance
between two regions for the purpose of language matching.`,
	`
regionInclusionNext marks, for each entry in regionInclusionBits, the set of
all groups that are reachable from the groups set in the respective entry.`,
}
   137  
   138  // TODO: consider changing some of these structures to tries. This can reduce
   139  // memory, but may increase the need for memory allocations. This could be
   140  // mitigated if we can piggyback on language tags for common cases.
   141  
   142  func failOnError(e error) {
   143  	if e != nil {
   144  		log.Panic(e)
   145  	}
   146  }
   147  
   148  type setType int
   149  
   150  const (
   151  	Indexed setType = 1 + iota // all elements must be of same size
   152  	Linear
   153  )
   154  
   155  type stringSet struct {
   156  	s              []string
   157  	sorted, frozen bool
   158  
   159  	// We often need to update values after the creation of an index is completed.
   160  	// We include a convenience map for keeping track of this.
   161  	update map[string]string
   162  	typ    setType // used for checking.
   163  }
   164  
   165  func (ss *stringSet) clone() stringSet {
   166  	c := *ss
   167  	c.s = append([]string(nil), c.s...)
   168  	return c
   169  }
   170  
   171  func (ss *stringSet) setType(t setType) {
   172  	if ss.typ != t && ss.typ != 0 {
   173  		log.Panicf("type %d cannot be assigned as it was already %d", t, ss.typ)
   174  	}
   175  }
   176  
   177  // parse parses a whitespace-separated string and initializes ss with its
   178  // components.
   179  func (ss *stringSet) parse(s string) {
   180  	scan := bufio.NewScanner(strings.NewReader(s))
   181  	scan.Split(bufio.ScanWords)
   182  	for scan.Scan() {
   183  		ss.add(scan.Text())
   184  	}
   185  }
   186  
   187  func (ss *stringSet) assertChangeable() {
   188  	if ss.frozen {
   189  		log.Panic("attempt to modify a frozen stringSet")
   190  	}
   191  }
   192  
   193  func (ss *stringSet) add(s string) {
   194  	ss.assertChangeable()
   195  	ss.s = append(ss.s, s)
   196  	ss.sorted = ss.frozen
   197  }
   198  
   199  func (ss *stringSet) freeze() {
   200  	ss.compact()
   201  	ss.frozen = true
   202  }
   203  
   204  func (ss *stringSet) compact() {
   205  	if ss.sorted {
   206  		return
   207  	}
   208  	a := ss.s
   209  	sort.Strings(a)
   210  	k := 0
   211  	for i := 1; i < len(a); i++ {
   212  		if a[k] != a[i] {
   213  			a[k+1] = a[i]
   214  			k++
   215  		}
   216  	}
   217  	ss.s = a[:k+1]
   218  	ss.sorted = ss.frozen
   219  }
   220  
   221  type funcSorter struct {
   222  	fn func(a, b string) bool
   223  	sort.StringSlice
   224  }
   225  
   226  func (s funcSorter) Less(i, j int) bool {
   227  	return s.fn(s.StringSlice[i], s.StringSlice[j])
   228  }
   229  
   230  func (ss *stringSet) sortFunc(f func(a, b string) bool) {
   231  	ss.compact()
   232  	sort.Sort(funcSorter{f, sort.StringSlice(ss.s)})
   233  }
   234  
   235  func (ss *stringSet) remove(s string) {
   236  	ss.assertChangeable()
   237  	if i, ok := ss.find(s); ok {
   238  		copy(ss.s[i:], ss.s[i+1:])
   239  		ss.s = ss.s[:len(ss.s)-1]
   240  	}
   241  }
   242  
   243  func (ss *stringSet) replace(ol, nu string) {
   244  	ss.s[ss.index(ol)] = nu
   245  	ss.sorted = ss.frozen
   246  }
   247  
   248  func (ss *stringSet) index(s string) int {
   249  	ss.setType(Indexed)
   250  	i, ok := ss.find(s)
   251  	if !ok {
   252  		if i < len(ss.s) {
   253  			log.Panicf("find: item %q is not in list. Closest match is %q.", s, ss.s[i])
   254  		}
   255  		log.Panicf("find: item %q is not in list", s)
   256  
   257  	}
   258  	return i
   259  }
   260  
   261  func (ss *stringSet) find(s string) (int, bool) {
   262  	ss.compact()
   263  	i := sort.SearchStrings(ss.s, s)
   264  	return i, i != len(ss.s) && ss.s[i] == s
   265  }
   266  
   267  func (ss *stringSet) slice() []string {
   268  	ss.compact()
   269  	return ss.s
   270  }
   271  
   272  func (ss *stringSet) updateLater(v, key string) {
   273  	if ss.update == nil {
   274  		ss.update = map[string]string{}
   275  	}
   276  	ss.update[v] = key
   277  }
   278  
   279  // join joins the string and ensures that all entries are of the same length.
   280  func (ss *stringSet) join() string {
   281  	ss.setType(Indexed)
   282  	n := len(ss.s[0])
   283  	for _, s := range ss.s {
   284  		if len(s) != n {
   285  			log.Panicf("join: not all entries are of the same length: %q", s)
   286  		}
   287  	}
   288  	ss.s = append(ss.s, strings.Repeat("\xff", n))
   289  	return strings.Join(ss.s, "")
   290  }
   291  
// ianaEntry holds information for an entry in the IANA Language Subtag Repository.
// All types use the same entry.
// See http://tools.ietf.org/html/bcp47#section-5.1 for a description of the various
// fields.
//
// The field comments name the registry field from which parseRegistry fills
// each member.
type ianaEntry struct {
	typ            string   // "Type:"
	description    []string // one element per "Description:" field
	scope          string   // "Scope:"
	added          string   // "Added:"
	preferred      string   // "Preferred-Value:"
	deprecated     string   // "Deprecated:"
	suppressScript string   // "Suppress-Script:"
	macro          string   // "Macrolanguage:"
	prefix         []string // one element per "Prefix:" field
}
   307  
// builder holds the decoded CLDR data, the parsed IANA registry and the
// string indices derived from them, together with the writers the generated
// code is sent to.
type builder struct {
	w    *gen.CodeWriter
	hw   io.Writer // MultiWriter for w and w.Hash
	data *cldr.CLDR
	supp *cldr.SupplementalData

	// indices
	locale      stringSet // common locales
	lang        stringSet // canonical language ids (2 or 3 letter ISO codes) with data
	langNoIndex stringSet // 3-letter ISO codes with no associated data
	script      stringSet // 4-letter ISO codes
	region      stringSet // 2-letter ISO or 3-digit UN M49 codes
	variant     stringSet // 4-8-alphanumeric variant code.

	// Region codes that are groups with their corresponding group IDs.
	groups map[int]index

	// langInfo
	// registry maps a tag or subtag to its entry; filled by parseRegistry.
	registry map[string]*ianaEntry
}

// index is the ID assigned to a region group (see builder.groups).
type index uint
   330  
   331  func newBuilder(w *gen.CodeWriter) *builder {
   332  	r := gen.OpenCLDRCoreZip()
   333  	defer r.Close()
   334  	d := &cldr.Decoder{}
   335  	data, err := d.DecodeZip(r)
   336  	failOnError(err)
   337  	b := builder{
   338  		w:    w,
   339  		hw:   io.MultiWriter(w, w.Hash),
   340  		data: data,
   341  		supp: data.Supplemental(),
   342  	}
   343  	b.parseRegistry()
   344  	return &b
   345  }
   346  
// parseRegistry reads the IANA language subtag registry and fills b.registry
// with one entry per tag or subtag.
//
// The file is scanned word by word. A recognized field name is expected to be
// followed by a single-word value; only "Description:" values may span
// several words. Words that are not recognized field names are skipped one at
// a time. It panics if the scanner reports an error.
func (b *builder) parseRegistry() {
	r := gen.OpenIANAFile("assignments/language-subtag-registry")
	defer r.Close()
	b.registry = make(map[string]*ianaEntry)

	scan := bufio.NewScanner(r)
	scan.Split(bufio.ScanWords)
	// record is the entry currently being filled in. It is replaced each time
	// a "Type:" field is seen.
	var record *ianaEntry
	for more := scan.Scan(); more; {
		// Treat the current word as a field name and the following word as
		// its value.
		key := scan.Text()
		more = scan.Scan()
		value := scan.Text()
		switch key {
		case "Type:":
			record = &ianaEntry{typ: value}
		case "Subtag:", "Tag:":
			// A value of the form "x..y" denotes a range: every tag from x up
			// to and including y is registered with the same record.
			if s := strings.SplitN(value, "..", 2); len(s) > 1 {
				for a := s[0]; a <= s[1]; a = inc(a) {
					b.addToRegistry(a, record)
				}
			} else {
				b.addToRegistry(value, record)
			}
		case "Suppress-Script:":
			record.suppressScript = value
		case "Added:":
			record.added = value
		case "Deprecated:":
			record.deprecated = value
		case "Macrolanguage:":
			record.macro = value
		case "Preferred-Value:":
			record.preferred = value
		case "Prefix:":
			record.prefix = append(record.prefix, value)
		case "Scope:":
			record.scope = value
		case "Description:":
			// Collect words until one starts with '%' or ends in ':', which
			// is taken to be a record separator or the next field name.
			buf := []byte(value)
			for more = scan.Scan(); more; more = scan.Scan() {
				b := scan.Bytes() // NOTE: shadows the receiver b inside this loop.
				if b[0] == '%' || b[len(b)-1] == ':' {
					break
				}
				buf = append(buf, ' ')
				buf = append(buf, b...)
			}
			record.description = append(record.description, string(buf))
			// The scanner already points at the next candidate key.
			continue
		default:
			// Unknown word: do not advance, so that the word just read as
			// value is examined as a key in the next iteration.
			continue
		}
		// Move on to the next key.
		more = scan.Scan()
	}
	if scan.Err() != nil {
		log.Panic(scan.Err())
	}
}
   405  
   406  func (b *builder) addToRegistry(key string, entry *ianaEntry) {
   407  	if info, ok := b.registry[key]; ok {
   408  		if info.typ != "language" || entry.typ != "extlang" {
   409  			log.Fatalf("parseRegistry: tag %q already exists", key)
   410  		}
   411  	} else {
   412  		b.registry[key] = entry
   413  	}
   414  }
   415  
   416  var commentIndex = make(map[string]string)
   417  
   418  func init() {
   419  	for _, s := range comment {
   420  		key := strings.TrimSpace(strings.SplitN(s, " ", 2)[0])
   421  		commentIndex[key] = s
   422  	}
   423  }
   424  
   425  func (b *builder) comment(name string) {
   426  	if s := commentIndex[name]; len(s) > 0 {
   427  		b.w.WriteComment(s)
   428  	} else {
   429  		fmt.Fprintln(b.w)
   430  	}
   431  }
   432  
   433  func (b *builder) pf(f string, x ...interface{}) {
   434  	fmt.Fprintf(b.hw, f, x...)
   435  	fmt.Fprint(b.hw, "\n")
   436  }
   437  
   438  func (b *builder) p(x ...interface{}) {
   439  	fmt.Fprintln(b.hw, x...)
   440  }
   441  
// addSize adds s to the running size total kept by the code writer and
// writes a "// Size: s bytes" comment line to the output.
func (b *builder) addSize(s int) {
	b.w.Size += s
	b.pf("// Size: %d bytes", s)
}
   446  
// writeConst writes the documentation registered for name (or an empty line
// if there is none) and then has the code writer emit the constant name with
// value x.
func (b *builder) writeConst(name string, x interface{}) {
	b.comment(name)
	b.w.WriteConst(name, x)
}
   451  
   452  // writeConsts computes f(v) for all v in values and writes the results
   453  // as constants named _v to a single constant block.
   454  func (b *builder) writeConsts(f func(string) int, values ...string) {
   455  	b.pf("const (")
   456  	for _, v := range values {
   457  		b.pf("\t_%s = %v", v, f(v))
   458  	}
   459  	b.pf(")")
   460  }
   461  
// writeType writes the type of the given value, which must be a struct.
// The documentation is looked up in commentIndex under the name of the
// value's type.
func (b *builder) writeType(value interface{}) {
	b.comment(reflect.TypeOf(value).Name())
	b.w.WriteType(value)
}
   467  
// writeSlice writes ss as a variable called name. It is shorthand for
// writeSliceAddSize with no extra size.
func (b *builder) writeSlice(name string, ss interface{}) {
	b.writeSliceAddSize(name, 0, ss)
}
   471  
   472  func (b *builder) writeSliceAddSize(name string, extraSize int, ss interface{}) {
   473  	b.comment(name)
   474  	b.w.Size += extraSize
   475  	v := reflect.ValueOf(ss)
   476  	t := v.Type().Elem()
   477  	b.pf("// Size: %d bytes, %d elements", v.Len()*int(t.Size())+extraSize, v.Len())
   478  
   479  	fmt.Fprintf(b.w, "var %s = ", name)
   480  	b.w.WriteArray(ss)
   481  	b.p()
   482  }
   483  
   484  type fromTo struct {
   485  	from, to uint16
   486  }
   487  
   488  func (b *builder) writeSortedMap(name string, ss *stringSet, index func(s string) uint16) {
   489  	ss.sortFunc(func(a, b string) bool {
   490  		return index(a) < index(b)
   491  	})
   492  	m := []fromTo{}
   493  	for _, s := range ss.s {
   494  		m = append(m, fromTo{index(s), index(ss.update[s])})
   495  	}
   496  	b.writeSlice(name, m)
   497  }
   498  
   499  const base = 'z' - 'a' + 1
   500  
   501  func strToInt(s string) uint {
   502  	v := uint(0)
   503  	for i := 0; i < len(s); i++ {
   504  		v *= base
   505  		v += uint(s[i] - 'a')
   506  	}
   507  	return v
   508  }
   509  
   510  // converts the given integer to the original ASCII string passed to strToInt.
   511  // len(s) must match the number of characters obtained.
   512  func intToStr(v uint, s []byte) {
   513  	for i := len(s) - 1; i >= 0; i-- {
   514  		s[i] = byte(v%base) + 'a'
   515  		v /= base
   516  	}
   517  }
   518  
   519  func (b *builder) writeBitVector(name string, ss []string) {
   520  	vec := make([]uint8, int(math.Ceil(math.Pow(base, float64(len(ss[0])))/8)))
   521  	for _, s := range ss {
   522  		v := strToInt(s)
   523  		vec[v/8] |= 1 << (v % 8)
   524  	}
   525  	b.writeSlice(name, vec)
   526  }
   527  
   528  // TODO: convert this type into a list or two-stage trie.
   529  func (b *builder) writeMapFunc(name string, m map[string]string, f func(string) uint16) {
   530  	b.comment(name)
   531  	v := reflect.ValueOf(m)
   532  	sz := v.Len() * (2 + int(v.Type().Key().Size()))
   533  	for _, k := range m {
   534  		sz += len(k)
   535  	}
   536  	b.addSize(sz)
   537  	keys := []string{}
   538  	b.pf(`var %s = map[string]uint16{`, name)
   539  	for k := range m {
   540  		keys = append(keys, k)
   541  	}
   542  	sort.Strings(keys)
   543  	for _, k := range keys {
   544  		b.pf("\t%q: %v,", k, f(m[k]))
   545  	}
   546  	b.p("}")
   547  }
   548  
   549  func (b *builder) writeMap(name string, m interface{}) {
   550  	b.comment(name)
   551  	v := reflect.ValueOf(m)
   552  	sz := v.Len() * (2 + int(v.Type().Key().Size()) + int(v.Type().Elem().Size()))
   553  	b.addSize(sz)
   554  	f := strings.FieldsFunc(fmt.Sprintf("%#v", m), func(r rune) bool {
   555  		return strings.IndexRune("{}, ", r) != -1
   556  	})
   557  	sort.Strings(f[1:])
   558  	b.pf(`var %s = %s{`, name, f[0])
   559  	for _, kv := range f[1:] {
   560  		b.pf("\t%s,", kv)
   561  	}
   562  	b.p("}")
   563  }
   564  
   565  func (b *builder) langIndex(s string) uint16 {
   566  	if s == "und" {
   567  		return 0
   568  	}
   569  	if i, ok := b.lang.find(s); ok {
   570  		return uint16(i)
   571  	}
   572  	return uint16(strToInt(s)) + uint16(len(b.lang.s))
   573  }
   574  
   575  // inc advances the string to its lexicographical successor.
   576  func inc(s string) string {
   577  	const maxTagLength = 4
   578  	var buf [maxTagLength]byte
   579  	intToStr(strToInt(strings.ToLower(s))+1, buf[:len(s)])
   580  	for i := 0; i < len(s); i++ {
   581  		if s[i] <= 'Z' {
   582  			buf[i] -= 'a' - 'A'
   583  		}
   584  	}
   585  	return string(buf[:len(s)])
   586  }
   587  
   588  func (b *builder) parseIndices() {
   589  	meta := b.supp.Metadata
   590  
   591  	for k, v := range b.registry {
   592  		var ss *stringSet
   593  		switch v.typ {
   594  		case "language":
   595  			if len(k) == 2 || v.suppressScript != "" || v.scope == "special" {
   596  				b.lang.add(k)
   597  				continue
   598  			} else {
   599  				ss = &b.langNoIndex
   600  			}
   601  		case "region":
   602  			ss = &b.region
   603  		case "script":
   604  			ss = &b.script
   605  		case "variant":
   606  			ss = &b.variant
   607  		default:
   608  			continue
   609  		}
   610  		ss.add(k)
   611  	}
   612  	// Include any language for which there is data.
   613  	for _, lang := range b.data.Locales() {
   614  		if x := b.data.RawLDML(lang); false ||
   615  			x.LocaleDisplayNames != nil ||
   616  			x.Characters != nil ||
   617  			x.Delimiters != nil ||
   618  			x.Measurement != nil ||
   619  			x.Dates != nil ||
   620  			x.Numbers != nil ||
   621  			x.Units != nil ||
   622  			x.ListPatterns != nil ||
   623  			x.Collations != nil ||
   624  			x.Segmentations != nil ||
   625  			x.Rbnf != nil ||
   626  			x.Annotations != nil ||
   627  			x.Metadata != nil {
   628  
   629  			from := strings.Split(lang, "_")
   630  			if lang := from[0]; lang != "root" {
   631  				b.lang.add(lang)
   632  			}
   633  		}
   634  	}
   635  	// Include languages in likely subtags.
   636  	for _, m := range b.supp.LikelySubtags.LikelySubtag {
   637  		from := strings.Split(m.From, "_")
   638  		b.lang.add(from[0])
   639  	}
   640  	// Include ISO-639 alpha-3 bibliographic entries.
   641  	for _, a := range meta.Alias.LanguageAlias {
   642  		if a.Reason == "bibliographic" {
   643  			b.langNoIndex.add(a.Type)
   644  		}
   645  	}
   646  	// Include regions in territoryAlias (not all are in the IANA registry!)
   647  	for _, reg := range b.supp.Metadata.Alias.TerritoryAlias {
   648  		if len(reg.Type) == 2 {
   649  			b.region.add(reg.Type)
   650  		}
   651  	}
   652  
   653  	for _, s := range b.lang.s {
   654  		if len(s) == 3 {
   655  			b.langNoIndex.remove(s)
   656  		}
   657  	}
   658  	b.writeConst("numLanguages", len(b.lang.slice())+len(b.langNoIndex.slice()))
   659  	b.writeConst("numScripts", len(b.script.slice()))
   660  	b.writeConst("numRegions", len(b.region.slice()))
   661  
   662  	// Add dummy codes at the start of each list to represent "unspecified".
   663  	b.lang.add("---")
   664  	b.script.add("----")
   665  	b.region.add("---")
   666  
   667  	// common locales
   668  	b.locale.parse(meta.DefaultContent.Locales)
   669  }
   670  
   671  func (b *builder) computeRegionGroups() {
   672  	b.groups = make(map[int]index)
   673  
   674  	// Create group indices.
   675  	for i := 1; b.region.s[i][0] < 'A'; i++ { // Base M49 indices on regionID.
   676  		b.groups[i] = index(len(b.groups))
   677  	}
   678  	for _, g := range b.supp.TerritoryContainment.Group {
   679  		group := b.region.index(g.Type)
   680  		if _, ok := b.groups[group]; !ok {
   681  			b.groups[group] = index(len(b.groups))
   682  		}
   683  	}
   684  	if len(b.groups) > 32 {
   685  		log.Fatalf("only 32 groups supported, found %d", len(b.groups))
   686  	}
   687  	b.writeConst("nRegionGroups", len(b.groups))
   688  }
   689  
// langConsts lists the languages for which writeLanguage emits a constant
// named _<code> that holds the language's index (see builder.langIndex).
var langConsts = []string{
	"af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
	"et", "fa", "fi", "fil", "fr", "gu", "he", "hi", "hr", "hu", "hy", "id", "is",
	"it", "ja", "ka", "kk", "km", "kn", "ko", "ky", "lo", "lt", "lv", "mk", "ml",
	"mn", "mo", "mr", "ms", "mul", "my", "nb", "ne", "nl", "no", "pa", "pl", "pt",
	"ro", "ru", "sh", "si", "sk", "sl", "sq", "sr", "sv", "sw", "ta", "te", "th",
	"tl", "tn", "tr", "uk", "ur", "uz", "vi", "zh", "zu",

	// constants for grandfathered tags (if not already defined)
	"jbo", "ami", "bnn", "hak", "tlh", "lb", "nv", "pwn", "tao", "tay", "tsu",
	"nn", "sfb", "vgt", "sgg", "cmn", "nan", "hsn",
}
   702  
// writeLanguage generates all tables needed for language canonicalization.
//
// In order, it writes the language constants, the lang index string,
// langNoIndexOffset, the langNoIndex bit vector, altLangISO3 with
// altLangIndex, and langAliasMap with langAliasTypes.
func (b *builder) writeLanguage() {
	meta := b.supp.Metadata

	b.writeConst("nonCanonicalUnd", b.lang.index("und"))
	b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...)
	b.writeConst("langPrivateStart", b.langIndex("qaa"))
	b.writeConst("langPrivateEnd", b.langIndex("qtz"))

	// Get language codes that need to be mapped (overlong 3-letter codes,
	// deprecated 2-letter codes, legacy and grandfathered tags.)
	langAliasMap := stringSet{}
	aliasTypeMap := map[string]langAliasType{}

	// altLangISO3 get the alternative ISO3 names that need to be mapped.
	altLangISO3 := stringSet{}
	// Add dummy start to avoid the use of index 0.
	altLangISO3.add("---")
	altLangISO3.updateLater("---", "aa")

	// Work on a copy of the language index; its update map records the
	// 3-letter code that belongs to each 2-letter code.
	lang := b.lang.clone()
	for _, a := range meta.Alias.LanguageAlias {
		if a.Replacement == "" {
			a.Replacement = "und"
		}
		// TODO: support mapping to tags
		repl := strings.SplitN(a.Replacement, "_", 2)[0]
		if a.Reason == "overlong" {
			// An overlong alias pairs a 3-letter code with its 2-letter form.
			if len(a.Replacement) == 2 && len(a.Type) == 3 {
				lang.updateLater(a.Replacement, a.Type)
			}
		} else if len(a.Type) <= 3 {
			switch a.Reason {
			case "macrolanguage":
				aliasTypeMap[a.Type] = langMacro
			case "deprecated":
				// handled elsewhere
				continue
			case "bibliographic", "legacy":
				if a.Type == "no" {
					continue
				}
				aliasTypeMap[a.Type] = langLegacy
			default:
				// An unknown reason means this code needs to be extended.
				log.Fatalf("new %s alias: %s", a.Reason, a.Type)
			}
			langAliasMap.add(a.Type)
			langAliasMap.updateLater(a.Type, repl)
		}
	}
	// Manually add the mapping of "nb" (Norwegian) to its macro language.
	// This can be removed if CLDR adopts this change.
	langAliasMap.add("nb")
	langAliasMap.updateLater("nb", "no")
	aliasTypeMap["nb"] = langMacro

	for k, v := range b.registry {
		// Also add deprecated values for 3-letter ISO codes, which CLDR omits.
		if v.typ == "language" && v.deprecated != "" && v.preferred != "" {
			langAliasMap.add(k)
			langAliasMap.updateLater(k, v.preferred)
			aliasTypeMap[k] = langDeprecated
		}
	}
	// Fix CLDR mappings.
	lang.updateLater("tl", "tgl")
	lang.updateLater("sh", "hbs")
	lang.updateLater("mo", "mol")
	lang.updateLater("no", "nor")
	lang.updateLater("tw", "twi")
	lang.updateLater("nb", "nob")
	lang.updateLater("ak", "aka")

	// Ensure that each 2-letter code is matched with a 3-letter code.
	// The first element is skipped.
	for _, v := range lang.s[1:] {
		s, ok := lang.update[v]
		if !ok {
			// No direct mapping: fall back to the mapping of the language
			// that v is an alias of, if there is one.
			if s, ok = lang.update[langAliasMap.update[v]]; !ok {
				continue
			}
			lang.update[v] = s
		}
		// A 3-letter code that starts with a different letter cannot be
		// derived from the 2-letter code and is stored in altLangISO3.
		if v[0] != s[0] {
			altLangISO3.add(s)
			altLangISO3.updateLater(s, v)
		}
	}

	// Complete canonicalized language tags.
	lang.freeze()
	for i, v := range lang.s {
		// We can avoid these manual entries by using the IANA registry directly.
		// Seems easier to update the list manually, as changes are rare.
		// The panic in this loop will trigger if we miss an entry.
		add := ""
		if s, ok := lang.update[v]; ok {
			if s[0] == v[0] {
				// Same first letter: append the rest of the 3-letter code.
				add = s[1:]
			} else {
				// Otherwise append a 0 byte and the index into altLangISO3.
				add = string([]byte{0, byte(altLangISO3.index(s))})
			}
		} else if len(v) == 3 {
			// 3-letter codes are padded to 4 bytes with a 0 byte.
			add = "\x00"
		} else {
			log.Panicf("no data for long form of %q", v)
		}
		lang.s[i] += add
	}
	b.writeConst("lang", tag.Index(lang.join()))

	b.writeConst("langNoIndexOffset", len(b.lang.s))

	// space of all valid 3-letter language identifiers.
	b.writeBitVector("langNoIndex", b.langNoIndex.slice())

	altLangIndex := []uint16{}
	for i, s := range altLangISO3.slice() {
		// Each entry is followed by one byte: the position its language will
		// have in altLangIndex.
		altLangISO3.s[i] += string([]byte{byte(len(altLangIndex))})
		if i > 0 {
			// The dummy entry at position 0 has no language of its own.
			idx := b.lang.index(altLangISO3.update[s])
			altLangIndex = append(altLangIndex, uint16(idx))
		}
	}
	b.writeConst("altLangISO3", tag.Index(altLangISO3.join()))
	b.writeSlice("altLangIndex", altLangIndex)

	b.writeSortedMap("langAliasMap", &langAliasMap, b.langIndex)
	// langAliasTypes follows the element order that writeSortedMap left in
	// langAliasMap.
	types := make([]langAliasType, len(langAliasMap.s))
	for i, s := range langAliasMap.s {
		types[i] = aliasTypeMap[s]
	}
	b.writeSlice("langAliasTypes", types)
}
   836  
// scriptConsts lists the scripts for which writeScript emits a constant named
// _<code> that holds the script's index in the script set.
var scriptConsts = []string{
	"Latn", "Hani", "Hans", "Hant", "Qaaa", "Qaai", "Qabx", "Zinh", "Zyyy",
	"Zzzz",
}
   841  
   842  func (b *builder) writeScript() {
   843  	b.writeConsts(b.script.index, scriptConsts...)
   844  	b.writeConst("script", tag.Index(b.script.join()))
   845  
   846  	supp := make([]uint8, len(b.lang.slice()))
   847  	for i, v := range b.lang.slice()[1:] {
   848  		if sc := b.registry[v].suppressScript; sc != "" {
   849  			supp[i+1] = uint8(b.script.index(sc))
   850  		}
   851  	}
   852  	b.writeSlice("suppressScript", supp)
   853  
   854  	// There is only one deprecated script in CLDR. This value is hard-coded.
   855  	// We check here if the code must be updated.
   856  	for _, a := range b.supp.Metadata.Alias.ScriptAlias {
   857  		if a.Type != "Qaai" {
   858  			log.Panicf("unexpected deprecated stript %q", a.Type)
   859  		}
   860  	}
   861  }
   862  
   863  func parseM49(s string) int16 {
   864  	if len(s) == 0 {
   865  		return 0
   866  	}
   867  	v, err := strconv.ParseUint(s, 10, 10)
   868  	failOnError(err)
   869  	return int16(v)
   870  }
   871  
// regionConsts lists the regions for which writeRegion emits a constant named
// _<code> that holds the region's index in the region set.
var regionConsts = []string{
	"001", "419", "BR", "CA", "ES", "GB", "MD", "PT", "UK", "US",
	"ZZ", "XA", "XC", "XK", // Unofficial tag for Kosovo.
}
   876  
   877  func (b *builder) writeRegion() {
   878  	b.writeConsts(b.region.index, regionConsts...)
   879  
   880  	isoOffset := b.region.index("AA")
   881  	m49map := make([]int16, len(b.region.slice()))
   882  	fromM49map := make(map[int16]int)
   883  	altRegionISO3 := ""
   884  	altRegionIDs := []uint16{}
   885  
   886  	b.writeConst("isoRegionOffset", isoOffset)
   887  
   888  	// 2-letter region lookup and mapping to numeric codes.
   889  	regionISO := b.region.clone()
   890  	regionISO.s = regionISO.s[isoOffset:]
   891  	regionISO.sorted = false
   892  
   893  	regionTypes := make([]byte, len(b.region.s))
   894  
   895  	// Is the region valid BCP 47?
   896  	for s, e := range b.registry {
   897  		if len(s) == 2 && s == strings.ToUpper(s) {
   898  			i := b.region.index(s)
   899  			for _, d := range e.description {
   900  				if strings.Contains(d, "Private use") {
   901  					regionTypes[i] = iso3166UserAssgined
   902  				}
   903  			}
   904  			regionTypes[i] |= bcp47Region
   905  		}
   906  	}
   907  
   908  	// Is the region a valid ccTLD?
   909  	r := gen.OpenIANAFile("domains/root/db")
   910  	defer r.Close()
   911  
   912  	buf, err := ioutil.ReadAll(r)
   913  	failOnError(err)
   914  	re := regexp.MustCompile(`"/domains/root/db/([a-z]{2}).html"`)
   915  	for _, m := range re.FindAllSubmatch(buf, -1) {
   916  		i := b.region.index(strings.ToUpper(string(m[1])))
   917  		regionTypes[i] |= ccTLD
   918  	}
   919  
   920  	b.writeSlice("regionTypes", regionTypes)
   921  
   922  	iso3Set := make(map[string]int)
   923  	update := func(iso2, iso3 string) {
   924  		i := regionISO.index(iso2)
   925  		if j, ok := iso3Set[iso3]; !ok && iso3[0] == iso2[0] {
   926  			regionISO.s[i] += iso3[1:]
   927  			iso3Set[iso3] = -1
   928  		} else {
   929  			if ok && j >= 0 {
   930  				regionISO.s[i] += string([]byte{0, byte(j)})
   931  			} else {
   932  				iso3Set[iso3] = len(altRegionISO3)
   933  				regionISO.s[i] += string([]byte{0, byte(len(altRegionISO3))})
   934  				altRegionISO3 += iso3
   935  				altRegionIDs = append(altRegionIDs, uint16(isoOffset+i))
   936  			}
   937  		}
   938  	}
   939  	for _, tc := range b.supp.CodeMappings.TerritoryCodes {
   940  		i := regionISO.index(tc.Type) + isoOffset
   941  		if d := m49map[i]; d != 0 {
   942  			log.Panicf("%s found as a duplicate UN.M49 code of %03d", tc.Numeric, d)
   943  		}
   944  		m49 := parseM49(tc.Numeric)
   945  		m49map[i] = m49
   946  		if r := fromM49map[m49]; r == 0 {
   947  			fromM49map[m49] = i
   948  		} else if r != i {
   949  			dep := b.registry[regionISO.s[r-isoOffset]].deprecated
   950  			if t := b.registry[tc.Type]; t != nil && dep != "" && (t.deprecated == "" || t.deprecated > dep) {
   951  				fromM49map[m49] = i
   952  			}
   953  		}
   954  	}
   955  	for _, ta := range b.supp.Metadata.Alias.TerritoryAlias {
   956  		if len(ta.Type) == 3 && ta.Type[0] <= '9' && len(ta.Replacement) == 2 {
   957  			from := parseM49(ta.Type)
   958  			if r := fromM49map[from]; r == 0 {
   959  				fromM49map[from] = regionISO.index(ta.Replacement) + isoOffset
   960  			}
   961  		}
   962  	}
   963  	for _, tc := range b.supp.CodeMappings.TerritoryCodes {
   964  		if len(tc.Alpha3) == 3 {
   965  			update(tc.Type, tc.Alpha3)
   966  		}
   967  	}
   968  	// This entries are not included in territoryCodes. Mostly 3-letter variants
   969  	// of deleted codes and an entry for QU.
   970  	for _, m := range []struct{ iso2, iso3 string }{
   971  		{"CT", "CTE"},
   972  		{"DY", "DHY"},
   973  		{"HV", "HVO"},
   974  		{"JT", "JTN"},
   975  		{"MI", "MID"},
   976  		{"NH", "NHB"},
   977  		{"NQ", "ATN"},
   978  		{"PC", "PCI"},
   979  		{"PU", "PUS"},
   980  		{"PZ", "PCZ"},
   981  		{"RH", "RHO"},
   982  		{"VD", "VDR"},
   983  		{"WK", "WAK"},
   984  		// These three-letter codes are used for others as well.
   985  		{"FQ", "ATF"},
   986  	} {
   987  		update(m.iso2, m.iso3)
   988  	}
   989  	for i, s := range regionISO.s {
   990  		if len(s) != 4 {
   991  			regionISO.s[i] = s + "  "
   992  		}
   993  	}
   994  	b.writeConst("regionISO", tag.Index(regionISO.join()))
   995  	b.writeConst("altRegionISO3", altRegionISO3)
   996  	b.writeSlice("altRegionIDs", altRegionIDs)
   997  
   998  	// Create list of deprecated regions.
   999  	// TODO: consider inserting SF -> FI. Not included by CLDR, but is the only
  1000  	// Transitionally-reserved mapping not included.
  1001  	regionOldMap := stringSet{}
  1002  	// Include regions in territoryAlias (not all are in the IANA registry!)
  1003  	for _, reg := range b.supp.Metadata.Alias.TerritoryAlias {
  1004  		if len(reg.Type) == 2 && reg.Reason == "deprecated" && len(reg.Replacement) == 2 {
  1005  			regionOldMap.add(reg.Type)
  1006  			regionOldMap.updateLater(reg.Type, reg.Replacement)
  1007  			i, _ := regionISO.find(reg.Type)
  1008  			j, _ := regionISO.find(reg.Replacement)
  1009  			if k := m49map[i+isoOffset]; k == 0 {
  1010  				m49map[i+isoOffset] = m49map[j+isoOffset]
  1011  			}
  1012  		}
  1013  	}
  1014  	b.writeSortedMap("regionOldMap", &regionOldMap, func(s string) uint16 {
  1015  		return uint16(b.region.index(s))
  1016  	})
  1017  	// 3-digit region lookup, groupings.
  1018  	for i := 1; i < isoOffset; i++ {
  1019  		m := parseM49(b.region.s[i])
  1020  		m49map[i] = m
  1021  		fromM49map[m] = i
  1022  	}
  1023  	b.writeSlice("m49", m49map)
  1024  
  1025  	const (
  1026  		searchBits = 7
  1027  		regionBits = 9
  1028  	)
  1029  	if len(m49map) >= 1<<regionBits {
  1030  		log.Fatalf("Maximum number of regions exceeded: %d > %d", len(m49map), 1<<regionBits)
  1031  	}
  1032  	m49Index := [9]int16{}
  1033  	fromM49 := []uint16{}
  1034  	m49 := []int{}
  1035  	for k, _ := range fromM49map {
  1036  		m49 = append(m49, int(k))
  1037  	}
  1038  	sort.Ints(m49)
  1039  	for _, k := range m49[1:] {
  1040  		val := (k & (1<<searchBits - 1)) << regionBits
  1041  		fromM49 = append(fromM49, uint16(val|fromM49map[int16(k)]))
  1042  		m49Index[1:][k>>searchBits] = int16(len(fromM49))
  1043  	}
  1044  	b.writeSlice("m49Index", m49Index)
  1045  	b.writeSlice("fromM49", fromM49)
  1046  }
  1047  
  1048  const (
  1049  	// TODO: put these lists in regionTypes as user data? Could be used for
  1050  	// various optimizations and refinements and could be exposed in the API.
  1051  	iso3166Except = "AC CP DG EA EU FX IC SU TA UK"
  1052  	iso3166Trans  = "AN BU CS NT TP YU ZR" // SF is not in our set of Regions.
  1053  	// DY and RH are actually not deleted, but indeterminately reserved.
  1054  	iso3166DelCLDR = "CT DD DY FQ HV JT MI NH NQ PC PU PZ RH VD WK YD"
  1055  )
  1056  
  1057  const (
  1058  	iso3166UserAssgined = 1 << iota
  1059  	ccTLD
  1060  	bcp47Region
  1061  )
  1062  
  1063  func find(list []string, s string) int {
  1064  	for i, t := range list {
  1065  		if t == s {
  1066  			return i
  1067  		}
  1068  	}
  1069  	return -1
  1070  }
  1071  
  1072  // writeVariants generates per-variant information and creates a map from variant
  1073  // name to index value. We assign index values such that sorting multiple
  1074  // variants by index value will result in the correct order.
  1075  // There are two types of variants: specialized and general. Specialized variants
  1076  // are only applicable to certain language or language-script pairs. Generalized
  1077  // variants apply to any language. Generalized variants always sort after
  1078  // specialized variants.  We will therefore always assign a higher index value
  1079  // to a generalized variant than any other variant. Generalized variants are
  1080  // sorted alphabetically among themselves.
  1081  // Specialized variants may also sort after other specialized variants. Such
  1082  // variants will be ordered after any of the variants they may follow.
  1083  // We assume that if a variant x is followed by a variant y, then for any prefix
  1084  // p of x, p-x is a prefix of y. This allows us to order tags based on the
  1085  // maximum of the length of any of its prefixes.
  1086  // TODO: it is possible to define a set of Prefix values on variants such that
  1087  // a total order cannot be defined to the point that this algorithm breaks.
  1088  // In other words, we cannot guarantee the same order of variants for the
  1089  // future using the same algorithm or for non-compliant combinations of
  1090  // variants. For this reason, consider using simple alphabetic sorting
  1091  // of variants and ignore Prefix restrictions altogether.
  1092  func (b *builder) writeVariant() {
  1093  	generalized := stringSet{}
  1094  	specialized := stringSet{}
  1095  	specializedExtend := stringSet{}
  1096  	// Collate the variants by type and check assumptions.
  1097  	for _, v := range b.variant.slice() {
  1098  		e := b.registry[v]
  1099  		if len(e.prefix) == 0 {
  1100  			generalized.add(v)
  1101  			continue
  1102  		}
  1103  		c := strings.Split(e.prefix[0], "-")
  1104  		hasScriptOrRegion := false
  1105  		if len(c) > 1 {
  1106  			_, hasScriptOrRegion = b.script.find(c[1])
  1107  			if !hasScriptOrRegion {
  1108  				_, hasScriptOrRegion = b.region.find(c[1])
  1109  
  1110  			}
  1111  		}
  1112  		if len(c) == 1 || len(c) == 2 && hasScriptOrRegion {
  1113  			// Variant is preceded by a language.
  1114  			specialized.add(v)
  1115  			continue
  1116  		}
  1117  		// Variant is preceded by another variant.
  1118  		specializedExtend.add(v)
  1119  		prefix := c[0] + "-"
  1120  		if hasScriptOrRegion {
  1121  			prefix += c[1]
  1122  		}
  1123  		for _, p := range e.prefix {
  1124  			// Verify that the prefix minus the last element is a prefix of the
  1125  			// predecessor element.
  1126  			i := strings.LastIndex(p, "-")
  1127  			pred := b.registry[p[i+1:]]
  1128  			if find(pred.prefix, p[:i]) < 0 {
  1129  				log.Fatalf("prefix %q for variant %q not consistent with predecessor spec", p, v)
  1130  			}
  1131  			// The sorting used below does not work in the general case. It works
  1132  			// if we assume that variants that may be followed by others only have
  1133  			// prefixes of the same length. Verify this.
  1134  			count := strings.Count(p[:i], "-")
  1135  			for _, q := range pred.prefix {
  1136  				if c := strings.Count(q, "-"); c != count {
  1137  					log.Fatalf("variant %q preceding %q has a prefix %q of size %d; want %d", p[i+1:], v, q, c, count)
  1138  				}
  1139  			}
  1140  			if !strings.HasPrefix(p, prefix) {
  1141  				log.Fatalf("prefix %q of variant %q should start with %q", p, v, prefix)
  1142  			}
  1143  		}
  1144  	}
  1145  
  1146  	// Sort extended variants.
  1147  	a := specializedExtend.s
  1148  	less := func(v, w string) bool {
  1149  		// Sort by the maximum number of elements.
  1150  		maxCount := func(s string) (max int) {
  1151  			for _, p := range b.registry[s].prefix {
  1152  				if c := strings.Count(p, "-"); c > max {
  1153  					max = c
  1154  				}
  1155  			}
  1156  			return
  1157  		}
  1158  		if cv, cw := maxCount(v), maxCount(w); cv != cw {
  1159  			return cv < cw
  1160  		}
  1161  		// Sort by name as tie breaker.
  1162  		return v < w
  1163  	}
  1164  	sort.Sort(funcSorter{less, sort.StringSlice(a)})
  1165  	specializedExtend.frozen = true
  1166  
  1167  	// Create index from variant name to index.
  1168  	variantIndex := make(map[string]uint8)
  1169  	add := func(s []string) {
  1170  		for _, v := range s {
  1171  			variantIndex[v] = uint8(len(variantIndex))
  1172  		}
  1173  	}
  1174  	add(specialized.slice())
  1175  	add(specializedExtend.s)
  1176  	numSpecialized := len(variantIndex)
  1177  	add(generalized.slice())
  1178  	if n := len(variantIndex); n > 255 {
  1179  		log.Fatalf("maximum number of variants exceeded: was %d; want <= 255", n)
  1180  	}
  1181  	b.writeMap("variantIndex", variantIndex)
  1182  	b.writeConst("variantNumSpecialized", numSpecialized)
  1183  }
  1184  
// writeLanguageInfo currently does nothing: its body is empty.
func (b *builder) writeLanguageInfo() {
}
  1187  
// writeLikelyData writes tables that are used both for finding parent relations and for
// language matching.  Each entry contains additional bits to indicate the status of the
// data to know when it cannot be used for parent relations.
//
// The tables are built from the CLDR likelySubtag entries and written in this
// order: likelyScript, likelyLang, likelyLangList, likelyRegion,
// likelyRegionList and likelyRegionGroup, each preceded by its element type.
func (b *builder) writeLikelyData() {
	const (
		// isList marks an entry whose first field is an offset into the
		// corresponding list table and whose script field is the number of
		// list elements.
		isList = 1 << iota
		scriptInFrom
		regionInFrom
	)
	type ( // generated types
		likelyScriptRegion struct {
			region uint16
			script uint8
			flags  uint8
		}
		likelyLangScript struct {
			lang   uint16
			script uint8
			flags  uint8
		}
		likelyLangRegion struct {
			lang   uint16
			region uint16
		}
		// likelyTag is used for getting likely tags for group regions, where
		// the likely region might be a region contained in the group.
		likelyTag struct {
			lang   uint16
			region uint16
			script uint8
		}
	)
	var ( // generated variables
		likelyRegionGroup = make([]likelyTag, len(b.groups))
		likelyLang        = make([]likelyScriptRegion, len(b.lang.s))
		likelyRegion      = make([]likelyLangScript, len(b.region.s))
		likelyScript      = make([]likelyLangRegion, len(b.script.s))
		likelyLangList    = []likelyScriptRegion{}
		likelyRegionList  = []likelyLangScript{}
	)
	// fromTo holds the split subtags of one likelySubtag entry.
	type fromTo struct {
		from, to []string
	}
	// langToOther and regionToOther collect the entries keyed by language
	// index and region index respectively; they are resolved after the loop.
	langToOther := map[int][]fromTo{}
	regionToOther := map[int][]fromTo{}
	for _, m := range b.supp.LikelySubtags.LikelySubtag {
		from := strings.Split(m.From, "_")
		to := strings.Split(m.To, "_")
		// Validate the shape of the entry before classifying it.
		if len(to) != 3 {
			log.Fatalf("invalid number of subtags in %q: found %d, want 3", m.To, len(to))
		}
		if len(from) > 3 {
			log.Fatalf("invalid number of subtags: found %d, want 1-3", len(from))
		}
		if from[0] != to[0] && from[0] != "und" {
			log.Fatalf("unexpected language change in expansion: %s -> %s", from, to)
		}
		if len(from) == 3 {
			if from[2] != to[2] {
				log.Fatalf("unexpected region change in expansion: %s -> %s", from, to)
			}
			if from[0] != "und" {
				log.Fatalf("unexpected fully specified from tag: %s -> %s", from, to)
			}
		}
		if len(from) == 1 || from[0] != "und" {
			// Keyed by language; "und" alone uses id 0.
			id := 0
			if from[0] != "und" {
				id = b.lang.index(from[0])
			}
			langToOther[id] = append(langToOther[id], fromTo{from, to})
		} else if len(from) == 2 && len(from[1]) == 4 {
			// und_Script: a 4-letter second subtag is taken to be a script.
			sid := b.script.index(from[1])
			likelyScript[sid].lang = uint16(b.langIndex(to[0]))
			likelyScript[sid].region = uint16(b.region.index(to[2]))
		} else {
			// The last subtag of from is a region, possibly a group region.
			r := b.region.index(from[len(from)-1])
			if id, ok := b.groups[r]; ok {
				if from[0] != "und" {
					log.Fatalf("region changed unexpectedly: %s -> %s", from, to)
				}
				likelyRegionGroup[id].lang = uint16(b.langIndex(to[0]))
				likelyRegionGroup[id].script = uint8(b.script.index(to[1]))
				likelyRegionGroup[id].region = uint16(b.region.index(to[2]))
			} else {
				regionToOther[r] = append(regionToOther[r], fromTo{from, to})
			}
		}
	}
	b.writeType(likelyLangRegion{})
	b.writeSlice("likelyScript", likelyScript)

	for id := range b.lang.s {
		list := langToOther[id]
		if len(list) == 1 {
			likelyLang[id].region = uint16(b.region.index(list[0].to[2]))
			likelyLang[id].script = uint8(b.script.index(list[0].to[1]))
		} else if len(list) > 1 {
			// Several entries: store offset and count, and append the entries
			// to likelyLangList.
			likelyLang[id].flags = isList
			likelyLang[id].region = uint16(len(likelyLangList))
			likelyLang[id].script = uint8(len(list))
			for _, x := range list {
				flags := uint8(0)
				if len(x.from) > 1 {
					// The second from subtag is a region if it equals the
					// target region; otherwise it is treated as a script.
					if x.from[1] == x.to[2] {
						flags = regionInFrom
					} else {
						flags = scriptInFrom
					}
				}
				likelyLangList = append(likelyLangList, likelyScriptRegion{
					region: uint16(b.region.index(x.to[2])),
					script: uint8(b.script.index(x.to[1])),
					flags:  flags,
				})
			}
		}
	}
	// TODO: merge suppressScript data with this table.
	b.writeType(likelyScriptRegion{})
	b.writeSlice("likelyLang", likelyLang)
	b.writeSlice("likelyLangList", likelyLangList)

	for id := range b.region.s {
		list := regionToOther[id]
		if len(list) == 1 {
			likelyRegion[id].lang = uint16(b.langIndex(list[0].to[0]))
			likelyRegion[id].script = uint8(b.script.index(list[0].to[1]))
			if len(list[0].from) > 2 {
				likelyRegion[id].flags = scriptInFrom
			}
		} else if len(list) > 1 {
			// Several entries: store offset and count, and append the entries
			// to likelyRegionList.
			likelyRegion[id].flags = isList
			likelyRegion[id].lang = uint16(len(likelyRegionList))
			likelyRegion[id].script = uint8(len(list))
			for i, x := range list {
				if len(x.from) == 2 && i != 0 || i > 0 && len(x.from) != 3 {
					log.Fatalf("unspecified script must be first in list: %v at %d", x.from, i)
				}
				x := likelyLangScript{
					lang:   uint16(b.langIndex(x.to[0])),
					script: uint8(b.script.index(x.to[1])),
				}
				// NOTE(review): this tests the first list element for every
				// entry rather than the current one; confirm this is intended
				// before changing it, as the generated table depends on it.
				if len(list[0].from) > 2 {
					x.flags = scriptInFrom
				}
				likelyRegionList = append(likelyRegionList, x)
			}
		}
	}
	b.writeType(likelyLangScript{})
	b.writeSlice("likelyRegion", likelyRegion)
	b.writeSlice("likelyRegionList", likelyRegionList)

	b.writeType(likelyTag{})
	b.writeSlice("likelyRegionGroup", likelyRegionGroup)
}
  1345  
// mutualIntelligibility is one entry of the generated matchLang table. It is
// filled in by writeMatchData from a CLDR languageMatch element that names two
// plain languages.
type mutualIntelligibility struct {
	want, have uint16 // language indices of the desired and the supported language
	conf       uint8  // confidence class as returned by toConf
	oneway     bool   // true if the element's oneway attribute is "true"
}
  1351  
// scriptIntelligibility is one entry of the generated matchScript table. It
// is filled in by writeMatchData from a CLDR languageMatch element that names
// two language-script pairs with the same language.
type scriptIntelligibility struct {
	lang       uint16 // langID or 0 if *
	want, have uint8  // script indices of the desired and the supported script
	conf       uint8  // confidence class as returned by toConf
}
  1357  
  1358  type sortByConf []mutualIntelligibility
  1359  
  1360  func (l sortByConf) Less(a, b int) bool {
  1361  	return l[a].conf > l[b].conf
  1362  }
  1363  
  1364  func (l sortByConf) Swap(a, b int) {
  1365  	l[a], l[b] = l[b], l[a]
  1366  }
  1367  
  1368  func (l sortByConf) Len() int {
  1369  	return len(l)
  1370  }
  1371  
  1372  // toConf converts a percentage value [0, 100] to a confidence class.
  1373  func toConf(pct uint8) uint8 {
  1374  	switch {
  1375  	case pct == 100:
  1376  		return 3 // Exact
  1377  	case pct >= 90:
  1378  		return 2 // High
  1379  	case pct > 50:
  1380  		return 1 // Low
  1381  	default:
  1382  		return 0 // No
  1383  	}
  1384  }
  1385  
  1386  // writeMatchData writes tables with languages and scripts for which there is
  1387  // mutual intelligibility. The data is based on CLDR's languageMatching data.
  1388  // Note that we use a different algorithm than the one defined by CLDR and that
  1389  // we slightly modify the data. For example, we convert scores to confidence levels.
  1390  // We also drop all region-related data as we use a different algorithm to
  1391  // determine region equivalence.
  1392  func (b *builder) writeMatchData() {
  1393  	b.writeType(mutualIntelligibility{})
  1394  	b.writeType(scriptIntelligibility{})
  1395  	lm := b.supp.LanguageMatching.LanguageMatches
  1396  	cldr.MakeSlice(&lm).SelectAnyOf("type", "written")
  1397  
  1398  	matchLang := []mutualIntelligibility{}
  1399  	matchScript := []scriptIntelligibility{}
  1400  	// Convert the languageMatch entries in lists keyed by desired language.
  1401  	for _, m := range lm[0].LanguageMatch {
  1402  		// Different versions of CLDR use different separators.
  1403  		desired := strings.Replace(m.Desired, "-", "_", -1)
  1404  		supported := strings.Replace(m.Supported, "-", "_", -1)
  1405  		d := strings.Split(desired, "_")
  1406  		s := strings.Split(supported, "_")
  1407  		if len(d) != len(s) || len(d) > 2 {
  1408  			// Skip all entries with regions and work around CLDR bug.
  1409  			continue
  1410  		}
  1411  		pct, _ := strconv.ParseInt(m.Percent, 10, 8)
  1412  		if len(d) == 2 && d[0] == s[0] && len(d[1]) == 4 {
  1413  			// language-script pair.
  1414  			lang := uint16(0)
  1415  			if d[0] != "*" {
  1416  				lang = uint16(b.langIndex(d[0]))
  1417  			}
  1418  			matchScript = append(matchScript, scriptIntelligibility{
  1419  				lang: lang,
  1420  				want: uint8(b.script.index(d[1])),
  1421  				have: uint8(b.script.index(s[1])),
  1422  				conf: toConf(uint8(pct)),
  1423  			})
  1424  			if m.Oneway != "true" {
  1425  				matchScript = append(matchScript, scriptIntelligibility{
  1426  					lang: lang,
  1427  					want: uint8(b.script.index(s[1])),
  1428  					have: uint8(b.script.index(d[1])),
  1429  					conf: toConf(uint8(pct)),
  1430  				})
  1431  			}
  1432  		} else if len(d) == 1 && d[0] != "*" {
  1433  			if pct == 100 {
  1434  				// nb == no is already handled by macro mapping. Check there
  1435  				// really is only this case.
  1436  				if d[0] != "no" || s[0] != "nb" {
  1437  					log.Fatalf("unhandled equivalence %s == %s", s[0], d[0])
  1438  				}
  1439  				continue
  1440  			}
  1441  			matchLang = append(matchLang, mutualIntelligibility{
  1442  				want:   uint16(b.langIndex(d[0])),
  1443  				have:   uint16(b.langIndex(s[0])),
  1444  				conf:   uint8(pct),
  1445  				oneway: m.Oneway == "true",
  1446  			})
  1447  		} else {
  1448  			// TODO: Handle other mappings.
  1449  			a := []string{"*;*", "*_*;*_*", "es_MX;es_419"}
  1450  			s := strings.Join([]string{desired, supported}, ";")
  1451  			if i := sort.SearchStrings(a, s); i == len(a) || a[i] != s {
  1452  				log.Printf("%q not handled", s)
  1453  			}
  1454  		}
  1455  	}
  1456  	sort.Sort(sortByConf(matchLang))
  1457  	// collapse percentage into confidence classes
  1458  	for i, m := range matchLang {
  1459  		matchLang[i].conf = toConf(m.conf)
  1460  	}
  1461  	b.writeSlice("matchLang", matchLang)
  1462  	b.writeSlice("matchScript", matchScript)
  1463  }
  1464  
// writeRegionInclusionData writes the tables regionContainment,
// regionInclusion, regionInclusionBits and regionInclusionNext, derived from
// the CLDR territory containment groups and the group ids in b.groups.
func (b *builder) writeRegionInclusionData() {
	var (
		// mm holds for each group the set of groups with a distance of 1.
		mm = make(map[int][]index)

		// containment holds for each group the transitive closure of
		// containment of other groups.
		containment = make(map[index][]index)
	)
	for _, g := range b.supp.TerritoryContainment.Group {
		group := b.region.index(g.Type)
		groupIdx := b.groups[group]
		for _, mem := range strings.Split(g.Contains, " ") {
			r := b.region.index(mem)
			// Every member records the group that contains it.
			mm[r] = append(mm[r], groupIdx)
			// Note: g is shadowed here and denotes the member's group id.
			if g, ok := b.groups[r]; ok {
				// The member is itself a group: link it both ways.
				mm[group] = append(mm[group], g)
				containment[groupIdx] = append(containment[groupIdx], g)
			}
		}
	}

	regionContainment := make([]uint32, len(b.groups))
	for _, g := range b.groups {
		l := containment[g]

		// Compute the transitive closure of containment.
		// l grows while it is being traversed, so groups reached indirectly
		// are expanded as well.
		for i := 0; i < len(l); i++ {
			l = append(l, containment[l[i]]...)
		}

		// Compute the bitmask.
		regionContainment[g] = 1 << g
		for _, v := range l {
			regionContainment[g] |= 1 << v
		}
		// log.Printf("%d: %X", g, regionContainment[g])
	}
	b.writeSlice("regionContainment", regionContainment)

	regionInclusion := make([]uint8, len(b.region.s))
	// bvs maps a bit vector to its position in regionInclusionBits.
	bvs := make(map[uint32]index)
	// Make the first bitvector positions correspond with the groups.
	for r, i := range b.groups {
		bv := uint32(1 << i)
		for _, g := range mm[r] {
			bv |= 1 << g
		}
		bvs[bv] = i
		regionInclusion[r] = uint8(bvs[bv])
	}
	// Regions that are not groups share positions for identical bit vectors;
	// new vectors are appended after the group positions.
	for r := 1; r < len(b.region.s); r++ {
		if _, ok := b.groups[r]; !ok {
			bv := uint32(0)
			for _, g := range mm[r] {
				bv |= 1 << g
			}
			if bv == 0 {
				// Pick the world for unspecified regions.
				bv = 1 << b.groups[b.region.index("001")]
			}
			if _, ok := bvs[bv]; !ok {
				bvs[bv] = index(len(bvs))
			}
			regionInclusion[r] = uint8(bvs[bv])
		}
	}
	b.writeSlice("regionInclusion", regionInclusion)
	regionInclusionBits := make([]uint32, len(bvs))
	for k, v := range bvs {
		regionInclusionBits[v] = uint32(k)
	}
	// Add bit vectors for increasingly large distances until a fixed point is reached.
	regionInclusionNext := []uint8{}
	// regionInclusionBits may grow inside the loop; the newly appended vectors
	// are visited by later iterations.
	for i := 0; i < len(regionInclusionBits); i++ {
		bits := regionInclusionBits[i]
		next := bits
		// Note: the inner i shadows the outer one and iterates over group ids.
		for i := uint(0); i < uint(len(b.groups)); i++ {
			if bits&(1<<i) != 0 {
				next |= regionInclusionBits[i]
			}
		}
		if _, ok := bvs[next]; !ok {
			bvs[next] = index(len(bvs))
			regionInclusionBits = append(regionInclusionBits, next)
		}
		regionInclusionNext = append(regionInclusionNext, uint8(bvs[next]))
	}
	b.writeSlice("regionInclusionBits", regionInclusionBits)
	b.writeSlice("regionInclusionNext", regionInclusionNext)
}
  1556  
// parentRel is one entry of the generated parents table. writeParents creates
// one value for each CLDR parentLocale element whose parent is not "root".
type parentRel struct {
	lang       uint16   // language index of the parent locale
	script     uint8    // script index of the parent; left 0 if the parent names no script
	maxScript  uint8    // script of the parent, or Latn if the parent names no script
	toRegion   uint16   // region index of the parent locale
	fromRegion []uint16 // region indices of the locales that have this parent
}
  1564  
  1565  func (b *builder) writeParents() {
  1566  	b.writeType(parentRel{})
  1567  
  1568  	parents := []parentRel{}
  1569  
  1570  	// Construct parent overrides.
  1571  	n := 0
  1572  	for _, p := range b.data.Supplemental().ParentLocales.ParentLocale {
  1573  		// Skipping non-standard scripts to root is implemented using addTags.
  1574  		if p.Parent == "root" {
  1575  			continue
  1576  		}
  1577  
  1578  		sub := strings.Split(p.Parent, "_")
  1579  		parent := parentRel{lang: b.langIndex(sub[0])}
  1580  		if len(sub) == 2 {
  1581  			// TODO: check that all undefined scripts are indeed Latn in these
  1582  			// cases.
  1583  			parent.maxScript = uint8(b.script.index("Latn"))
  1584  			parent.toRegion = uint16(b.region.index(sub[1]))
  1585  		} else {
  1586  			parent.script = uint8(b.script.index(sub[1]))
  1587  			parent.maxScript = parent.script
  1588  			parent.toRegion = uint16(b.region.index(sub[2]))
  1589  		}
  1590  		for _, c := range strings.Split(p.Locales, " ") {
  1591  			region := b.region.index(c[strings.LastIndex(c, "_")+1:])
  1592  			parent.fromRegion = append(parent.fromRegion, uint16(region))
  1593  		}
  1594  		parents = append(parents, parent)
  1595  		n += len(parent.fromRegion)
  1596  	}
  1597  	b.writeSliceAddSize("parents", n*2, parents)
  1598  }
  1599  
  1600  func main() {
  1601  	gen.Init()
  1602  
  1603  	gen.Repackage("gen_common.go", "common.go", "language")
  1604  
  1605  	w := gen.NewCodeWriter()
  1606  	defer w.WriteGoFile("tables.go", "language")
  1607  
  1608  	fmt.Fprintln(w, `import "golang.org/x/text/internal/tag"`)
  1609  
  1610  	b := newBuilder(w)
  1611  	gen.WriteCLDRVersion(w)
  1612  
  1613  	b.parseIndices()
  1614  	b.writeType(fromTo{})
  1615  	b.writeLanguage()
  1616  	b.writeScript()
  1617  	b.writeRegion()
  1618  	b.writeVariant()
  1619  	// TODO: b.writeLocale()
  1620  	b.computeRegionGroups()
  1621  	b.writeLikelyData()
  1622  	b.writeMatchData()
  1623  	b.writeRegionInclusionData()
  1624  	b.writeParents()
  1625  }