github.com/go-enjin/golang-org-x-text@v0.12.1-enjin.2/internal/language/gen.go

github.com/go-enjin/golang-org-x-text@v0.12.1-enjin.2/internal/language/gen.go (about)

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build ignore
     6  // +build ignore
     7  
     8  // Language tag table generator.
     9  // Data read from the web.
    10  
    11  package main
    12  
    13  import (
    14  	"bufio"
    15  	"flag"
    16  	"fmt"
    17  	"io"
    18  	"log"
    19  	"math"
    20  	"reflect"
    21  	"regexp"
    22  	"sort"
    23  	"strconv"
    24  	"strings"
    25  
    26  	"github.com/go-enjin/golang-org-x-text/internal/gen"
    27  	"github.com/go-enjin/golang-org-x-text/internal/tag"
    28  	"github.com/go-enjin/golang-org-x-text/unicode/cldr"
    29  )
    30  
    31  var (
    32  	test = flag.Bool("test",
    33  		false,
    34  		"test existing tables; can be used to compare web data with package data.")
    35  	outputFile = flag.String("output",
    36  		"tables.go",
    37  		"output file for generated tables")
    38  )
    39  
    40  var comment = []string{
    41  	`
    42  lang holds an alphabetically sorted list of ISO-639 language identifiers.
    43  All entries are 4 bytes. The index of the identifier (divided by 4) is the language tag.
    44  For 2-byte language identifiers, the two successive bytes have the following meaning:
    45      - if the first letter of the 2- and 3-letter ISO codes are the same:
    46        the second and third letter of the 3-letter ISO code.
    47      - otherwise: a 0 and a by 2 bits right-shifted index into altLangISO3.
    48  For 3-byte language identifiers the 4th byte is 0.`,
    49  	`
    50  langNoIndex is a bit vector of all 3-letter language codes that are not used as an index
    51  in lookup tables. The language ids for these language codes are derived directly
    52  from the letters and are not consecutive.`,
    53  	`
    54  altLangISO3 holds an alphabetically sorted list of 3-letter language code alternatives
    55  to 2-letter language codes that cannot be derived using the method described above.
    56  Each 3-letter code is followed by its 1-byte langID.`,
    57  	`
    58  altLangIndex is used to convert indexes in altLangISO3 to langIDs.`,
    59  	`
    60  AliasMap maps langIDs to their suggested replacements.`,
    61  	`
    62  script is an alphabetically sorted list of ISO 15924 codes. The index
    63  of the script in the string, divided by 4, is the internal scriptID.`,
    64  	`
    65  isoRegionOffset needs to be added to the index of regionISO to obtain the regionID
    66  for 2-letter ISO codes. (The first isoRegionOffset regionIDs are reserved for
    67  the UN.M49 codes used for groups.)`,
    68  	`
    69  regionISO holds a list of alphabetically sorted 2-letter ISO region codes.
    70  Each 2-letter codes is followed by two bytes with the following meaning:
    71      - [A-Z}{2}: the first letter of the 2-letter code plus these two
    72                  letters form the 3-letter ISO code.
    73      - 0, n:     index into altRegionISO3.`,
    74  	`
    75  regionTypes defines the status of a region for various standards.`,
    76  	`
    77  m49 maps regionIDs to UN.M49 codes. The first isoRegionOffset entries are
    78  codes indicating collections of regions.`,
    79  	`
    80  m49Index gives indexes into fromM49 based on the three most significant bits
    81  of a 10-bit UN.M49 code. To search an UN.M49 code in fromM49, search in
    82     fromM49[m49Index[msb39(code)]:m49Index[msb3(code)+1]]
    83  for an entry where the first 7 bits match the 7 lsb of the UN.M49 code.
    84  The region code is stored in the 9 lsb of the indexed value.`,
    85  	`
    86  fromM49 contains entries to map UN.M49 codes to regions. See m49Index for details.`,
    87  	`
    88  altRegionISO3 holds a list of 3-letter region codes that cannot be
    89  mapped to 2-letter codes using the default algorithm. This is a short list.`,
    90  	`
    91  altRegionIDs holds a list of regionIDs the positions of which match those
    92  of the 3-letter ISO codes in altRegionISO3.`,
    93  	`
    94  variantNumSpecialized is the number of specialized variants in variants.`,
    95  	`
    96  suppressScript is an index from langID to the dominant script for that language,
    97  if it exists.  If a script is given, it should be suppressed from the language tag.`,
    98  	`
    99  likelyLang is a lookup table, indexed by langID, for the most likely
   100  scripts and regions given incomplete information. If more entries exist for a
   101  given language, region and script are the index and size respectively
   102  of the list in likelyLangList.`,
   103  	`
   104  likelyLangList holds lists info associated with likelyLang.`,
   105  	`
   106  likelyRegion is a lookup table, indexed by regionID, for the most likely
   107  languages and scripts given incomplete information. If more entries exist
   108  for a given regionID, lang and script are the index and size respectively
   109  of the list in likelyRegionList.
   110  TODO: exclude containers and user-definable regions from the list.`,
   111  	`
   112  likelyRegionList holds lists info associated with likelyRegion.`,
   113  	`
   114  likelyScript is a lookup table, indexed by scriptID, for the most likely
   115  languages and regions given a script.`,
   116  	`
   117  nRegionGroups is the number of region groups.`,
   118  	`
   119  regionInclusion maps region identifiers to sets of regions in regionInclusionBits,
   120  where each set holds all groupings that are directly connected in a region
   121  containment graph.`,
   122  	`
   123  regionInclusionBits is an array of bit vectors where every vector represents
   124  a set of region groupings.  These sets are used to compute the distance
   125  between two regions for the purpose of language matching.`,
   126  	`
   127  regionInclusionNext marks, for each entry in regionInclusionBits, the set of
   128  all groups that are reachable from the groups set in the respective entry.`,
   129  }
   130  
   131  // TODO: consider changing some of these structures to tries. This can reduce
   132  // memory, but may increase the need for memory allocations. This could be
   133  // mitigated if we can piggyback on language tags for common cases.
   134  
   135  func failOnError(e error) {
   136  	if e != nil {
   137  		log.Panic(e)
   138  	}
   139  }
   140  
   141  type setType int
   142  
   143  const (
   144  	Indexed setType = 1 + iota // all elements must be of same size
   145  	Linear
   146  )
   147  
   148  type stringSet struct {
   149  	s              []string
   150  	sorted, frozen bool
   151  
   152  	// We often need to update values after the creation of an index is completed.
   153  	// We include a convenience map for keeping track of this.
   154  	update map[string]string
   155  	typ    setType // used for checking.
   156  }
   157  
   158  func (ss *stringSet) clone() stringSet {
   159  	c := *ss
   160  	c.s = append([]string(nil), c.s...)
   161  	return c
   162  }
   163  
   164  func (ss *stringSet) setType(t setType) {
   165  	if ss.typ != t && ss.typ != 0 {
   166  		log.Panicf("type %d cannot be assigned as it was already %d", t, ss.typ)
   167  	}
   168  }
   169  
   170  // parse parses a whitespace-separated string and initializes ss with its
   171  // components.
   172  func (ss *stringSet) parse(s string) {
   173  	scan := bufio.NewScanner(strings.NewReader(s))
   174  	scan.Split(bufio.ScanWords)
   175  	for scan.Scan() {
   176  		ss.add(scan.Text())
   177  	}
   178  }
   179  
   180  func (ss *stringSet) assertChangeable() {
   181  	if ss.frozen {
   182  		log.Panic("attempt to modify a frozen stringSet")
   183  	}
   184  }
   185  
   186  func (ss *stringSet) add(s string) {
   187  	ss.assertChangeable()
   188  	ss.s = append(ss.s, s)
   189  	ss.sorted = ss.frozen
   190  }
   191  
   192  func (ss *stringSet) freeze() {
   193  	ss.compact()
   194  	ss.frozen = true
   195  }
   196  
   197  func (ss *stringSet) compact() {
   198  	if ss.sorted {
   199  		return
   200  	}
   201  	a := ss.s
   202  	sort.Strings(a)
   203  	k := 0
   204  	for i := 1; i < len(a); i++ {
   205  		if a[k] != a[i] {
   206  			a[k+1] = a[i]
   207  			k++
   208  		}
   209  	}
   210  	ss.s = a[:k+1]
   211  	ss.sorted = ss.frozen
   212  }
   213  
   214  type funcSorter struct {
   215  	fn func(a, b string) bool
   216  	sort.StringSlice
   217  }
   218  
   219  func (s funcSorter) Less(i, j int) bool {
   220  	return s.fn(s.StringSlice[i], s.StringSlice[j])
   221  }
   222  
   223  func (ss *stringSet) sortFunc(f func(a, b string) bool) {
   224  	ss.compact()
   225  	sort.Sort(funcSorter{f, sort.StringSlice(ss.s)})
   226  }
   227  
   228  func (ss *stringSet) remove(s string) {
   229  	ss.assertChangeable()
   230  	if i, ok := ss.find(s); ok {
   231  		copy(ss.s[i:], ss.s[i+1:])
   232  		ss.s = ss.s[:len(ss.s)-1]
   233  	}
   234  }
   235  
   236  func (ss *stringSet) replace(ol, nu string) {
   237  	ss.s[ss.index(ol)] = nu
   238  	ss.sorted = ss.frozen
   239  }
   240  
   241  func (ss *stringSet) index(s string) int {
   242  	ss.setType(Indexed)
   243  	i, ok := ss.find(s)
   244  	if !ok {
   245  		if i < len(ss.s) {
   246  			log.Panicf("find: item %q is not in list. Closest match is %q.", s, ss.s[i])
   247  		}
   248  		log.Panicf("find: item %q is not in list", s)
   249  
   250  	}
   251  	return i
   252  }
   253  
   254  func (ss *stringSet) find(s string) (int, bool) {
   255  	ss.compact()
   256  	i := sort.SearchStrings(ss.s, s)
   257  	return i, i != len(ss.s) && ss.s[i] == s
   258  }
   259  
   260  func (ss *stringSet) slice() []string {
   261  	ss.compact()
   262  	return ss.s
   263  }
   264  
   265  func (ss *stringSet) updateLater(v, key string) {
   266  	if ss.update == nil {
   267  		ss.update = map[string]string{}
   268  	}
   269  	ss.update[v] = key
   270  }
   271  
   272  // join joins the string and ensures that all entries are of the same length.
   273  func (ss *stringSet) join() string {
   274  	ss.setType(Indexed)
   275  	n := len(ss.s[0])
   276  	for _, s := range ss.s {
   277  		if len(s) != n {
   278  			log.Panicf("join: not all entries are of the same length: %q", s)
   279  		}
   280  	}
   281  	ss.s = append(ss.s, strings.Repeat("\xff", n))
   282  	return strings.Join(ss.s, "")
   283  }
   284  
// ianaEntry holds information for an entry in the IANA Language Subtag Repository.
// All types use the same entry.
// See http://tools.ietf.org/html/bcp47#section-5.1 for a description of the various
// fields.
//
// The fields are filled in by builder.parseRegistry from the registry field
// named in the comment next to each of them.
type ianaEntry struct {
	typ            string   // "Type:"
	description    []string // "Description:"; one element per occurrence
	scope          string   // "Scope:"
	added          string   // "Added:"
	preferred      string   // "Preferred-Value:"
	deprecated     string   // "Deprecated:"
	suppressScript string   // "Suppress-Script:"
	macro          string   // "Macrolanguage:"
	prefix         []string // "Prefix:"; one element per occurrence
}
   300  
// builder carries the decoded input data, the output writers and the indices
// that are built up while the tables are generated.
type builder struct {
	w    *gen.CodeWriter
	hw   io.Writer // MultiWriter for w and w.Hash
	data *cldr.CLDR
	supp *cldr.SupplementalData

	// indices
	locale      stringSet // common locales
	lang        stringSet // canonical language ids (2 or 3 letter ISO codes) with data
	langNoIndex stringSet // 3-letter ISO codes with no associated data
	script      stringSet // 4-letter ISO codes
	region      stringSet // 2-letter ISO or 3-digit UN M49 codes
	variant     stringSet // 4-8-alphanumeric variant code.

	// Region codes that are groups with their corresponding group IDs.
	// The key is the position of the region in the region index.
	groups map[int]index

	// langInfo
	// registry maps a subtag or tag of the IANA registry to its entry.
	registry map[string]*ianaEntry
}

// index is the type of the group IDs stored in builder.groups.
type index uint
   323  
   324  func newBuilder(w *gen.CodeWriter) *builder {
   325  	r := gen.OpenCLDRCoreZip()
   326  	defer r.Close()
   327  	d := &cldr.Decoder{}
   328  	data, err := d.DecodeZip(r)
   329  	failOnError(err)
   330  	b := builder{
   331  		w:    w,
   332  		hw:   io.MultiWriter(w, w.Hash),
   333  		data: data,
   334  		supp: data.Supplemental(),
   335  	}
   336  	b.parseRegistry()
   337  	return &b
   338  }
   339  
// parseRegistry reads the IANA language subtag registry and fills b.registry
// with one ianaEntry per subtag or tag.
//
// The file is consumed as a stream of whitespace-separated words. Each loop
// iteration treats the current word as a field name and the following word as
// its value. It panics if the scanner reports an error.
func (b *builder) parseRegistry() {
	r := gen.OpenIANAFile("assignments/language-subtag-registry")
	defer r.Close()
	b.registry = make(map[string]*ianaEntry)

	scan := bufio.NewScanner(r)
	scan.Split(bufio.ScanWords)
	// record is the entry currently being filled in; it is replaced whenever
	// a "Type:" field is seen.
	// NOTE(review): record is nil until the first "Type:" field, so this
	// assumes every record starts with "Type:" — confirm against the registry.
	var record *ianaEntry
	for more := scan.Scan(); more; {
		key := scan.Text()
		more = scan.Scan()
		value := scan.Text()
		switch key {
		case "Type:":
			record = &ianaEntry{typ: value}
		case "Subtag:", "Tag:":
			// A value of the form "x..y" denotes a range; every code from x
			// up to and including y is registered with the same record.
			if s := strings.SplitN(value, "..", 2); len(s) > 1 {
				for a := s[0]; a <= s[1]; a = inc(a) {
					b.addToRegistry(a, record)
				}
			} else {
				b.addToRegistry(value, record)
			}
		case "Suppress-Script:":
			record.suppressScript = value
		case "Added:":
			record.added = value
		case "Deprecated:":
			record.deprecated = value
		case "Macrolanguage:":
			record.macro = value
		case "Preferred-Value:":
			record.preferred = value
		case "Prefix:":
			record.prefix = append(record.prefix, value)
		case "Scope:":
			record.scope = value
		case "Description:":
			// A description spans several words. Collect words until one
			// starts with '%' or ends with ':'; that word is left as the
			// current token for the next iteration.
			buf := []byte(value)
			for more = scan.Scan(); more; more = scan.Scan() {
				// b shadows the receiver inside this loop.
				b := scan.Bytes()
				if b[0] == '%' || b[len(b)-1] == ':' {
					break
				}
				buf = append(buf, ' ')
				buf = append(buf, b...)
			}
			record.description = append(record.description, string(buf))
			// The scanner is already positioned on the next key.
			continue
		default:
			// Not a field we store. The word read as value is still the
			// current token and is examined as a key next.
			continue
		}
		// Advance to the next key.
		more = scan.Scan()
	}
	if scan.Err() != nil {
		log.Panic(scan.Err())
	}
}
   398  
   399  func (b *builder) addToRegistry(key string, entry *ianaEntry) {
   400  	if info, ok := b.registry[key]; ok {
   401  		if info.typ != "language" || entry.typ != "extlang" {
   402  			log.Fatalf("parseRegistry: tag %q already exists", key)
   403  		}
   404  	} else {
   405  		b.registry[key] = entry
   406  	}
   407  }
   408  
   409  var commentIndex = make(map[string]string)
   410  
   411  func init() {
   412  	for _, s := range comment {
   413  		key := strings.TrimSpace(strings.SplitN(s, " ", 2)[0])
   414  		commentIndex[key] = s
   415  	}
   416  }
   417  
   418  func (b *builder) comment(name string) {
   419  	if s := commentIndex[name]; len(s) > 0 {
   420  		b.w.WriteComment(s)
   421  	} else {
   422  		fmt.Fprintln(b.w)
   423  	}
   424  }
   425  
   426  func (b *builder) pf(f string, x ...interface{}) {
   427  	fmt.Fprintf(b.hw, f, x...)
   428  	fmt.Fprint(b.hw, "\n")
   429  }
   430  
   431  func (b *builder) p(x ...interface{}) {
   432  	fmt.Fprintln(b.hw, x...)
   433  }
   434  
   435  func (b *builder) addSize(s int) {
   436  	b.w.Size += s
   437  	b.pf("// Size: %d bytes", s)
   438  }
   439  
   440  func (b *builder) writeConst(name string, x interface{}) {
   441  	b.comment(name)
   442  	b.w.WriteConst(name, x)
   443  }
   444  
   445  // writeConsts computes f(v) for all v in values and writes the results
   446  // as constants named _v to a single constant block.
   447  func (b *builder) writeConsts(f func(string) int, values ...string) {
   448  	b.pf("const (")
   449  	for _, v := range values {
   450  		b.pf("\t_%s = %v", v, f(v))
   451  	}
   452  	b.pf(")")
   453  }
   454  
   455  // writeType writes the type of the given value, which must be a struct.
   456  func (b *builder) writeType(value interface{}) {
   457  	b.comment(reflect.TypeOf(value).Name())
   458  	b.w.WriteType(value)
   459  }
   460  
   461  func (b *builder) writeSlice(name string, ss interface{}) {
   462  	b.writeSliceAddSize(name, 0, ss)
   463  }
   464  
   465  func (b *builder) writeSliceAddSize(name string, extraSize int, ss interface{}) {
   466  	b.comment(name)
   467  	b.w.Size += extraSize
   468  	v := reflect.ValueOf(ss)
   469  	t := v.Type().Elem()
   470  	b.pf("// Size: %d bytes, %d elements", v.Len()*int(t.Size())+extraSize, v.Len())
   471  
   472  	fmt.Fprintf(b.w, "var %s = ", name)
   473  	b.w.WriteArray(ss)
   474  	b.p()
   475  }
   476  
// FromTo is a pair of uint16 values. writeSortedMap emits a slice of these,
// with From holding the index of an element and To the index of the value
// registered for it in the set's update map.
type FromTo struct {
	From, To uint16
}
   480  
   481  func (b *builder) writeSortedMap(name string, ss *stringSet, index func(s string) uint16) {
   482  	ss.sortFunc(func(a, b string) bool {
   483  		return index(a) < index(b)
   484  	})
   485  	m := []FromTo{}
   486  	for _, s := range ss.s {
   487  		m = append(m, FromTo{index(s), index(ss.update[s])})
   488  	}
   489  	b.writeSlice(name, m)
   490  }
   491  
   492  const base = 'z' - 'a' + 1
   493  
   494  func strToInt(s string) uint {
   495  	v := uint(0)
   496  	for i := 0; i < len(s); i++ {
   497  		v *= base
   498  		v += uint(s[i] - 'a')
   499  	}
   500  	return v
   501  }
   502  
   503  // converts the given integer to the original ASCII string passed to strToInt.
   504  // len(s) must match the number of characters obtained.
   505  func intToStr(v uint, s []byte) {
   506  	for i := len(s) - 1; i >= 0; i-- {
   507  		s[i] = byte(v%base) + 'a'
   508  		v /= base
   509  	}
   510  }
   511  
   512  func (b *builder) writeBitVector(name string, ss []string) {
   513  	vec := make([]uint8, int(math.Ceil(math.Pow(base, float64(len(ss[0])))/8)))
   514  	for _, s := range ss {
   515  		v := strToInt(s)
   516  		vec[v/8] |= 1 << (v % 8)
   517  	}
   518  	b.writeSlice(name, vec)
   519  }
   520  
   521  // TODO: convert this type into a list or two-stage trie.
   522  func (b *builder) writeMapFunc(name string, m map[string]string, f func(string) uint16) {
   523  	b.comment(name)
   524  	v := reflect.ValueOf(m)
   525  	sz := v.Len() * (2 + int(v.Type().Key().Size()))
   526  	for _, k := range m {
   527  		sz += len(k)
   528  	}
   529  	b.addSize(sz)
   530  	keys := []string{}
   531  	b.pf(`var %s = map[string]uint16{`, name)
   532  	for k := range m {
   533  		keys = append(keys, k)
   534  	}
   535  	sort.Strings(keys)
   536  	for _, k := range keys {
   537  		b.pf("\t%q: %v,", k, f(m[k]))
   538  	}
   539  	b.p("}")
   540  }
   541  
   542  func (b *builder) writeMap(name string, m interface{}) {
   543  	b.comment(name)
   544  	v := reflect.ValueOf(m)
   545  	sz := v.Len() * (2 + int(v.Type().Key().Size()) + int(v.Type().Elem().Size()))
   546  	b.addSize(sz)
   547  	f := strings.FieldsFunc(fmt.Sprintf("%#v", m), func(r rune) bool {
   548  		return strings.IndexRune("{}, ", r) != -1
   549  	})
   550  	sort.Strings(f[1:])
   551  	b.pf(`var %s = %s{`, name, f[0])
   552  	for _, kv := range f[1:] {
   553  		b.pf("\t%s,", kv)
   554  	}
   555  	b.p("}")
   556  }
   557  
   558  func (b *builder) langIndex(s string) uint16 {
   559  	if s == "und" {
   560  		return 0
   561  	}
   562  	if i, ok := b.lang.find(s); ok {
   563  		return uint16(i)
   564  	}
   565  	return uint16(strToInt(s)) + uint16(len(b.lang.s))
   566  }
   567  
   568  // inc advances the string to its lexicographical successor.
   569  func inc(s string) string {
   570  	const maxTagLength = 4
   571  	var buf [maxTagLength]byte
   572  	intToStr(strToInt(strings.ToLower(s))+1, buf[:len(s)])
   573  	for i := 0; i < len(s); i++ {
   574  		if s[i] <= 'Z' {
   575  			buf[i] -= 'a' - 'A'
   576  		}
   577  	}
   578  	return string(buf[:len(s)])
   579  }
   580  
   581  func (b *builder) parseIndices() {
   582  	meta := b.supp.Metadata
   583  
   584  	for k, v := range b.registry {
   585  		var ss *stringSet
   586  		switch v.typ {
   587  		case "language":
   588  			if len(k) == 2 || v.suppressScript != "" || v.scope == "special" {
   589  				b.lang.add(k)
   590  				continue
   591  			} else {
   592  				ss = &b.langNoIndex
   593  			}
   594  		case "region":
   595  			ss = &b.region
   596  		case "script":
   597  			ss = &b.script
   598  		case "variant":
   599  			ss = &b.variant
   600  		default:
   601  			continue
   602  		}
   603  		ss.add(k)
   604  	}
   605  	// Include any language for which there is data.
   606  	for _, lang := range b.data.Locales() {
   607  		if x := b.data.RawLDML(lang); false ||
   608  			x.LocaleDisplayNames != nil ||
   609  			x.Characters != nil ||
   610  			x.Delimiters != nil ||
   611  			x.Measurement != nil ||
   612  			x.Dates != nil ||
   613  			x.Numbers != nil ||
   614  			x.Units != nil ||
   615  			x.ListPatterns != nil ||
   616  			x.Collations != nil ||
   617  			x.Segmentations != nil ||
   618  			x.Rbnf != nil ||
   619  			x.Annotations != nil ||
   620  			x.Metadata != nil {
   621  
   622  			from := strings.Split(lang, "_")
   623  			if lang := from[0]; lang != "root" {
   624  				b.lang.add(lang)
   625  			}
   626  		}
   627  	}
   628  	// Include locales for plural rules, which uses a different structure.
   629  	for _, plurals := range b.data.Supplemental().Plurals {
   630  		for _, rules := range plurals.PluralRules {
   631  			for _, lang := range strings.Split(rules.Locales, " ") {
   632  				if lang = strings.Split(lang, "_")[0]; lang != "root" {
   633  					b.lang.add(lang)
   634  				}
   635  			}
   636  		}
   637  	}
   638  	// Include languages in likely subtags.
   639  	for _, m := range b.supp.LikelySubtags.LikelySubtag {
   640  		from := strings.Split(m.From, "_")
   641  		b.lang.add(from[0])
   642  	}
   643  	// Include ISO-639 alpha-3 bibliographic entries.
   644  	for _, a := range meta.Alias.LanguageAlias {
   645  		if a.Reason == "bibliographic" {
   646  			b.langNoIndex.add(a.Type)
   647  		}
   648  	}
   649  	// Include regions in territoryAlias (not all are in the IANA registry!)
   650  	for _, reg := range b.supp.Metadata.Alias.TerritoryAlias {
   651  		if len(reg.Type) == 2 {
   652  			b.region.add(reg.Type)
   653  		}
   654  	}
   655  
   656  	for _, s := range b.lang.s {
   657  		if len(s) == 3 {
   658  			b.langNoIndex.remove(s)
   659  		}
   660  	}
   661  	b.writeConst("NumLanguages", len(b.lang.slice())+len(b.langNoIndex.slice()))
   662  	b.writeConst("NumScripts", len(b.script.slice()))
   663  	b.writeConst("NumRegions", len(b.region.slice()))
   664  
   665  	// Add dummy codes at the start of each list to represent "unspecified".
   666  	b.lang.add("---")
   667  	b.script.add("----")
   668  	b.region.add("---")
   669  
   670  	// common locales
   671  	b.locale.parse(meta.DefaultContent.Locales)
   672  }
   673  
// TODO: region inclusion data will probably not be used in future matchers.
   675  
   676  func (b *builder) computeRegionGroups() {
   677  	b.groups = make(map[int]index)
   678  
   679  	// Create group indices.
   680  	for i := 1; b.region.s[i][0] < 'A'; i++ { // Base M49 indices on regionID.
   681  		b.groups[i] = index(len(b.groups))
   682  	}
   683  	for _, g := range b.supp.TerritoryContainment.Group {
   684  		// Skip UN and EURO zone as they are flattening the containment
   685  		// relationship.
   686  		if g.Type == "EZ" || g.Type == "UN" {
   687  			continue
   688  		}
   689  		group := b.region.index(g.Type)
   690  		if _, ok := b.groups[group]; !ok {
   691  			b.groups[group] = index(len(b.groups))
   692  		}
   693  	}
   694  	if len(b.groups) > 64 {
   695  		log.Fatalf("only 64 groups supported, found %d", len(b.groups))
   696  	}
   697  	b.writeConst("nRegionGroups", len(b.groups))
   698  }
   699  
// langConsts lists the language codes for which writeLanguage emits a
// constant named _<code> holding the value of b.langIndex(code).
var langConsts = []string{
	"af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
	"et", "fa", "fi", "fil", "fr", "gu", "he", "hi", "hr", "hu", "hy", "id", "is",
	"it", "ja", "ka", "kk", "km", "kn", "ko", "ky", "lo", "lt", "lv", "mk", "ml",
	"mn", "mo", "mr", "ms", "mul", "my", "nb", "ne", "nl", "no", "pa", "pl", "pt",
	"ro", "ru", "sh", "si", "sk", "sl", "sq", "sr", "sv", "sw", "ta", "te", "th",
	"tl", "tn", "tr", "uk", "ur", "uz", "vi", "zh", "zu",

	// constants for grandfathered tags (if not already defined)
	"jbo", "ami", "bnn", "hak", "tlh", "lb", "nv", "pwn", "tao", "tay", "tsu",
	"nn", "sfb", "vgt", "sgg", "cmn", "nan", "hsn",
}
   712  
// writeLanguage generates all tables needed for language canonicalization.
//
// It emits, in this order: the nonCanonicalUnd constant, the language
// constants of langConsts, langPrivateStart and langPrivateEnd, the lang
// table, langNoIndexOffset, the langNoIndex bit vector, altLangISO3,
// altLangIndex, AliasMap and AliasTypes.
func (b *builder) writeLanguage() {
	meta := b.supp.Metadata

	b.writeConst("nonCanonicalUnd", b.lang.index("und"))
	b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...)
	b.writeConst("langPrivateStart", b.langIndex("qaa"))
	b.writeConst("langPrivateEnd", b.langIndex("qtz"))

	// Get language codes that need to be mapped (overlong 3-letter codes,
	// deprecated 2-letter codes, legacy and grandfathered tags.)
	// langAliasMap.update maps each alias to its replacement; aliasTypeMap
	// records the kind of alias.
	langAliasMap := stringSet{}
	aliasTypeMap := map[string]AliasType{}

	// altLangISO3 get the alternative ISO3 names that need to be mapped.
	altLangISO3 := stringSet{}
	// Add dummy start to avoid the use of index 0.
	altLangISO3.add("---")
	altLangISO3.updateLater("---", "aa")

	// Work on a copy, as the entries are extended in place further down.
	lang := b.lang.clone()
	for _, a := range meta.Alias.LanguageAlias {
		// a is a copy of the slice element, so this does not modify the
		// CLDR data.
		if a.Replacement == "" {
			a.Replacement = "und"
		}
		// TODO: support mapping to tags
		// Only the part of the replacement before the first "_" is used.
		repl := strings.SplitN(a.Replacement, "_", 2)[0]
		if a.Reason == "overlong" {
			// Remember the 3-letter form of a 2-letter code.
			if len(a.Replacement) == 2 && len(a.Type) == 3 {
				lang.updateLater(a.Replacement, a.Type)
			}
		} else if len(a.Type) <= 3 {
			switch a.Reason {
			case "macrolanguage":
				aliasTypeMap[a.Type] = Macro
			case "deprecated":
				// handled elsewhere
				continue
			case "bibliographic", "legacy":
				if a.Type == "no" {
					continue
				}
				aliasTypeMap[a.Type] = Legacy
			default:
				log.Fatalf("new %s alias: %s", a.Reason, a.Type)
			}
			langAliasMap.add(a.Type)
			langAliasMap.updateLater(a.Type, repl)
		}
	}
	// Manually add the mapping of "nb" (Norwegian) to its macro language.
	// This can be removed if CLDR adopts this change.
	langAliasMap.add("nb")
	langAliasMap.updateLater("nb", "no")
	aliasTypeMap["nb"] = Macro

	for k, v := range b.registry {
		// Also add deprecated values for 3-letter ISO codes, which CLDR omits.
		if v.typ == "language" && v.deprecated != "" && v.preferred != "" {
			langAliasMap.add(k)
			langAliasMap.updateLater(k, v.preferred)
			aliasTypeMap[k] = Deprecated
		}
	}
	// Fix CLDR mappings.
	lang.updateLater("tl", "tgl")
	lang.updateLater("sh", "hbs")
	lang.updateLater("mo", "mol")
	lang.updateLater("no", "nor")
	lang.updateLater("tw", "twi")
	lang.updateLater("nb", "nob")
	lang.updateLater("ak", "aka")
	lang.updateLater("bh", "bih")

	// Ensure that each 2-letter code is matched with a 3-letter code.
	// The first element of lang.s is skipped.
	for _, v := range lang.s[1:] {
		s, ok := lang.update[v]
		if !ok {
			// No direct 3-letter form: fall back to the one registered for
			// the replacement of v, if any.
			if s, ok = lang.update[langAliasMap.update[v]]; !ok {
				continue
			}
			lang.update[v] = s
		}
		// A 3-letter form with a different first letter cannot be derived
		// from the 2-letter code and goes into altLangISO3.
		if v[0] != s[0] {
			altLangISO3.add(s)
			altLangISO3.updateLater(s, v)
		}
	}

	// Complete canonicalized language tags.
	lang.freeze()
	for i, v := range lang.s {
		// We can avoid these manual entries by using the IANA registry directly.
		// Seems easier to update the list manually, as changes are rare.
		// The panic in this loop will trigger if we miss an entry.
		add := ""
		if s, ok := lang.update[v]; ok {
			if s[0] == v[0] {
				// Same first letter: append the last two letters of the
				// 3-letter form.
				add = s[1:]
			} else {
				// Otherwise append a 0 byte and the index into altLangISO3.
				add = string([]byte{0, byte(altLangISO3.index(s))})
			}
		} else if len(v) == 3 {
			// 3-letter codes are padded with a 0 byte.
			add = "\x00"
		} else {
			log.Panicf("no data for long form of %q", v)
		}
		lang.s[i] += add
	}
	b.writeConst("lang", tag.Index(lang.join()))

	b.writeConst("langNoIndexOffset", len(b.lang.s))

	// space of all valid 3-letter language identifiers.
	b.writeBitVector("langNoIndex", b.langNoIndex.slice())

	altLangIndex := []uint16{}
	for i, s := range altLangISO3.slice() {
		// Append the position of the entry in altLangIndex as a single byte.
		// s still holds the entry without that byte.
		altLangISO3.s[i] += string([]byte{byte(len(altLangIndex))})
		if i > 0 {
			idx := b.lang.index(altLangISO3.update[s])
			altLangIndex = append(altLangIndex, uint16(idx))
		}
	}
	b.writeConst("altLangISO3", tag.Index(altLangISO3.join()))
	b.writeSlice("altLangIndex", altLangIndex)

	b.writeSortedMap("AliasMap", &langAliasMap, b.langIndex)
	// AliasTypes uses the element order left behind by writeSortedMap.
	types := make([]AliasType, len(langAliasMap.s))
	for i, s := range langAliasMap.s {
		types[i] = aliasTypeMap[s]
	}
	b.writeSlice("AliasTypes", types)
}
   847  
// scriptConsts lists the script codes for which writeScript emits a constant
// named _<code> holding the index of the code in the script index.
var scriptConsts = []string{
	"Latn", "Hani", "Hans", "Hant", "Qaaa", "Qaai", "Qabx", "Zinh", "Zyyy",
	"Zzzz",
}
   852  
   853  func (b *builder) writeScript() {
   854  	b.writeConsts(b.script.index, scriptConsts...)
   855  	b.writeConst("script", tag.Index(b.script.join()))
   856  
   857  	supp := make([]uint8, len(b.lang.slice()))
   858  	for i, v := range b.lang.slice()[1:] {
   859  		if sc := b.registry[v].suppressScript; sc != "" {
   860  			supp[i+1] = uint8(b.script.index(sc))
   861  		}
   862  	}
   863  	b.writeSlice("suppressScript", supp)
   864  
   865  	// There is only one deprecated script in CLDR. This value is hard-coded.
   866  	// We check here if the code must be updated.
   867  	for _, a := range b.supp.Metadata.Alias.ScriptAlias {
   868  		if a.Type != "Qaai" {
   869  			log.Panicf("unexpected deprecated stript %q", a.Type)
   870  		}
   871  	}
   872  }
   873  
   874  func parseM49(s string) int16 {
   875  	if len(s) == 0 {
   876  		return 0
   877  	}
   878  	v, err := strconv.ParseUint(s, 10, 10)
   879  	failOnError(err)
   880  	return int16(v)
   881  }
   882  
// regionConsts lists the region codes for which writeRegion emits a constant
// named _<code> holding the index of the code in the region index.
var regionConsts = []string{
	"001", "419", "BR", "CA", "ES", "GB", "MD", "PT", "UK", "US",
	"ZZ", "XA", "XC", "XK", // Unofficial tag for Kosovo.
}
   887  
// writeRegion generates the region tables. In order it writes: the region
// constants, isoRegionOffset, regionTypes, regionISO, altRegionISO3,
// altRegionIDs, regionOldMap, m49, m49Index and fromM49.
//
// Statement order matters: m49map and regionISO are mutated by several loops
// and each table is written at the point where its data is complete.
func (b *builder) writeRegion() {
	b.writeConsts(b.region.index, regionConsts...)

	// isoOffset is the index of "AA" in the region list. Entries below it
	// are treated further down as 3-digit codes, entries from it onward as
	// 2-letter codes.
	isoOffset := b.region.index("AA")
	// m49map maps a region index to its numeric code (0 when unknown).
	m49map := make([]int16, len(b.region.slice()))
	// fromM49map maps a numeric code back to a region index.
	fromM49map := make(map[int16]int)
	// altRegionISO3 concatenates the 3-letter codes that cannot be stored
	// inline in regionISO; altRegionIDs records the region index of each.
	altRegionISO3 := ""
	altRegionIDs := []uint16{}

	b.writeConst("isoRegionOffset", isoOffset)

	// 2-letter region lookup and mapping to numeric codes.
	// regionISO is a copy of the region set restricted to the entries from
	// isoOffset onward; indices into it are therefore relative to isoOffset.
	regionISO := b.region.clone()
	regionISO.s = regionISO.s[isoOffset:]
	regionISO.sorted = false

	// regionTypes holds one byte of flag bits per region index.
	regionTypes := make([]byte, len(b.region.s))

	// Is the region valid BCP 47?
	// Every 2-letter upper-case registry key is flagged bcp47Region. If one
	// of its descriptions contains "Private use" the entry is first reset to
	// iso3166UserAssigned (plain assignment, not OR).
	for s, e := range b.registry {
		if len(s) == 2 && s == strings.ToUpper(s) {
			i := b.region.index(s)
			for _, d := range e.description {
				if strings.Contains(d, "Private use") {
					regionTypes[i] = iso3166UserAssigned
				}
			}
			regionTypes[i] |= bcp47Region
		}
	}

	// Is the region a valid ccTLD?
	// The IANA root database listing is scanned for links to 2-letter
	// lower-case domains; each match sets the ccTLD bit of that region.
	r := gen.OpenIANAFile("domains/root/db")
	defer r.Close()

	buf, err := io.ReadAll(r)
	failOnError(err)
	re := regexp.MustCompile(`"/domains/root/db/([a-z]{2}).html"`)
	for _, m := range re.FindAllSubmatch(buf, -1) {
		i := b.region.index(strings.ToUpper(string(m[1])))
		regionTypes[i] |= ccTLD
	}

	b.writeSlice("regionTypes", regionTypes)

	// iso3Set remembers how each 3-letter code was stored: -1 means inline
	// in regionISO, a value >= 0 is its byte offset within altRegionISO3.
	iso3Set := make(map[string]int)
	// update extends the 2-letter regionISO entry for iso2 to 4 bytes:
	//   - if iso3 has not been seen and shares its first letter with iso2,
	//     the last two letters of iso3 are appended;
	//   - otherwise a 0 byte followed by an offset into altRegionISO3 is
	//     appended, reusing the offset if iso3 is already stored there and
	//     appending iso3 to altRegionISO3 if not.
	// The offset is narrowed to a single byte.
	update := func(iso2, iso3 string) {
		i := regionISO.index(iso2)
		if j, ok := iso3Set[iso3]; !ok && iso3[0] == iso2[0] {
			regionISO.s[i] += iso3[1:]
			iso3Set[iso3] = -1
		} else {
			if ok && j >= 0 {
				regionISO.s[i] += string([]byte{0, byte(j)})
			} else {
				iso3Set[iso3] = len(altRegionISO3)
				regionISO.s[i] += string([]byte{0, byte(len(altRegionISO3))})
				altRegionISO3 += iso3
				altRegionIDs = append(altRegionIDs, uint16(isoOffset+i))
			}
		}
	}
	// Record the numeric code of every territory code mapping, in both
	// directions. This loop must run before update is called, because it
	// looks up registry entries by the still 2-letter regionISO strings.
	for _, tc := range b.supp.CodeMappings.TerritoryCodes {
		i := regionISO.index(tc.Type) + isoOffset
		if d := m49map[i]; d != 0 {
			log.Panicf("%s found as a duplicate UN.M49 code of %03d", tc.Numeric, d)
		}
		m49 := parseM49(tc.Numeric)
		m49map[i] = m49
		if r := fromM49map[m49]; r == 0 {
			fromM49map[m49] = i
		} else if r != i {
			// Two regions share this numeric code. Replace the recorded
			// region only if it is deprecated and the new one is either not
			// deprecated or has a greater deprecated value (string compare).
			dep := b.registry[regionISO.s[r-isoOffset]].deprecated
			if t := b.registry[tc.Type]; t != nil && dep != "" && (t.deprecated == "" || t.deprecated > dep) {
				fromM49map[m49] = i
			}
		}
	}
	// Territory aliases from a 3-character code starting with a digit to a
	// 2-letter replacement fill in numeric codes that have no region yet.
	for _, ta := range b.supp.Metadata.Alias.TerritoryAlias {
		if len(ta.Type) == 3 && ta.Type[0] <= '9' && len(ta.Replacement) == 2 {
			from := parseM49(ta.Type)
			if r := fromM49map[from]; r == 0 {
				fromM49map[from] = regionISO.index(ta.Replacement) + isoOffset
			}
		}
	}
	// Attach the 3-letter code to each region that has one.
	for _, tc := range b.supp.CodeMappings.TerritoryCodes {
		if len(tc.Alpha3) == 3 {
			update(tc.Type, tc.Alpha3)
		}
	}
	// These entries are not included in territoryCodes. Mostly 3-letter variants
	// of deleted codes and an entry for QU.
	for _, m := range []struct{ iso2, iso3 string }{
		{"CT", "CTE"},
		{"DY", "DHY"},
		{"HV", "HVO"},
		{"JT", "JTN"},
		{"MI", "MID"},
		{"NH", "NHB"},
		{"NQ", "ATN"},
		{"PC", "PCI"},
		{"PU", "PUS"},
		{"PZ", "PCZ"},
		{"RH", "RHO"},
		{"VD", "VDR"},
		{"WK", "WAK"},
		// These three-letter codes are used for others as well.
		{"FQ", "ATF"},
	} {
		update(m.iso2, m.iso3)
	}
	// Pad every entry that did not end up 4 bytes long with two spaces.
	for i, s := range regionISO.s {
		if len(s) != 4 {
			regionISO.s[i] = s + "  "
		}
	}
	b.writeConst("regionISO", tag.Index(regionISO.join()))
	b.writeConst("altRegionISO3", altRegionISO3)
	b.writeSlice("altRegionIDs", altRegionIDs)

	// Create list of deprecated regions.
	// TODO: consider inserting SF -> FI. Not included by CLDR, but is the only
	// Transitionally-reserved mapping not included.
	regionOldMap := stringSet{}
	// Include regions in territoryAlias (not all are in the IANA registry!)
	for _, reg := range b.supp.Metadata.Alias.TerritoryAlias {
		if len(reg.Type) == 2 && reg.Reason == "deprecated" && len(reg.Replacement) == 2 {
			regionOldMap.add(reg.Type)
			regionOldMap.updateLater(reg.Type, reg.Replacement)
			// The second results of find are discarded here.
			i, _ := regionISO.find(reg.Type)
			j, _ := regionISO.find(reg.Replacement)
			// A deprecated region without a numeric code inherits the code
			// of its replacement.
			if k := m49map[i+isoOffset]; k == 0 {
				m49map[i+isoOffset] = m49map[j+isoOffset]
			}
		}
	}
	b.writeSortedMap("regionOldMap", &regionOldMap, func(s string) uint16 {
		return uint16(b.region.index(s))
	})
	// 3-digit region lookup, groupings.
	// Entries 1..isoOffset-1 are parsed as numbers and map to themselves.
	for i := 1; i < isoOffset; i++ {
		m := parseM49(b.region.s[i])
		m49map[i] = m
		fromM49map[m] = i
	}
	b.writeSlice("m49", m49map)

	// Each fromM49 entry packs the low searchBits bits of a numeric code
	// above a regionBits-wide region index. m49Index is indexed by the
	// remaining high bits of the code.
	const (
		searchBits = 7
		regionBits = 9
	)
	if len(m49map) >= 1<<regionBits {
		log.Fatalf("Maximum number of regions exceeded: %d > %d", len(m49map), 1<<regionBits)
	}
	m49Index := [9]int16{}
	fromM49 := []uint16{}
	m49 := []int{}
	for k, _ := range fromM49map {
		m49 = append(m49, int(k))
	}
	sort.Ints(m49)
	// The smallest key is skipped.
	// NOTE(review): presumably that key is always 0 (the "no code" value
	// returned by parseM49 for an empty string) — confirm.
	for _, k := range m49[1:] {
		val := (k & (1<<searchBits - 1)) << regionBits
		fromM49 = append(fromM49, uint16(val|fromM49map[int16(k)]))
		// Slot k>>searchBits+1 ends up holding the number of entries
		// appended so far once the last code of that bucket is processed.
		m49Index[1:][k>>searchBits] = int16(len(fromM49))
	}
	b.writeSlice("m49Index", m49Index)
	b.writeSlice("fromM49", fromM49)
}
  1058  
// Space-separated lists of 2-letter region codes, grouped by ISO 3166 status.
// NOTE(review): judging by the names these are the exceptionally reserved,
// transitionally reserved and deleted codes respectively — confirm.
const (
	// TODO: put these lists in regionTypes as user data? Could be used for
	// various optimizations and refinements and could be exposed in the API.
	iso3166Except = "AC CP DG EA EU FX IC SU TA UK"
	iso3166Trans  = "AN BU CS NT TP YU ZR" // SF is not in our set of Regions.
	// DY and RH are actually not deleted, but indeterminately reserved.
	iso3166DelCLDR = "CT DD DY FQ HV JT MI NH NQ PC PU PZ RH VD WK YD"
)
  1067  
// Bit flags stored per region in the regionTypes table built by writeRegion.
const (
	// iso3166UserAssigned is set for a 2-letter registry entry whose
	// description contains "Private use".
	iso3166UserAssigned = 1 << iota
	// ccTLD is set for regions whose lower-cased code is linked from the
	// IANA root domain database listing.
	ccTLD
	// bcp47Region is set for every 2-letter upper-case key in the registry.
	bcp47Region
)
  1073  
  1074  func find(list []string, s string) int {
  1075  	for i, t := range list {
  1076  		if t == s {
  1077  			return i
  1078  		}
  1079  	}
  1080  	return -1
  1081  }
  1082  
  1083  // writeVariant generates per-variant information and creates a map from variant
  1084  // name to index value. We assign index values such that sorting multiple
  1085  // variants by index value will result in the correct order.
  1086  // There are two types of variants: specialized and general. Specialized variants
  1087  // are only applicable to certain language or language-script pairs. Generalized
  1088  // variants apply to any language. Generalized variants always sort after
  1089  // specialized variants.  We will therefore always assign a higher index value
  1090  // to a generalized variant than any other variant. Generalized variants are
  1091  // sorted alphabetically among themselves.
  1092  // Specialized variants may also sort after other specialized variants. Such
  1093  // variants will be ordered after any of the variants they may follow.
  1094  // We assume that if a variant x is followed by a variant y, then for any prefix
  1095  // p of x, p-x is a prefix of y. This allows us to order tags based on the
  1096  // maximum of the length of any of its prefixes.
  1097  // TODO: it is possible to define a set of Prefix values on variants such that
  1098  // a total order cannot be defined to the point that this algorithm breaks.
  1099  // In other words, we cannot guarantee the same order of variants for the
  1100  // future using the same algorithm or for non-compliant combinations of
  1101  // variants. For this reason, consider using simple alphabetic sorting
  1102  // of variants and ignore Prefix restrictions altogether.
  1103  func (b *builder) writeVariant() {
  1104  	generalized := stringSet{}
  1105  	specialized := stringSet{}
  1106  	specializedExtend := stringSet{}
  1107  	// Collate the variants by type and check assumptions.
  1108  	for _, v := range b.variant.slice() {
  1109  		e := b.registry[v]
  1110  		if len(e.prefix) == 0 {
  1111  			generalized.add(v)
  1112  			continue
  1113  		}
  1114  		c := strings.Split(e.prefix[0], "-")
  1115  		hasScriptOrRegion := false
  1116  		if len(c) > 1 {
  1117  			_, hasScriptOrRegion = b.script.find(c[1])
  1118  			if !hasScriptOrRegion {
  1119  				_, hasScriptOrRegion = b.region.find(c[1])
  1120  
  1121  			}
  1122  		}
  1123  		if len(c) == 1 || len(c) == 2 && hasScriptOrRegion {
  1124  			// Variant is preceded by a language.
  1125  			specialized.add(v)
  1126  			continue
  1127  		}
  1128  		// Variant is preceded by another variant.
  1129  		specializedExtend.add(v)
  1130  		prefix := c[0] + "-"
  1131  		if hasScriptOrRegion {
  1132  			prefix += c[1]
  1133  		}
  1134  		for _, p := range e.prefix {
  1135  			// Verify that the prefix minus the last element is a prefix of the
  1136  			// predecessor element.
  1137  			i := strings.LastIndex(p, "-")
  1138  			pred := b.registry[p[i+1:]]
  1139  			if find(pred.prefix, p[:i]) < 0 {
  1140  				log.Fatalf("prefix %q for variant %q not consistent with predecessor spec", p, v)
  1141  			}
  1142  			// The sorting used below does not work in the general case. It works
  1143  			// if we assume that variants that may be followed by others only have
  1144  			// prefixes of the same length. Verify this.
  1145  			count := strings.Count(p[:i], "-")
  1146  			for _, q := range pred.prefix {
  1147  				if c := strings.Count(q, "-"); c != count {
  1148  					log.Fatalf("variant %q preceding %q has a prefix %q of size %d; want %d", p[i+1:], v, q, c, count)
  1149  				}
  1150  			}
  1151  			if !strings.HasPrefix(p, prefix) {
  1152  				log.Fatalf("prefix %q of variant %q should start with %q", p, v, prefix)
  1153  			}
  1154  		}
  1155  	}
  1156  
  1157  	// Sort extended variants.
  1158  	a := specializedExtend.s
  1159  	less := func(v, w string) bool {
  1160  		// Sort by the maximum number of elements.
  1161  		maxCount := func(s string) (max int) {
  1162  			for _, p := range b.registry[s].prefix {
  1163  				if c := strings.Count(p, "-"); c > max {
  1164  					max = c
  1165  				}
  1166  			}
  1167  			return
  1168  		}
  1169  		if cv, cw := maxCount(v), maxCount(w); cv != cw {
  1170  			return cv < cw
  1171  		}
  1172  		// Sort by name as tie breaker.
  1173  		return v < w
  1174  	}
  1175  	sort.Sort(funcSorter{less, sort.StringSlice(a)})
  1176  	specializedExtend.frozen = true
  1177  
  1178  	// Create index from variant name to index.
  1179  	variantIndex := make(map[string]uint8)
  1180  	add := func(s []string) {
  1181  		for _, v := range s {
  1182  			variantIndex[v] = uint8(len(variantIndex))
  1183  		}
  1184  	}
  1185  	add(specialized.slice())
  1186  	add(specializedExtend.s)
  1187  	numSpecialized := len(variantIndex)
  1188  	add(generalized.slice())
  1189  	if n := len(variantIndex); n > 255 {
  1190  		log.Fatalf("maximum number of variants exceeded: was %d; want <= 255", n)
  1191  	}
  1192  	b.writeMap("variantIndex", variantIndex)
  1193  	b.writeConst("variantNumSpecialized", numSpecialized)
  1194  }
  1195  
// writeLanguageInfo is an empty placeholder: it writes nothing, and main does
// not call it.
func (b *builder) writeLanguageInfo() {
}
  1198  
// writeLikelyData writes tables that are used both for finding parent relations and for
// language matching.  Each entry contains additional bits to indicate the status of the
// data to know when it cannot be used for parent relations.
//
// It reads the CLDR likelySubtag entries and writes the tables likelyScript,
// likelyLang, likelyLangList, likelyRegion, likelyRegionList and
// likelyRegionGroup, together with the struct types of their elements.
func (b *builder) writeLikelyData() {
	// Flag bits stored in the flags field of the generated entries.
	const (
		// isList marks an entry whose other two fields are an offset and a
		// length into the matching *List table instead of direct values.
		isList = 1 << iota
		scriptInFrom
		regionInFrom
	)
	type ( // generated types
		likelyScriptRegion struct {
			region uint16
			script uint16
			flags  uint8
		}
		likelyLangScript struct {
			lang   uint16
			script uint16
			flags  uint8
		}
		likelyLangRegion struct {
			lang   uint16
			region uint16
		}
		// likelyTag is used for getting likely tags for group regions, where
		// the likely region might be a region contained in the group.
		likelyTag struct {
			lang   uint16
			region uint16
			script uint16
		}
	)
	var ( // generated variables
		likelyRegionGroup = make([]likelyTag, len(b.groups))
		likelyLang        = make([]likelyScriptRegion, len(b.lang.s))
		likelyRegion      = make([]likelyLangScript, len(b.region.s))
		likelyScript      = make([]likelyLangRegion, len(b.script.s))
		likelyLangList    = []likelyScriptRegion{}
		likelyRegionList  = []likelyLangScript{}
	)
	// fromTo is one likelySubtag mapping with both sides split on "_".
	type fromTo struct {
		from, to []string
	}
	// langToOther and regionToOther collect the mappings per language index
	// (0 for "und") and per non-group region index respectively.
	langToOther := map[int][]fromTo{}
	regionToOther := map[int][]fromTo{}
	for _, m := range b.supp.LikelySubtags.LikelySubtag {
		from := strings.Split(m.From, "_")
		to := strings.Split(m.To, "_")
		// Sanity checks on the shape of the mapping.
		if len(to) != 3 {
			log.Fatalf("invalid number of subtags in %q: found %d, want 3", m.To, len(to))
		}
		if len(from) > 3 {
			log.Fatalf("invalid number of subtags: found %d, want 1-3", len(from))
		}
		if from[0] != to[0] && from[0] != "und" {
			log.Fatalf("unexpected language change in expansion: %s -> %s", from, to)
		}
		if len(from) == 3 {
			if from[2] != to[2] {
				log.Fatalf("unexpected region change in expansion: %s -> %s", from, to)
			}
			if from[0] != "und" {
				log.Fatalf("unexpected fully specified from tag: %s -> %s", from, to)
			}
		}
		// Dispatch on the form of the from tag:
		//   - a single subtag, or any tag with a defined language: keyed by
		//     language;
		//   - "und" plus a 4-letter second subtag: keyed by script;
		//   - anything else: keyed by the last subtag, taken as a region.
		if len(from) == 1 || from[0] != "und" {
			id := 0
			if from[0] != "und" {
				id = b.lang.index(from[0])
			}
			langToOther[id] = append(langToOther[id], fromTo{from, to})
		} else if len(from) == 2 && len(from[1]) == 4 {
			sid := b.script.index(from[1])
			likelyScript[sid].lang = uint16(b.langIndex(to[0]))
			likelyScript[sid].region = uint16(b.region.index(to[2]))
		} else {
			r := b.region.index(from[len(from)-1])
			if id, ok := b.groups[r]; ok {
				// The region is a group: store the full likely tag for it.
				if from[0] != "und" {
					log.Fatalf("region changed unexpectedly: %s -> %s", from, to)
				}
				likelyRegionGroup[id].lang = uint16(b.langIndex(to[0]))
				likelyRegionGroup[id].script = uint16(b.script.index(to[1]))
				likelyRegionGroup[id].region = uint16(b.region.index(to[2]))
			} else {
				regionToOther[r] = append(regionToOther[r], fromTo{from, to})
			}
		}
	}
	b.writeType(likelyLangRegion{})
	b.writeSlice("likelyScript", likelyScript)

	for id := range b.lang.s {
		list := langToOther[id]
		if len(list) == 1 {
			// A single mapping is stored directly.
			likelyLang[id].region = uint16(b.region.index(list[0].to[2]))
			likelyLang[id].script = uint16(b.script.index(list[0].to[1]))
		} else if len(list) > 1 {
			// Several mappings: region holds the start offset into
			// likelyLangList and script holds the number of entries.
			likelyLang[id].flags = isList
			likelyLang[id].region = uint16(len(likelyLangList))
			likelyLang[id].script = uint16(len(list))
			for _, x := range list {
				flags := uint8(0)
				if len(x.from) > 1 {
					// The second from subtag is taken to be the region when
					// it equals the target region, and a script otherwise.
					if x.from[1] == x.to[2] {
						flags = regionInFrom
					} else {
						flags = scriptInFrom
					}
				}
				likelyLangList = append(likelyLangList, likelyScriptRegion{
					region: uint16(b.region.index(x.to[2])),
					script: uint16(b.script.index(x.to[1])),
					flags:  flags,
				})
			}
		}
	}
	// TODO: merge suppressScript data with this table.
	b.writeType(likelyScriptRegion{})
	b.writeSlice("likelyLang", likelyLang)
	b.writeSlice("likelyLangList", likelyLangList)

	for id := range b.region.s {
		list := regionToOther[id]
		if len(list) == 1 {
			// A single mapping is stored directly.
			likelyRegion[id].lang = uint16(b.langIndex(list[0].to[0]))
			likelyRegion[id].script = uint16(b.script.index(list[0].to[1]))
			if len(list[0].from) > 2 {
				likelyRegion[id].flags = scriptInFrom
			}
		} else if len(list) > 1 {
			// Several mappings: lang holds the start offset into
			// likelyRegionList and script holds the number of entries.
			likelyRegion[id].flags = isList
			likelyRegion[id].lang = uint16(len(likelyRegionList))
			likelyRegion[id].script = uint16(len(list))
			for i, x := range list {
				// A 2-subtag from is only accepted at position 0, and every
				// later entry must have exactly 3 subtags.
				if len(x.from) == 2 && i != 0 || i > 0 && len(x.from) != 3 {
					log.Fatalf("unspecified script must be first in list: %v at %d", x.from, i)
				}
				// From here on x shadows the loop variable.
				x := likelyLangScript{
					lang:   uint16(b.langIndex(x.to[0])),
					script: uint16(b.script.index(x.to[1])),
				}
				// NOTE(review): this tests the first list entry, not the
				// current one, so all entries of a list get the same flag.
				// Confirm this is intended before changing it; the generated
				// table would change.
				if len(list[0].from) > 2 {
					x.flags = scriptInFrom
				}
				likelyRegionList = append(likelyRegionList, x)
			}
		}
	}
	b.writeType(likelyLangScript{})
	b.writeSlice("likelyRegion", likelyRegion)
	b.writeSlice("likelyRegionList", likelyRegionList)

	b.writeType(likelyTag{})
	b.writeSlice("likelyRegionGroup", likelyRegionGroup)
}
  1356  
// writeRegionInclusionData derives, from the CLDR territory containment
// groups, the tables regionContainment, regionInclusion, regionInclusionBits
// and regionInclusionNext. Groups are represented as bit positions in uint64
// bit vectors, using the group indices from b.groups.
func (b *builder) writeRegionInclusionData() {
	var (
		// mm is keyed by region index. For a member region it lists the
		// groups that directly contain it; for a group it additionally
		// lists the groups it directly contains. In other words, the groups
		// at a distance of 1.
		mm = make(map[int][]index)

		// containment lists for each group the groups it directly contains.
		// The transitive closure is computed from it further down.
		containment = make(map[index][]index)
	)
	for _, g := range b.supp.TerritoryContainment.Group {
		// Skip UN and EURO zone as they are flattening the containment
		// relationship.
		if g.Type == "EZ" || g.Type == "UN" {
			continue
		}
		group := b.region.index(g.Type)
		groupIdx := b.groups[group]
		for _, mem := range strings.Split(g.Contains, " ") {
			r := b.region.index(mem)
			mm[r] = append(mm[r], groupIdx)
			// Inside this if, g is the member's group index and shadows the
			// containment group from the outer loop.
			if g, ok := b.groups[r]; ok {
				mm[group] = append(mm[group], g)
				containment[groupIdx] = append(containment[groupIdx], g)
			}
		}
	}

	// regionContainment[g] is a bit mask with the bit of g itself and the
	// bits of all groups transitively contained in g.
	regionContainment := make([]uint64, len(b.groups))
	for _, g := range b.groups {
		l := containment[g]

		// Compute the transitive closure of containment.
		// l grows while it is being walked, so newly added groups are
		// expanded in turn.
		// NOTE(review): this terminates only if containment has no cycles —
		// presumably guaranteed by the data; confirm.
		for i := 0; i < len(l); i++ {
			l = append(l, containment[l[i]]...)
		}

		// Compute the bitmask.
		regionContainment[g] = 1 << g
		for _, v := range l {
			regionContainment[g] |= 1 << v
		}
	}
	b.writeSlice("regionContainment", regionContainment)

	// regionInclusion maps a region index to the index of its bit vector;
	// bvs maps each distinct bit vector to that index.
	regionInclusion := make([]uint8, len(b.region.s))
	bvs := make(map[uint64]index)
	// Make the first bitvector positions correspond with the groups.
	for r, i := range b.groups {
		bv := uint64(1 << i)
		for _, g := range mm[r] {
			bv |= 1 << g
		}
		bvs[bv] = i
		regionInclusion[r] = uint8(bvs[bv])
	}
	// Regions that are not groups get the vector of the groups listed for
	// them in mm, sharing an index with any region that has the same vector.
	for r := 1; r < len(b.region.s); r++ {
		if _, ok := b.groups[r]; !ok {
			bv := uint64(0)
			for _, g := range mm[r] {
				bv |= 1 << g
			}
			if bv == 0 {
				// Pick the world for unspecified regions.
				bv = 1 << b.groups[b.region.index("001")]
			}
			if _, ok := bvs[bv]; !ok {
				bvs[bv] = index(len(bvs))
			}
			regionInclusion[r] = uint8(bvs[bv])
		}
	}
	b.writeSlice("regionInclusion", regionInclusion)
	// Invert bvs into a slice from vector index to bit vector.
	regionInclusionBits := make([]uint64, len(bvs))
	for k, v := range bvs {
		regionInclusionBits[v] = uint64(k)
	}
	// Add bit vectors for increasingly large distances until a fixed point is reached.
	// regionInclusionNext[i] is the index of the vector obtained by OR-ing
	// into vector i the vectors of all groups whose bit is set in it. New
	// vectors are appended and visited by this same loop.
	regionInclusionNext := []uint8{}
	for i := 0; i < len(regionInclusionBits); i++ {
		bits := regionInclusionBits[i]
		next := bits
		// This inner i iterates over group bit positions and shadows the
		// outer vector index.
		for i := uint(0); i < uint(len(b.groups)); i++ {
			if bits&(1<<i) != 0 {
				next |= regionInclusionBits[i]
			}
		}
		if _, ok := bvs[next]; !ok {
			bvs[next] = index(len(bvs))
			regionInclusionBits = append(regionInclusionBits, next)
		}
		regionInclusionNext = append(regionInclusionNext, uint8(bvs[next]))
	}
	b.writeSlice("regionInclusionBits", regionInclusionBits)
	b.writeSlice("regionInclusionNext", regionInclusionNext)
}
  1452  
// parentRel is one parent-locale override as filled in by writeParents: a
// parent locale together with the regions of the locales that inherit from it.
type parentRel struct {
	// lang is the language index of the parent locale.
	lang uint16
	// script is the script index of the parent when the parent names one
	// explicitly; it is left 0 otherwise.
	script uint16
	// maxScript equals script when the parent names a script, and is the
	// index of "Latn" otherwise.
	maxScript uint16
	// toRegion is the region index of the parent locale.
	toRegion uint16
	// fromRegion holds the region index of each child locale.
	fromRegion []uint16
}
  1460  
  1461  func (b *builder) writeParents() {
  1462  	b.writeType(parentRel{})
  1463  
  1464  	parents := []parentRel{}
  1465  
  1466  	// Construct parent overrides.
  1467  	n := 0
  1468  	for _, p := range b.data.Supplemental().ParentLocales.ParentLocale {
  1469  		// Skipping non-standard scripts to root is implemented using addTags.
  1470  		if p.Parent == "root" {
  1471  			continue
  1472  		}
  1473  
  1474  		sub := strings.Split(p.Parent, "_")
  1475  		parent := parentRel{lang: b.langIndex(sub[0])}
  1476  		if len(sub) == 2 {
  1477  			// TODO: check that all undefined scripts are indeed Latn in these
  1478  			// cases.
  1479  			parent.maxScript = uint16(b.script.index("Latn"))
  1480  			parent.toRegion = uint16(b.region.index(sub[1]))
  1481  		} else {
  1482  			parent.script = uint16(b.script.index(sub[1]))
  1483  			parent.maxScript = parent.script
  1484  			parent.toRegion = uint16(b.region.index(sub[2]))
  1485  		}
  1486  		for _, c := range strings.Split(p.Locales, " ") {
  1487  			region := b.region.index(c[strings.LastIndex(c, "_")+1:])
  1488  			parent.fromRegion = append(parent.fromRegion, uint16(region))
  1489  		}
  1490  		parents = append(parents, parent)
  1491  		n += len(parent.fromRegion)
  1492  	}
  1493  	b.writeSliceAddSize("parents", n*2, parents)
  1494  }
  1495  
// main runs the generator: it repackages gen_common.go as common.go, then
// builds the language, script, region, variant, likely-subtag, region
// inclusion and parent tables and writes them to tables.go in package
// "language".
func main() {
	gen.Init()

	gen.Repackage("gen_common.go", "common.go", "language")

	w := gen.NewCodeWriter()
	// The file is written when main returns, after all tables were added to w.
	// NOTE(review): the output name is hard-coded here; the -output flag
	// declared at the top of this file is not consulted in this function —
	// confirm whether that is intended.
	defer w.WriteGoFile("tables.go", "language")

	fmt.Fprintln(w, `import "github.com/go-enjin/golang-org-x-text/internal/tag"`)

	b := newBuilder(w)
	gen.WriteCLDRVersion(w)

	b.parseIndices()
	b.writeType(FromTo{})
	b.writeLanguage()
	b.writeScript()
	b.writeRegion()
	b.writeVariant()
	// TODO: b.writeLocale()
	// computeRegionGroups runs before the two writers below, which both read
	// b.groups.
	b.computeRegionGroups()
	b.writeLikelyData()
	b.writeRegionInclusionData()
	b.writeParents()
}