github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/text/collate/maketables.go (about)

     1  // Copyright 2012 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build ignore
     6  
     7  // Collation table generator.
     8  // Data read from the web.
     9  
    10  package main
    11  
    12  import (
    13  	"archive/zip"
    14  	"bufio"
    15  	"bytes"
    16  	"flag"
    17  	"fmt"
    18  	"io"
    19  	"io/ioutil"
    20  	"log"
    21  	"os"
    22  	"regexp"
    23  	"sort"
    24  	"strconv"
    25  	"strings"
    26  	"unicode/utf8"
    27  
    28  	"golang.org/x/text/collate"
    29  	"golang.org/x/text/collate/build"
    30  	"golang.org/x/text/collate/colltab"
    31  	"golang.org/x/text/internal/gen"
    32  	"golang.org/x/text/language"
    33  	"golang.org/x/text/unicode/cldr"
    34  )
    35  
    36  var (
    37  	test = flag.Bool("test", false,
    38  		"test existing tables; can be used to compare web data with package data.")
    39  	short = flag.Bool("short", false, `Use "short" alternatives, when available.`)
    40  	draft = flag.Bool("draft", false, `Use draft versions, when available.`)
    41  	tags  = flag.String("tags", "", "build tags to be included after +build directive")
    42  	pkg   = flag.String("package", "collate",
    43  		"the name of the package in which the generated file is to be included")
    44  
    45  	tables = flagStringSetAllowAll("tables", "collate", "collate,chars",
    46  		"comma-spearated list of tables to generate.")
    47  	exclude = flagStringSet("exclude", "zh2", "",
    48  		"comma-separated list of languages to exclude.")
    49  	include = flagStringSet("include", "", "",
    50  		"comma-separated list of languages to include. Include trumps exclude.")
    51  	types = flagStringSetAllowAll("types", "", "",
    52  		"comma-separated list of types that should be included.")
    53  )
    54  
    55  // stringSet implements an ordered set based on a list.  It implements flag.Value
    56  // to allow a set to be specified as a comma-separated list.
    57  type stringSet struct {
    58  	s        []string
    59  	allowed  *stringSet
    60  	dirty    bool // needs compaction if true
    61  	all      bool
    62  	allowAll bool
    63  }
    64  
    65  func flagStringSet(name, def, allowed, usage string) *stringSet {
    66  	ss := &stringSet{}
    67  	if allowed != "" {
    68  		usage += fmt.Sprintf(" (allowed values: any of %s)", allowed)
    69  		ss.allowed = &stringSet{}
    70  		failOnError(ss.allowed.Set(allowed))
    71  	}
    72  	ss.Set(def)
    73  	flag.Var(ss, name, usage)
    74  	return ss
    75  }
    76  
    77  func flagStringSetAllowAll(name, def, allowed, usage string) *stringSet {
    78  	ss := &stringSet{allowAll: true}
    79  	if allowed == "" {
    80  		flag.Var(ss, name, usage+fmt.Sprintf(` Use "all" to select all.`))
    81  	} else {
    82  		ss.allowed = &stringSet{}
    83  		failOnError(ss.allowed.Set(allowed))
    84  		flag.Var(ss, name, usage+fmt.Sprintf(` (allowed values: "all" or any of %s)`, allowed))
    85  	}
    86  	ss.Set(def)
    87  	return ss
    88  }
    89  
    90  func (ss stringSet) Len() int {
    91  	return len(ss.s)
    92  }
    93  
    94  func (ss stringSet) String() string {
    95  	return strings.Join(ss.s, ",")
    96  }
    97  
    98  func (ss *stringSet) Set(s string) error {
    99  	if ss.allowAll && s == "all" {
   100  		ss.s = nil
   101  		ss.all = true
   102  		return nil
   103  	}
   104  	ss.s = ss.s[:0]
   105  	for _, s := range strings.Split(s, ",") {
   106  		if s := strings.TrimSpace(s); s != "" {
   107  			if ss.allowed != nil && !ss.allowed.contains(s) {
   108  				return fmt.Errorf("unsupported value %q; must be one of %s", s, ss.allowed)
   109  			}
   110  			ss.add(s)
   111  		}
   112  	}
   113  	ss.compact()
   114  	return nil
   115  }
   116  
   117  func (ss *stringSet) add(s string) {
   118  	ss.s = append(ss.s, s)
   119  	ss.dirty = true
   120  }
   121  
   122  func (ss *stringSet) values() []string {
   123  	ss.compact()
   124  	return ss.s
   125  }
   126  
   127  func (ss *stringSet) contains(s string) bool {
   128  	if ss.all {
   129  		return true
   130  	}
   131  	for _, v := range ss.s {
   132  		if v == s {
   133  			return true
   134  		}
   135  	}
   136  	return false
   137  }
   138  
   139  func (ss *stringSet) compact() {
   140  	if !ss.dirty {
   141  		return
   142  	}
   143  	a := ss.s
   144  	sort.Strings(a)
   145  	k := 0
   146  	for i := 1; i < len(a); i++ {
   147  		if a[k] != a[i] {
   148  			a[k+1] = a[i]
   149  			k++
   150  		}
   151  	}
   152  	ss.s = a[:k+1]
   153  	ss.dirty = false
   154  }
   155  
   156  func skipLang(l string) bool {
   157  	if include.Len() > 0 {
   158  		return !include.contains(l)
   159  	}
   160  	return exclude.contains(l)
   161  }
   162  
   163  // altInclude returns a list of alternatives (for the LDML alt attribute)
   164  // in order of preference.  An empty string in this list indicates the
   165  // default entry.
   166  func altInclude() []string {
   167  	l := []string{}
   168  	if *short {
   169  		l = append(l, "short")
   170  	}
   171  	l = append(l, "")
   172  	// TODO: handle draft using cldr.SetDraftLevel
   173  	if *draft {
   174  		l = append(l, "proposed")
   175  	}
   176  	return l
   177  }
   178  
   179  func failOnError(e error) {
   180  	if e != nil {
   181  		log.Panic(e)
   182  	}
   183  }
   184  
   185  func openArchive() *zip.Reader {
   186  	f := gen.OpenCLDRCoreZip()
   187  	buffer, err := ioutil.ReadAll(f)
   188  	f.Close()
   189  	failOnError(err)
   190  	archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer)))
   191  	failOnError(err)
   192  	return archive
   193  }
   194  
   195  // parseUCA parses a Default Unicode Collation Element Table of the format
   196  // specified in http://www.unicode.org/reports/tr10/#File_Format.
   197  // It returns the variable top.
   198  func parseUCA(builder *build.Builder) {
   199  	var r io.ReadCloser
   200  	var err error
   201  	for _, f := range openArchive().File {
   202  		if strings.HasSuffix(f.Name, "allkeys_CLDR.txt") {
   203  			r, err = f.Open()
   204  		}
   205  	}
   206  	if r == nil {
   207  		log.Fatal("File allkeys_CLDR.txt not found in archive.")
   208  	}
   209  	failOnError(err)
   210  	defer r.Close()
   211  	scanner := bufio.NewScanner(r)
   212  	colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`)
   213  	for i := 1; scanner.Scan(); i++ {
   214  		line := scanner.Text()
   215  		if len(line) == 0 || line[0] == '#' {
   216  			continue
   217  		}
   218  		if line[0] == '@' {
   219  			// parse properties
   220  			switch {
   221  			case strings.HasPrefix(line[1:], "version "):
   222  				a := strings.Split(line[1:], " ")
   223  				if a[1] != gen.UnicodeVersion() {
   224  					log.Fatalf("incompatible version %s; want %s", a[1], gen.UnicodeVersion())
   225  				}
   226  			case strings.HasPrefix(line[1:], "backwards "):
   227  				log.Fatalf("%d: unsupported option backwards", i)
   228  			default:
   229  				log.Printf("%d: unknown option %s", i, line[1:])
   230  			}
   231  		} else {
   232  			// parse entries
   233  			part := strings.Split(line, " ; ")
   234  			if len(part) != 2 {
   235  				log.Fatalf("%d: production rule without ';': %v", i, line)
   236  			}
   237  			lhs := []rune{}
   238  			for _, v := range strings.Split(part[0], " ") {
   239  				if v == "" {
   240  					continue
   241  				}
   242  				lhs = append(lhs, rune(convHex(i, v)))
   243  			}
   244  			var n int
   245  			var vars []int
   246  			rhs := [][]int{}
   247  			for i, m := range colelem.FindAllStringSubmatch(part[1], -1) {
   248  				n += len(m[0])
   249  				elem := []int{}
   250  				for _, h := range strings.Split(m[2], ".") {
   251  					elem = append(elem, convHex(i, h))
   252  				}
   253  				if m[1] == "*" {
   254  					vars = append(vars, i)
   255  				}
   256  				rhs = append(rhs, elem)
   257  			}
   258  			if len(part[1]) < n+3 || part[1][n+1] != '#' {
   259  				log.Fatalf("%d: expected comment; found %s", i, part[1][n:])
   260  			}
   261  			if *test {
   262  				testInput.add(string(lhs))
   263  			}
   264  			failOnError(builder.Add(lhs, rhs, vars))
   265  		}
   266  	}
   267  	if scanner.Err() != nil {
   268  		log.Fatal(scanner.Err())
   269  	}
   270  }
   271  
   272  func convHex(line int, s string) int {
   273  	r, e := strconv.ParseInt(s, 16, 32)
   274  	if e != nil {
   275  		log.Fatalf("%d: %v", line, e)
   276  	}
   277  	return int(r)
   278  }
   279  
   280  var testInput = stringSet{}
   281  
   282  var charRe = regexp.MustCompile(`&#x([0-9A-F]*);`)
   283  var tagRe = regexp.MustCompile(`<([a-z_]*)  */>`)
   284  
   285  var mainLocales = []string{}
   286  
   287  // charsets holds a list of exemplar characters per category.
   288  type charSets map[string][]string
   289  
   290  func (p charSets) fprint(w io.Writer) {
   291  	fmt.Fprintln(w, "[exN]string{")
   292  	for i, k := range []string{"", "contractions", "punctuation", "auxiliary", "currencySymbol", "index"} {
   293  		if set := p[k]; len(set) != 0 {
   294  			fmt.Fprintf(w, "\t\t%d: %q,\n", i, strings.Join(set, " "))
   295  		}
   296  	}
   297  	fmt.Fprintln(w, "\t},")
   298  }
   299  
// localeChars maps a locale name to the exemplar character sets recorded
// for it. It is filled by parseMain and read by printExemplarCharacters.
var localeChars = make(map[string]charSets)

// exemplarHeader is the Go source text that printExemplarCharacters writes
// before the exemplarCharacters table. It declares the exemplarType
// constants; their order matches the category order used by
// charSets.fprint, and exN is the number of categories.
const exemplarHeader = `
type exemplarType int
const (
	exCharacters exemplarType = iota
	exContractions
	exPunctuation
	exAuxiliary
	exCurrency
	exIndex
	exN
)
`
   314  
   315  func printExemplarCharacters(w io.Writer) {
   316  	fmt.Fprintln(w, exemplarHeader)
   317  	fmt.Fprintln(w, "var exemplarCharacters = map[string][exN]string{")
   318  	for _, loc := range mainLocales {
   319  		fmt.Fprintf(w, "\t%q: ", loc)
   320  		localeChars[loc].fprint(w)
   321  	}
   322  	fmt.Fprintln(w, "}")
   323  }
   324  
   325  func decodeCLDR(d *cldr.Decoder) *cldr.CLDR {
   326  	r := gen.OpenCLDRCoreZip()
   327  	data, err := d.DecodeZip(r)
   328  	failOnError(err)
   329  	return data
   330  }
   331  
   332  // parseMain parses XML files in the main directory of the CLDR core.zip file.
   333  func parseMain() {
   334  	d := &cldr.Decoder{}
   335  	d.SetDirFilter("main")
   336  	d.SetSectionFilter("characters")
   337  	data := decodeCLDR(d)
   338  	for _, loc := range data.Locales() {
   339  		x := data.RawLDML(loc)
   340  		if skipLang(x.Identity.Language.Type) {
   341  			continue
   342  		}
   343  		if x.Characters != nil {
   344  			x, _ = data.LDML(loc)
   345  			loc = language.Make(loc).String()
   346  			for _, ec := range x.Characters.ExemplarCharacters {
   347  				if ec.Draft != "" {
   348  					continue
   349  				}
   350  				if _, ok := localeChars[loc]; !ok {
   351  					mainLocales = append(mainLocales, loc)
   352  					localeChars[loc] = make(charSets)
   353  				}
   354  				localeChars[loc][ec.Type] = parseCharacters(ec.Data())
   355  			}
   356  		}
   357  	}
   358  }
   359  
   360  func parseCharacters(chars string) []string {
   361  	parseSingle := func(s string) (r rune, tail string, escaped bool) {
   362  		if s[0] == '\\' {
   363  			return rune(s[1]), s[2:], true
   364  		}
   365  		r, sz := utf8.DecodeRuneInString(s)
   366  		return r, s[sz:], false
   367  	}
   368  	chars = strings.TrimSpace(chars)
   369  	if n := len(chars) - 1; chars[n] == ']' && chars[0] == '[' {
   370  		chars = chars[1:n]
   371  	}
   372  	list := []string{}
   373  	var r, last, end rune
   374  	for len(chars) > 0 {
   375  		if chars[0] == '{' { // character sequence
   376  			buf := []rune{}
   377  			for chars = chars[1:]; len(chars) > 0; {
   378  				r, chars, _ = parseSingle(chars)
   379  				if r == '}' {
   380  					break
   381  				}
   382  				if r == ' ' {
   383  					log.Fatalf("space not supported in sequence %q", chars)
   384  				}
   385  				buf = append(buf, r)
   386  			}
   387  			list = append(list, string(buf))
   388  			last = 0
   389  		} else { // single character
   390  			escaped := false
   391  			r, chars, escaped = parseSingle(chars)
   392  			if r != ' ' {
   393  				if r == '-' && !escaped {
   394  					if last == 0 {
   395  						log.Fatal("'-' should be preceded by a character")
   396  					}
   397  					end, chars, _ = parseSingle(chars)
   398  					for ; last <= end; last++ {
   399  						list = append(list, string(last))
   400  					}
   401  					last = 0
   402  				} else {
   403  					list = append(list, string(r))
   404  					last = r
   405  				}
   406  			}
   407  		}
   408  	}
   409  	return list
   410  }
   411  
// fileRe matches the path of an XML file below a collation directory and
// captures the file name without its extension.
// NOTE(review): fileRe is not referenced in the visible source.
var fileRe = regexp.MustCompile(`.*/collation/(.*)\.xml`)
   413  
   414  // parseCollation parses XML files in the collation directory of the CLDR core.zip file.
   415  func parseCollation(b *build.Builder) {
   416  	d := &cldr.Decoder{}
   417  	d.SetDirFilter("collation")
   418  	data := decodeCLDR(d)
   419  	for _, loc := range data.Locales() {
   420  		x, err := data.LDML(loc)
   421  		failOnError(err)
   422  		if skipLang(x.Identity.Language.Type) {
   423  			continue
   424  		}
   425  		cs := x.Collations.Collation
   426  		sl := cldr.MakeSlice(&cs)
   427  		if len(types.s) == 0 {
   428  			sl.SelectAnyOf("type", x.Collations.Default())
   429  		} else if !types.all {
   430  			sl.SelectAnyOf("type", types.s...)
   431  		}
   432  		sl.SelectOnePerGroup("alt", altInclude())
   433  
   434  		for _, c := range cs {
   435  			id, err := language.Parse(loc)
   436  			if err != nil {
   437  				fmt.Fprintf(os.Stderr, "invalid locale: %q", err)
   438  				continue
   439  			}
   440  			// Support both old- and new-style defaults.
   441  			d := c.Type
   442  			if x.Collations.DefaultCollation == nil {
   443  				d = x.Collations.Default()
   444  			} else {
   445  				d = x.Collations.DefaultCollation.Data()
   446  			}
   447  			// We assume tables are being built either for search or collation,
   448  			// but not both. For search the default is always "search".
   449  			if d != c.Type && c.Type != "search" {
   450  				id, err = id.SetTypeForKey("co", c.Type)
   451  				failOnError(err)
   452  			}
   453  			t := b.Tailoring(id)
   454  			c.Process(processor{t})
   455  		}
   456  	}
   457  }
   458  
   459  type processor struct {
   460  	t *build.Tailoring
   461  }
   462  
   463  func (p processor) Reset(anchor string, before int) (err error) {
   464  	if before != 0 {
   465  		err = p.t.SetAnchorBefore(anchor)
   466  	} else {
   467  		err = p.t.SetAnchor(anchor)
   468  	}
   469  	failOnError(err)
   470  	return nil
   471  }
   472  
   473  func (p processor) Insert(level int, str, context, extend string) error {
   474  	str = context + str
   475  	if *test {
   476  		testInput.add(str)
   477  	}
   478  	// TODO: mimic bug in old maketables: remove.
   479  	err := p.t.Insert(colltab.Level(level-1), str, context+extend)
   480  	failOnError(err)
   481  	return nil
   482  }
   483  
   484  func (p processor) Index(id string) {
   485  }
   486  
   487  func testCollator(c *collate.Collator) {
   488  	c0 := collate.New(language.Und)
   489  
   490  	// iterator over all characters for all locales and check
   491  	// whether Key is equal.
   492  	buf := collate.Buffer{}
   493  
   494  	// Add all common and not too uncommon runes to the test set.
   495  	for i := rune(0); i < 0x30000; i++ {
   496  		testInput.add(string(i))
   497  	}
   498  	for i := rune(0xE0000); i < 0xF0000; i++ {
   499  		testInput.add(string(i))
   500  	}
   501  	for _, str := range testInput.values() {
   502  		k0 := c0.KeyFromString(&buf, str)
   503  		k := c.KeyFromString(&buf, str)
   504  		if !bytes.Equal(k0, k) {
   505  			failOnError(fmt.Errorf("test:%U: keys differ (%x vs %x)", []rune(str), k0, k))
   506  		}
   507  		buf.Reset()
   508  	}
   509  	fmt.Println("PASS")
   510  }
   511  
   512  func main() {
   513  	gen.Init()
   514  	b := build.NewBuilder()
   515  	parseUCA(b)
   516  	if tables.contains("chars") {
   517  		parseMain()
   518  	}
   519  	parseCollation(b)
   520  
   521  	c, err := b.Build()
   522  	failOnError(err)
   523  
   524  	if *test {
   525  		testCollator(collate.NewFromTable(c))
   526  	} else {
   527  		w := &bytes.Buffer{}
   528  
   529  		gen.WriteUnicodeVersion(w)
   530  		gen.WriteCLDRVersion(w)
   531  
   532  		if tables.contains("collate") {
   533  			_, err = b.Print(w)
   534  			failOnError(err)
   535  		}
   536  		if tables.contains("chars") {
   537  			printExemplarCharacters(w)
   538  		}
   539  		gen.WriteGoFile("tables.go", *pkg, w.Bytes())
   540  	}
   541  }