github.com/liquid-dev/text@v0.3.3-liquid/collate/maketables.go (about)

     1  // Copyright 2012 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build ignore
     6  
     7  // Collation table generator.
     8  // Data read from the web.
     9  
    10  package main
    11  
    12  import (
    13  	"archive/zip"
    14  	"bufio"
    15  	"bytes"
    16  	"flag"
    17  	"fmt"
    18  	"io"
    19  	"io/ioutil"
    20  	"log"
    21  	"os"
    22  	"regexp"
    23  	"sort"
    24  	"strconv"
    25  	"strings"
    26  	"unicode/utf8"
    27  
    28  	"github.com/liquid-dev/text/collate"
    29  	"github.com/liquid-dev/text/collate/build"
    30  	"github.com/liquid-dev/text/internal/colltab"
    31  	"github.com/liquid-dev/text/internal/gen"
    32  	"github.com/liquid-dev/text/language"
    33  	"github.com/liquid-dev/text/unicode/cldr"
    34  )
    35  
    36  var (
    37  	test = flag.Bool("test", false,
    38  		"test existing tables; can be used to compare web data with package data.")
    39  	short = flag.Bool("short", false, `Use "short" alternatives, when available.`)
    40  	draft = flag.Bool("draft", false, `Use draft versions, when available.`)
    41  	tags  = flag.String("tags", "", "build tags to be included after +build directive")
    42  	pkg   = flag.String("package", "collate",
    43  		"the name of the package in which the generated file is to be included")
    44  
    45  	tables = flagStringSetAllowAll("tables", "collate", "collate,chars",
    46  		"comma-spearated list of tables to generate.")
    47  	exclude = flagStringSet("exclude", "zh2", "",
    48  		"comma-separated list of languages to exclude.")
    49  	include = flagStringSet("include", "", "",
    50  		"comma-separated list of languages to include. Include trumps exclude.")
    51  	// TODO: Not included: unihan gb2312han zhuyin big5han (for size reasons)
    52  	// TODO: Not included: traditional (buggy for Bengali)
    53  	types = flagStringSetAllowAll("types", "standard,phonebook,phonetic,reformed,pinyin,stroke", "",
    54  		"comma-separated list of types that should be included.")
    55  )
    56  
    // stringSet is an ordered set of strings stored in a slice. It implements
    // flag.Value so that a set can be given on the command line as a
    // comma-separated list.
    type stringSet struct {
    	// s holds the elements. It may contain duplicates and be unsorted while
    	// dirty is true.
    	s []string
    	// allowed, when non-nil, is the set of values that Set accepts.
    	allowed *stringSet
    	// dirty reports that s needs compaction.
    	dirty bool
    	// all makes contains report true for every string.
    	all bool
    	// allowAll lets Set treat the value "all" as selecting everything.
    	allowAll bool
    }
    66  
    67  func flagStringSet(name, def, allowed, usage string) *stringSet {
    68  	ss := &stringSet{}
    69  	if allowed != "" {
    70  		usage += fmt.Sprintf(" (allowed values: any of %s)", allowed)
    71  		ss.allowed = &stringSet{}
    72  		failOnError(ss.allowed.Set(allowed))
    73  	}
    74  	ss.Set(def)
    75  	flag.Var(ss, name, usage)
    76  	return ss
    77  }
    78  
    79  func flagStringSetAllowAll(name, def, allowed, usage string) *stringSet {
    80  	ss := &stringSet{allowAll: true}
    81  	if allowed == "" {
    82  		flag.Var(ss, name, usage+fmt.Sprintf(` Use "all" to select all.`))
    83  	} else {
    84  		ss.allowed = &stringSet{}
    85  		failOnError(ss.allowed.Set(allowed))
    86  		flag.Var(ss, name, usage+fmt.Sprintf(` (allowed values: "all" or any of %s)`, allowed))
    87  	}
    88  	ss.Set(def)
    89  	return ss
    90  }
    91  
    92  func (ss stringSet) Len() int {
    93  	return len(ss.s)
    94  }
    95  
    96  func (ss stringSet) String() string {
    97  	return strings.Join(ss.s, ",")
    98  }
    99  
   100  func (ss *stringSet) Set(s string) error {
   101  	if ss.allowAll && s == "all" {
   102  		ss.s = nil
   103  		ss.all = true
   104  		return nil
   105  	}
   106  	ss.s = ss.s[:0]
   107  	for _, s := range strings.Split(s, ",") {
   108  		if s := strings.TrimSpace(s); s != "" {
   109  			if ss.allowed != nil && !ss.allowed.contains(s) {
   110  				return fmt.Errorf("unsupported value %q; must be one of %s", s, ss.allowed)
   111  			}
   112  			ss.add(s)
   113  		}
   114  	}
   115  	ss.compact()
   116  	return nil
   117  }
   118  
   119  func (ss *stringSet) add(s string) {
   120  	ss.s = append(ss.s, s)
   121  	ss.dirty = true
   122  }
   123  
   124  func (ss *stringSet) values() []string {
   125  	ss.compact()
   126  	return ss.s
   127  }
   128  
   129  func (ss *stringSet) contains(s string) bool {
   130  	if ss.all {
   131  		return true
   132  	}
   133  	for _, v := range ss.s {
   134  		if v == s {
   135  			return true
   136  		}
   137  	}
   138  	return false
   139  }
   140  
   141  func (ss *stringSet) compact() {
   142  	if !ss.dirty {
   143  		return
   144  	}
   145  	a := ss.s
   146  	sort.Strings(a)
   147  	k := 0
   148  	for i := 1; i < len(a); i++ {
   149  		if a[k] != a[i] {
   150  			a[k+1] = a[i]
   151  			k++
   152  		}
   153  	}
   154  	ss.s = a[:k+1]
   155  	ss.dirty = false
   156  }
   157  
   158  func skipLang(l string) bool {
   159  	if include.Len() > 0 {
   160  		return !include.contains(l)
   161  	}
   162  	return exclude.contains(l)
   163  }
   164  
   165  // altInclude returns a list of alternatives (for the LDML alt attribute)
   166  // in order of preference.  An empty string in this list indicates the
   167  // default entry.
   168  func altInclude() []string {
   169  	l := []string{}
   170  	if *short {
   171  		l = append(l, "short")
   172  	}
   173  	l = append(l, "")
   174  	// TODO: handle draft using cldr.SetDraftLevel
   175  	if *draft {
   176  		l = append(l, "proposed")
   177  	}
   178  	return l
   179  }
   180  
   // failOnError logs e and panics if e is non-nil; it does nothing otherwise.
   func failOnError(e error) {
   	if e == nil {
   		return
   	}
   	log.Panic(e)
   }
   186  
   187  func openArchive() *zip.Reader {
   188  	f := gen.OpenCLDRCoreZip()
   189  	buffer, err := ioutil.ReadAll(f)
   190  	f.Close()
   191  	failOnError(err)
   192  	archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer)))
   193  	failOnError(err)
   194  	return archive
   195  }
   196  
   197  // parseUCA parses a Default Unicode Collation Element Table of the format
   198  // specified in https://www.unicode.org/reports/tr10/#File_Format.
   199  // It returns the variable top.
   200  func parseUCA(builder *build.Builder) {
   201  	var r io.ReadCloser
   202  	var err error
   203  	for _, f := range openArchive().File {
   204  		if strings.HasSuffix(f.Name, "allkeys_CLDR.txt") {
   205  			r, err = f.Open()
   206  		}
   207  	}
   208  	if r == nil {
   209  		log.Fatal("File allkeys_CLDR.txt not found in archive.")
   210  	}
   211  	failOnError(err)
   212  	defer r.Close()
   213  	scanner := bufio.NewScanner(r)
   214  	colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`)
   215  	for i := 1; scanner.Scan(); i++ {
   216  		line := scanner.Text()
   217  		if len(line) == 0 || line[0] == '#' {
   218  			continue
   219  		}
   220  		if line[0] == '@' {
   221  			// parse properties
   222  			switch {
   223  			case strings.HasPrefix(line[1:], "version "):
   224  				a := strings.Split(line[1:], " ")
   225  				if a[1] != gen.UnicodeVersion() {
   226  					log.Fatalf("incompatible version %s; want %s", a[1], gen.UnicodeVersion())
   227  				}
   228  			case strings.HasPrefix(line[1:], "backwards "):
   229  				log.Fatalf("%d: unsupported option backwards", i)
   230  			default:
   231  				log.Printf("%d: unknown option %s", i, line[1:])
   232  			}
   233  		} else {
   234  			// parse entries
   235  			part := strings.Split(line, " ; ")
   236  			if len(part) != 2 {
   237  				log.Fatalf("%d: production rule without ';': %v", i, line)
   238  			}
   239  			lhs := []rune{}
   240  			for _, v := range strings.Split(part[0], " ") {
   241  				if v == "" {
   242  					continue
   243  				}
   244  				lhs = append(lhs, rune(convHex(i, v)))
   245  			}
   246  			var n int
   247  			var vars []int
   248  			rhs := [][]int{}
   249  			for i, m := range colelem.FindAllStringSubmatch(part[1], -1) {
   250  				n += len(m[0])
   251  				elem := []int{}
   252  				for _, h := range strings.Split(m[2], ".") {
   253  					elem = append(elem, convHex(i, h))
   254  				}
   255  				if m[1] == "*" {
   256  					vars = append(vars, i)
   257  				}
   258  				rhs = append(rhs, elem)
   259  			}
   260  			if len(part[1]) < n+3 || part[1][n+1] != '#' {
   261  				log.Fatalf("%d: expected comment; found %s", i, part[1][n:])
   262  			}
   263  			if *test {
   264  				testInput.add(string(lhs))
   265  			}
   266  			failOnError(builder.Add(lhs, rhs, vars))
   267  		}
   268  	}
   269  	if scanner.Err() != nil {
   270  		log.Fatal(scanner.Err())
   271  	}
   272  }
   273  
   // convHex parses s as a hexadecimal number that fits in 32 bits and returns
   // it. If s cannot be parsed, the program is terminated with a message that
   // starts with line, the number of the input line being processed.
   func convHex(line int, s string) int {
   	v, err := strconv.ParseInt(s, 16, 32)
   	if err == nil {
   		return int(v)
   	}
   	log.Fatalf("%d: %v", line, err)
   	return int(v) // not reached: log.Fatalf exits the process
   }
   281  
   282  var testInput = stringSet{}
   283  
   284  var charRe = regexp.MustCompile(`&#x([0-9A-F]*);`)
   285  var tagRe = regexp.MustCompile(`<([a-z_]*)  */>`)
   286  
   287  var mainLocales = []string{}
   288  
   // charSets holds a list of exemplar characters per category. The key is the
   // category name, with the empty string used for the first category.
   type charSets map[string][]string

   // fprint writes p to w as the body of a Go [exN]string composite literal,
   // followed by a comma. Each non-empty category becomes one indexed element
   // holding its entries joined by single spaces; empty categories are omitted.
   func (p charSets) fprint(w io.Writer) {
   	// The index written for a category is its position in this list, which
   	// lines up with the exemplarType constants declared in exemplarHeader.
   	categories := [...]string{"", "contractions", "punctuation", "auxiliary", "currencySymbol", "index"}

   	fmt.Fprintln(w, "[exN]string{")
   	for idx, cat := range categories {
   		entries := p[cat]
   		if len(entries) == 0 {
   			continue
   		}
   		fmt.Fprintf(w, "\t\t%d: %q,\n", idx, strings.Join(entries, " "))
   	}
   	fmt.Fprintln(w, "\t},")
   }
   301  
   302  var localeChars = make(map[string]charSets)
   303  
   304  const exemplarHeader = `
   305  type exemplarType int
   306  const (
   307  	exCharacters exemplarType = iota
   308  	exContractions
   309  	exPunctuation
   310  	exAuxiliary
   311  	exCurrency
   312  	exIndex
   313  	exN
   314  )
   315  `
   316  
   317  func printExemplarCharacters(w io.Writer) {
   318  	fmt.Fprintln(w, exemplarHeader)
   319  	fmt.Fprintln(w, "var exemplarCharacters = map[string][exN]string{")
   320  	for _, loc := range mainLocales {
   321  		fmt.Fprintf(w, "\t%q: ", loc)
   322  		localeChars[loc].fprint(w)
   323  	}
   324  	fmt.Fprintln(w, "}")
   325  }
   326  
   327  func decodeCLDR(d *cldr.Decoder) *cldr.CLDR {
   328  	r := gen.OpenCLDRCoreZip()
   329  	data, err := d.DecodeZip(r)
   330  	failOnError(err)
   331  	return data
   332  }
   333  
   334  // parseMain parses XML files in the main directory of the CLDR core.zip file.
   335  func parseMain() {
   336  	d := &cldr.Decoder{}
   337  	d.SetDirFilter("main")
   338  	d.SetSectionFilter("characters")
   339  	data := decodeCLDR(d)
   340  	for _, loc := range data.Locales() {
   341  		x := data.RawLDML(loc)
   342  		if skipLang(x.Identity.Language.Type) {
   343  			continue
   344  		}
   345  		if x.Characters != nil {
   346  			x, _ = data.LDML(loc)
   347  			loc = language.Make(loc).String()
   348  			for _, ec := range x.Characters.ExemplarCharacters {
   349  				if ec.Draft != "" {
   350  					continue
   351  				}
   352  				if _, ok := localeChars[loc]; !ok {
   353  					mainLocales = append(mainLocales, loc)
   354  					localeChars[loc] = make(charSets)
   355  				}
   356  				localeChars[loc][ec.Type] = parseCharacters(ec.Data())
   357  			}
   358  		}
   359  	}
   360  }
   361  
   // parseCharacters splits an exemplar character set such as "[a b {ch} x-z]"
   // into its members and returns them in order of appearance.
   //
   // Surrounding white space and one enclosing pair of square brackets are
   // removed first. After that:
   //   - a space separates members and is never a member itself;
   //   - "{...}" is a sequence that becomes a single multi-character member;
   //   - "x-z" is a range and yields every character from x through z once;
   //   - a backslash makes the following character literal, so "\-" is a hyphen
   //     and "\}" does not end a sequence.
   //
   // An empty set yields an empty list. A space inside a sequence, a range
   // without a start or an end, and a trailing backslash terminate the program.
   func parseCharacters(chars string) []string {
   	// parseSingle decodes the first, possibly escaped, character of the
   	// non-empty string s and returns it together with the rest of s.
   	parseSingle := func(s string) (rune, string, bool) {
   		if s[0] != '\\' {
   			r, sz := utf8.DecodeRuneInString(s)
   			return r, s[sz:], false
   		}
   		if len(s) == 1 {
   			log.Fatalf("'\\' should be followed by a character in %q", s)
   		}
   		// Decode a whole rune so that an escaped non-ASCII character is not
   		// cut down to its first byte.
   		r, sz := utf8.DecodeRuneInString(s[1:])
   		return r, s[1+sz:], true
   	}

   	chars = strings.TrimSpace(chars)
   	if n := len(chars); n >= 2 && chars[0] == '[' && chars[n-1] == ']' {
   		chars = chars[1 : n-1]
   	}

   	list := []string{}
   	// last is the most recent single character, which may start a range; it
   	// is 0 when there is none.
   	var last rune
   	for len(chars) > 0 {
   		if chars[0] == '{' { // character sequence
   			var seq []rune
   			chars = chars[1:]
   			for len(chars) > 0 {
   				r, rest, escaped := parseSingle(chars)
   				chars = rest
   				if r == '}' && !escaped {
   					break
   				}
   				if r == ' ' {
   					log.Fatalf("space not supported in sequence %q", chars)
   				}
   				seq = append(seq, r)
   			}
   			list = append(list, string(seq))
   			last = 0
   			continue
   		}

   		// single character
   		r, rest, escaped := parseSingle(chars)
   		chars = rest
   		switch {
   		case r == ' ':
   			// separator
   		case r == '-' && !escaped:
   			if last == 0 {
   				log.Fatal("'-' should be preceded by a character")
   			}
   			if len(chars) == 0 {
   				log.Fatal("'-' should be followed by a character")
   			}
   			end, rest, _ := parseSingle(chars)
   			chars = rest
   			// The start of the range was already added when it was read
   			// as a single character, so continue with its successor.
   			for c := last + 1; c <= end; c++ {
   				list = append(list, string(c))
   			}
   			last = 0
   		default:
   			list = append(list, string(r))
   			last = r
   		}
   	}
   	return list
   }
   413  
   var (
   	// fileRe matches a path that contains "/collation/<name>.xml" and
   	// captures <name>. It is not referenced by the code visible in this
   	// file.
   	fileRe = regexp.MustCompile(`.*/collation/(.*)\.xml`)

   	// typeMap translates legacy type keys to their BCP47 equivalent.
   	typeMap = map[string]string{
   		"traditional": "trad",
   		"phonebook":   "phonebk",
   	}
   )
   421  
   422  // parseCollation parses XML files in the collation directory of the CLDR core.zip file.
   423  func parseCollation(b *build.Builder) {
   424  	d := &cldr.Decoder{}
   425  	d.SetDirFilter("collation")
   426  	data := decodeCLDR(d)
   427  	for _, loc := range data.Locales() {
   428  		x, err := data.LDML(loc)
   429  		failOnError(err)
   430  		if skipLang(x.Identity.Language.Type) {
   431  			continue
   432  		}
   433  		cs := x.Collations.Collation
   434  		sl := cldr.MakeSlice(&cs)
   435  		if len(types.s) == 0 {
   436  			sl.SelectAnyOf("type", x.Collations.Default())
   437  		} else if !types.all {
   438  			sl.SelectAnyOf("type", types.s...)
   439  		}
   440  		sl.SelectOnePerGroup("alt", altInclude())
   441  
   442  		for _, c := range cs {
   443  			id, err := language.Parse(loc)
   444  			if err != nil {
   445  				fmt.Fprintf(os.Stderr, "invalid locale: %q", err)
   446  				continue
   447  			}
   448  			// Support both old- and new-style defaults.
   449  			d := c.Type
   450  			if x.Collations.DefaultCollation == nil {
   451  				d = x.Collations.Default()
   452  			} else {
   453  				d = x.Collations.DefaultCollation.Data()
   454  			}
   455  			// We assume tables are being built either for search or collation,
   456  			// but not both. For search the default is always "search".
   457  			if d != c.Type && c.Type != "search" {
   458  				typ := c.Type
   459  				if len(c.Type) > 8 {
   460  					typ = typeMap[c.Type]
   461  				}
   462  				id, err = id.SetTypeForKey("co", typ)
   463  				failOnError(err)
   464  			}
   465  			t := b.Tailoring(id)
   466  			c.Process(processor{t})
   467  		}
   468  	}
   469  }
   470  
   471  type processor struct {
   472  	t *build.Tailoring
   473  }
   474  
   475  func (p processor) Reset(anchor string, before int) (err error) {
   476  	if before != 0 {
   477  		err = p.t.SetAnchorBefore(anchor)
   478  	} else {
   479  		err = p.t.SetAnchor(anchor)
   480  	}
   481  	failOnError(err)
   482  	return nil
   483  }
   484  
   485  func (p processor) Insert(level int, str, context, extend string) error {
   486  	str = context + str
   487  	if *test {
   488  		testInput.add(str)
   489  	}
   490  	// TODO: mimic bug in old maketables: remove.
   491  	err := p.t.Insert(colltab.Level(level-1), str, context+extend)
   492  	failOnError(err)
   493  	return nil
   494  }
   495  
   496  func (p processor) Index(id string) {
   497  }
   498  
   499  func testCollator(c *collate.Collator) {
   500  	c0 := collate.New(language.Und)
   501  
   502  	// iterator over all characters for all locales and check
   503  	// whether Key is equal.
   504  	buf := collate.Buffer{}
   505  
   506  	// Add all common and not too uncommon runes to the test set.
   507  	for i := rune(0); i < 0x30000; i++ {
   508  		testInput.add(string(i))
   509  	}
   510  	for i := rune(0xE0000); i < 0xF0000; i++ {
   511  		testInput.add(string(i))
   512  	}
   513  	for _, str := range testInput.values() {
   514  		k0 := c0.KeyFromString(&buf, str)
   515  		k := c.KeyFromString(&buf, str)
   516  		if !bytes.Equal(k0, k) {
   517  			failOnError(fmt.Errorf("test:%U: keys differ (%x vs %x)", []rune(str), k0, k))
   518  		}
   519  		buf.Reset()
   520  	}
   521  	fmt.Println("PASS")
   522  }
   523  
   524  func main() {
   525  	gen.Init()
   526  	b := build.NewBuilder()
   527  	parseUCA(b)
   528  	if tables.contains("chars") {
   529  		parseMain()
   530  	}
   531  	parseCollation(b)
   532  
   533  	c, err := b.Build()
   534  	failOnError(err)
   535  
   536  	if *test {
   537  		testCollator(collate.NewFromTable(c))
   538  	} else {
   539  		w := &bytes.Buffer{}
   540  
   541  		gen.WriteUnicodeVersion(w)
   542  		gen.WriteCLDRVersion(w)
   543  
   544  		if tables.contains("collate") {
   545  			_, err = b.Print(w)
   546  			failOnError(err)
   547  		}
   548  		if tables.contains("chars") {
   549  			printExemplarCharacters(w)
   550  		}
   551  		gen.WriteGoFile("tables.go", *pkg, w.Bytes())
   552  	}
   553  }