github.com/go-enjin/golang-org-x-text@v0.12.1-enjin.2/internal/export/unicode/gen.go (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build ignore
     6  // +build ignore
     7  
     8  // Unicode table generator.
     9  // Data read from the web.
    10  
    11  package main
    12  
    13  import (
    14  	"flag"
    15  	"fmt"
    16  	"log"
    17  	"os"
    18  	"regexp"
    19  	"sort"
    20  	"strings"
    21  	"unicode"
    22  
    23  	"github.com/go-enjin/golang-org-x-text/internal/gen"
    24  	"github.com/go-enjin/golang-org-x-text/internal/ucd"
    25  	"github.com/go-enjin/golang-org-x-text/unicode/rangetable"
    26  )
    27  
    28  func main() {
    29  	flag.Parse()
    30  	setupOutput()
    31  	loadChars() // always needed
    32  	loadCasefold()
    33  	printCategories()
    34  	printScriptOrProperty(false)
    35  	printScriptOrProperty(true)
    36  	printCases()
    37  	printLatinProperties()
    38  	printCasefold()
    39  	printSizes()
    40  	flushOutput()
    41  }
    42  
    43  func defaultVersion() string {
    44  	if v := os.Getenv("UNICODE_VERSION"); v != "" {
    45  		return v
    46  	}
    47  	return unicode.Version
    48  }
    49  
    50  var tablelist = flag.String("tables",
    51  	"all",
    52  	"comma-separated list of which tables to generate; can be letter")
    53  var scriptlist = flag.String("scripts",
    54  	"all",
    55  	"comma-separated list of which script tables to generate")
    56  var proplist = flag.String("props",
    57  	"all",
    58  	"comma-separated list of which property tables to generate")
    59  var cases = flag.Bool("cases",
    60  	true,
    61  	"generate case tables")
    62  var test = flag.Bool("test",
    63  	false,
    64  	"test existing tables; can be used to compare web data with package data")
    65  
    66  var scriptRe = regexp.MustCompile(`^([0-9A-F]+)(\.\.[0-9A-F]+)? *; ([A-Za-z_]+)$`)
    67  var logger = log.New(os.Stderr, "", log.Lshortfile)
    68  
    69  var output *gen.CodeWriter
    70  
    71  func setupOutput() {
    72  	output = gen.NewCodeWriter()
    73  }
    74  
    75  func flushOutput() {
    76  	output.WriteGoFile("tables.go", "unicode")
    77  }
    78  
    79  func printf(format string, args ...interface{}) {
    80  	fmt.Fprintf(output, format, args...)
    81  }
    82  
    83  func print(args ...interface{}) {
    84  	fmt.Fprint(output, args...)
    85  }
    86  
    87  func println(args ...interface{}) {
    88  	fmt.Fprintln(output, args...)
    89  }
    90  
    91  var category = map[string]bool{
    92  	// Nd Lu etc.
    93  	// We use one-character names to identify merged categories
    94  	"L": true, // Lu Ll Lt Lm Lo
    95  	"P": true, // Pc Pd Ps Pe Pu Pf Po
    96  	"M": true, // Mn Mc Me
    97  	"N": true, // Nd Nl No
    98  	"S": true, // Sm Sc Sk So
    99  	"Z": true, // Zs Zl Zp
   100  	"C": true, // Cc Cf Cs Co Cn
   101  }
   102  
   103  // This contains only the properties we're interested in.
   104  type Char struct {
   105  	codePoint rune // if zero, this index is not a valid code point.
   106  	category  string
   107  	upperCase rune
   108  	lowerCase rune
   109  	titleCase rune
   110  	foldCase  rune // simple case folding
   111  	caseOrbit rune // next in simple case folding orbit
   112  }
   113  
   114  const MaxChar = 0x10FFFF
   115  
   116  var chars = make([]Char, MaxChar+1)
   117  var scripts = make(map[string][]rune)
   118  var props = make(map[string][]rune) // a property looks like a script; can share the format
   119  
   120  func allCategories() []string {
   121  	a := make([]string, 0, len(category))
   122  	for k := range category {
   123  		a = append(a, k)
   124  	}
   125  	sort.Strings(a)
   126  	return a
   127  }
   128  
   129  func all(scripts map[string][]rune) []string {
   130  	a := make([]string, 0, len(scripts))
   131  	for k := range scripts {
   132  		a = append(a, k)
   133  	}
   134  	sort.Strings(a)
   135  	return a
   136  }
   137  
   138  func allCatFold(m map[string]map[rune]bool) []string {
   139  	a := make([]string, 0, len(m))
   140  	for k := range m {
   141  		a = append(a, k)
   142  	}
   143  	sort.Strings(a)
   144  	return a
   145  }
   146  
   147  func categoryOp(code rune, class uint8) bool {
   148  	category := chars[code].category
   149  	return len(category) > 0 && category[0] == class
   150  }
   151  
   152  func loadChars() {
   153  	ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
   154  		c := Char{codePoint: p.Rune(0)}
   155  
   156  		getRune := func(field int) rune {
   157  			if p.String(field) == "" {
   158  				return 0
   159  			}
   160  			return p.Rune(field)
   161  		}
   162  
   163  		c.category = p.String(ucd.GeneralCategory)
   164  		category[c.category] = true
   165  		switch c.category {
   166  		case "Nd":
   167  			// Decimal digit
   168  			p.Int(ucd.NumericValue)
   169  		case "Lu":
   170  			c.upperCase = getRune(ucd.CodePoint)
   171  			c.lowerCase = getRune(ucd.SimpleLowercaseMapping)
   172  			c.titleCase = getRune(ucd.SimpleTitlecaseMapping)
   173  		case "Ll":
   174  			c.upperCase = getRune(ucd.SimpleUppercaseMapping)
   175  			c.lowerCase = getRune(ucd.CodePoint)
   176  			c.titleCase = getRune(ucd.SimpleTitlecaseMapping)
   177  		case "Lt":
   178  			c.upperCase = getRune(ucd.SimpleUppercaseMapping)
   179  			c.lowerCase = getRune(ucd.SimpleLowercaseMapping)
   180  			c.titleCase = getRune(ucd.CodePoint)
   181  		default:
   182  			c.upperCase = getRune(ucd.SimpleUppercaseMapping)
   183  			c.lowerCase = getRune(ucd.SimpleLowercaseMapping)
   184  			c.titleCase = getRune(ucd.SimpleTitlecaseMapping)
   185  		}
   186  
   187  		chars[c.codePoint] = c
   188  	})
   189  }
   190  
   191  func loadCasefold() {
   192  	ucd.Parse(gen.OpenUCDFile("CaseFolding.txt"), func(p *ucd.Parser) {
   193  		kind := p.String(1)
   194  		if kind != "C" && kind != "S" {
   195  			// Only care about 'common' and 'simple' foldings.
   196  			return
   197  		}
   198  		p1 := p.Rune(0)
   199  		p2 := p.Rune(2)
   200  		chars[p1].foldCase = rune(p2)
   201  	})
   202  }
   203  
   204  var categoryMapping = map[string]string{
   205  	"Lu": "Letter, uppercase",
   206  	"Ll": "Letter, lowercase",
   207  	"Lt": "Letter, titlecase",
   208  	"Lm": "Letter, modifier",
   209  	"Lo": "Letter, other",
   210  	"Mn": "Mark, nonspacing",
   211  	"Mc": "Mark, spacing combining",
   212  	"Me": "Mark, enclosing",
   213  	"Nd": "Number, decimal digit",
   214  	"Nl": "Number, letter",
   215  	"No": "Number, other",
   216  	"Pc": "Punctuation, connector",
   217  	"Pd": "Punctuation, dash",
   218  	"Ps": "Punctuation, open",
   219  	"Pe": "Punctuation, close",
   220  	"Pi": "Punctuation, initial quote",
   221  	"Pf": "Punctuation, final quote",
   222  	"Po": "Punctuation, other",
   223  	"Sm": "Symbol, math",
   224  	"Sc": "Symbol, currency",
   225  	"Sk": "Symbol, modifier",
   226  	"So": "Symbol, other",
   227  	"Zs": "Separator, space",
   228  	"Zl": "Separator, line",
   229  	"Zp": "Separator, paragraph",
   230  	"Cc": "Other, control",
   231  	"Cf": "Other, format",
   232  	"Cs": "Other, surrogate",
   233  	"Co": "Other, private use",
   234  	"Cn": "Other, not assigned",
   235  }
   236  
   237  func printCategories() {
   238  	if *tablelist == "" {
   239  		return
   240  	}
   241  	// Find out which categories to dump
   242  	list := strings.Split(*tablelist, ",")
   243  	if *tablelist == "all" {
   244  		list = allCategories()
   245  	}
   246  	if *test {
   247  		fullCategoryTest(list)
   248  		return
   249  	}
   250  
   251  	println("// Version is the Unicode edition from which the tables are derived.")
   252  	printf("const Version = %q\n\n", gen.UnicodeVersion())
   253  
   254  	if *tablelist == "all" {
   255  		println("// Categories is the set of Unicode category tables.")
   256  		println("var Categories = map[string] *RangeTable {")
   257  		for _, k := range allCategories() {
   258  			printf("\t%q: %s,\n", k, k)
   259  		}
   260  		print("}\n\n")
   261  	}
   262  
   263  	decl := make(sort.StringSlice, len(list))
   264  	ndecl := 0
   265  	for _, name := range list {
   266  		if _, ok := category[name]; !ok {
   267  			logger.Fatal("unknown category", name)
   268  		}
   269  		// We generate an UpperCase name to serve as concise documentation and an _UnderScored
   270  		// name to store the data. This stops godoc dumping all the tables but keeps them
   271  		// available to clients.
   272  		// Cases deserving special comments
   273  		varDecl := ""
   274  		switch name {
   275  		case "C":
   276  			varDecl = "\tOther = _C;	// Other/C is the set of Unicode control and special characters, category C.\n"
   277  			varDecl += "\tC = _C\n"
   278  		case "L":
   279  			varDecl = "\tLetter = _L;	// Letter/L is the set of Unicode letters, category L.\n"
   280  			varDecl += "\tL = _L\n"
   281  		case "M":
   282  			varDecl = "\tMark = _M;	// Mark/M is the set of Unicode mark characters, category M.\n"
   283  			varDecl += "\tM = _M\n"
   284  		case "N":
   285  			varDecl = "\tNumber = _N;	// Number/N is the set of Unicode number characters, category N.\n"
   286  			varDecl += "\tN = _N\n"
   287  		case "P":
   288  			varDecl = "\tPunct = _P;	// Punct/P is the set of Unicode punctuation characters, category P.\n"
   289  			varDecl += "\tP = _P\n"
   290  		case "S":
   291  			varDecl = "\tSymbol = _S;	// Symbol/S is the set of Unicode symbol characters, category S.\n"
   292  			varDecl += "\tS = _S\n"
   293  		case "Z":
   294  			varDecl = "\tSpace = _Z;	// Space/Z is the set of Unicode space characters, category Z.\n"
   295  			varDecl += "\tZ = _Z\n"
   296  		case "Nd":
   297  			varDecl = "\tDigit = _Nd;	// Digit is the set of Unicode characters with the \"decimal digit\" property.\n"
   298  		case "Lu":
   299  			varDecl = "\tUpper = _Lu;	// Upper is the set of Unicode upper case letters.\n"
   300  		case "Ll":
   301  			varDecl = "\tLower = _Ll;	// Lower is the set of Unicode lower case letters.\n"
   302  		case "Lt":
   303  			varDecl = "\tTitle = _Lt;	// Title is the set of Unicode title case letters.\n"
   304  		}
   305  		if len(name) > 1 {
   306  			desc, ok := categoryMapping[name]
   307  			if ok {
   308  				varDecl += fmt.Sprintf(
   309  					"\t%s = _%s;	// %s is the set of Unicode characters in category %s (%s).\n",
   310  					name, name, name, name, desc)
   311  			} else {
   312  				varDecl += fmt.Sprintf(
   313  					"\t%s = _%s;	// %s is the set of Unicode characters in category %s.\n",
   314  					name, name, name, name)
   315  			}
   316  		}
   317  		decl[ndecl] = varDecl
   318  		ndecl++
   319  		if len(name) == 1 { // unified categories
   320  			dumpRange(
   321  				"_"+name,
   322  				func(code rune) bool { return categoryOp(code, name[0]) })
   323  			continue
   324  		}
   325  		dumpRange("_"+name,
   326  			func(code rune) bool { return chars[code].category == name })
   327  	}
   328  	decl.Sort()
   329  	println("// These variables have type *RangeTable.")
   330  	println("var (")
   331  	for _, d := range decl {
   332  		print(d)
   333  	}
   334  	print(")\n\n")
   335  }
   336  
   337  type Op func(code rune) bool
   338  
   339  func dumpRange(name string, inCategory Op) {
   340  	runes := []rune{}
   341  	for i := range chars {
   342  		r := rune(i)
   343  		if inCategory(r) {
   344  			runes = append(runes, r)
   345  		}
   346  	}
   347  	printRangeTable(name, runes)
   348  }
   349  
   350  func printRangeTable(name string, runes []rune) {
   351  	rt := rangetable.New(runes...)
   352  	printf("var %s = &RangeTable{\n", name)
   353  	println("\tR16: []Range16{")
   354  	for _, r := range rt.R16 {
   355  		printf("\t\t{%#04x, %#04x, %d},\n", r.Lo, r.Hi, r.Stride)
   356  		range16Count++
   357  	}
   358  	println("\t},")
   359  	if len(rt.R32) > 0 {
   360  		println("\tR32: []Range32{")
   361  		for _, r := range rt.R32 {
   362  			printf("\t\t{%#x, %#x, %d},\n", r.Lo, r.Hi, r.Stride)
   363  			range32Count++
   364  		}
   365  		println("\t},")
   366  	}
   367  	if rt.LatinOffset > 0 {
   368  		printf("\tLatinOffset: %d,\n", rt.LatinOffset)
   369  	}
   370  	printf("}\n\n")
   371  }
   372  
   373  func fullCategoryTest(list []string) {
   374  	for _, name := range list {
   375  		if _, ok := category[name]; !ok {
   376  			logger.Fatal("unknown category", name)
   377  		}
   378  		r, ok := unicode.Categories[name]
   379  		if !ok && len(name) > 1 {
   380  			logger.Fatalf("unknown table %q", name)
   381  		}
   382  		if len(name) == 1 {
   383  			verifyRange(name, func(code rune) bool { return categoryOp(code, name[0]) }, r)
   384  		} else {
   385  			verifyRange(
   386  				name,
   387  				func(code rune) bool { return chars[code].category == name },
   388  				r)
   389  		}
   390  	}
   391  }
   392  
   393  func verifyRange(name string, inCategory Op, table *unicode.RangeTable) {
   394  	count := 0
   395  	for j := range chars {
   396  		i := rune(j)
   397  		web := inCategory(i)
   398  		pkg := unicode.Is(table, i)
   399  		if web != pkg {
   400  			fmt.Fprintf(os.Stderr, "%s: %U: web=%t pkg=%t\n", name, i, web, pkg)
   401  			count++
   402  			if count > 10 {
   403  				break
   404  			}
   405  		}
   406  	}
   407  }
   408  
   409  func fullScriptTest(list []string, installed map[string]*unicode.RangeTable, scripts map[string][]rune) {
   410  	for _, name := range list {
   411  		if _, ok := scripts[name]; !ok {
   412  			logger.Fatal("unknown script", name)
   413  		}
   414  		_, ok := installed[name]
   415  		if !ok {
   416  			logger.Fatal("unknown table", name)
   417  		}
   418  		for _, r := range scripts[name] {
   419  			if !unicode.Is(installed[name], rune(r)) {
   420  				fmt.Fprintf(os.Stderr, "%U: not in script %s\n", r, name)
   421  			}
   422  		}
   423  	}
   424  }
   425  
   426  var deprecatedAliases = map[string]string{
   427  	"Sentence_Terminal": "STerm",
   428  }
   429  
   430  // PropList.txt has the same format as Scripts.txt so we can share its parser.
   431  func printScriptOrProperty(doProps bool) {
   432  	flaglist := *scriptlist
   433  	file := "Scripts.txt"
   434  	table := scripts
   435  	installed := unicode.Scripts
   436  	if doProps {
   437  		flaglist = *proplist
   438  		file = "PropList.txt"
   439  		table = props
   440  		installed = unicode.Properties
   441  	}
   442  	if flaglist == "" {
   443  		return
   444  	}
   445  	ucd.Parse(gen.OpenUCDFile(file), func(p *ucd.Parser) {
   446  		name := p.String(1)
   447  		table[name] = append(table[name], p.Rune(0))
   448  	})
   449  	// Find out which scripts to dump
   450  	list := strings.Split(flaglist, ",")
   451  	if flaglist == "all" {
   452  		list = all(table)
   453  	}
   454  	if *test {
   455  		fullScriptTest(list, installed, table)
   456  		return
   457  	}
   458  
   459  	if flaglist == "all" {
   460  		if doProps {
   461  			println("// Properties is the set of Unicode property tables.")
   462  			println("var Properties = map[string] *RangeTable{")
   463  		} else {
   464  			println("// Scripts is the set of Unicode script tables.")
   465  			println("var Scripts = map[string] *RangeTable{")
   466  		}
   467  		for _, k := range all(table) {
   468  			printf("\t%q: %s,\n", k, k)
   469  			if alias, ok := deprecatedAliases[k]; ok {
   470  				printf("\t%q: %s,\n", alias, k)
   471  			}
   472  		}
   473  		print("}\n\n")
   474  	}
   475  
   476  	decl := make(sort.StringSlice, len(list)+len(deprecatedAliases))
   477  	ndecl := 0
   478  	for _, name := range list {
   479  		if doProps {
   480  			decl[ndecl] = fmt.Sprintf(
   481  				"\t%s = _%s;\t// %s is the set of Unicode characters with property %s.\n",
   482  				name, name, name, name)
   483  		} else {
   484  			decl[ndecl] = fmt.Sprintf(
   485  				"\t%s = _%s;\t// %s is the set of Unicode characters in script %s.\n",
   486  				name, name, name, name)
   487  		}
   488  		ndecl++
   489  		if alias, ok := deprecatedAliases[name]; ok {
   490  			decl[ndecl] = fmt.Sprintf(
   491  				"\t%[1]s = _%[2]s;\t// %[1]s is an alias for %[2]s.\n",
   492  				alias, name)
   493  			ndecl++
   494  		}
   495  		printRangeTable("_"+name, table[name])
   496  	}
   497  	decl.Sort()
   498  	println("// These variables have type *RangeTable.")
   499  	println("var (")
   500  	for _, d := range decl {
   501  		print(d)
   502  	}
   503  	print(")\n\n")
   504  }
   505  
   506  const (
   507  	CaseUpper = 1 << iota
   508  	CaseLower
   509  	CaseTitle
   510  	CaseNone    = 0  // must be zero
   511  	CaseMissing = -1 // character not present; not a valid case state
   512  )
   513  
   514  type caseState struct {
   515  	point        rune
   516  	_case        int
   517  	deltaToUpper rune
   518  	deltaToLower rune
   519  	deltaToTitle rune
   520  }
   521  
   522  // Is d a continuation of the state of c?
   523  func (c *caseState) adjacent(d *caseState) bool {
   524  	if d.point < c.point {
   525  		c, d = d, c
   526  	}
   527  	switch {
   528  	case d.point != c.point+1: // code points not adjacent (shouldn't happen)
   529  		return false
   530  	case d._case != c._case: // different cases
   531  		return c.upperLowerAdjacent(d)
   532  	case c._case == CaseNone:
   533  		return false
   534  	case c._case == CaseMissing:
   535  		return false
   536  	case d.deltaToUpper != c.deltaToUpper:
   537  		return false
   538  	case d.deltaToLower != c.deltaToLower:
   539  		return false
   540  	case d.deltaToTitle != c.deltaToTitle:
   541  		return false
   542  	}
   543  	return true
   544  }
   545  
   546  // Is d the same as c, but opposite in upper/lower case? this would make it
   547  // an element of an UpperLower sequence.
   548  func (c *caseState) upperLowerAdjacent(d *caseState) bool {
   549  	// check they're a matched case pair.  we know they have adjacent values
   550  	switch {
   551  	case c._case == CaseUpper && d._case != CaseLower:
   552  		return false
   553  	case c._case == CaseLower && d._case != CaseUpper:
   554  		return false
   555  	}
   556  	// matched pair (at least in upper/lower).  make the order Upper Lower
   557  	if c._case == CaseLower {
   558  		c, d = d, c
   559  	}
   560  	// for an Upper Lower sequence the deltas have to be in order
   561  	//	c: 0 1 0
   562  	//	d: -1 0 -1
   563  	switch {
   564  	case c.deltaToUpper != 0:
   565  		return false
   566  	case c.deltaToLower != 1:
   567  		return false
   568  	case c.deltaToTitle != 0:
   569  		return false
   570  	case d.deltaToUpper != -1:
   571  		return false
   572  	case d.deltaToLower != 0:
   573  		return false
   574  	case d.deltaToTitle != -1:
   575  		return false
   576  	}
   577  	return true
   578  }
   579  
   580  // Does this character start an UpperLower sequence?
   581  func (c *caseState) isUpperLower() bool {
   582  	// for an Upper Lower sequence the deltas have to be in order
   583  	//	c: 0 1 0
   584  	switch {
   585  	case c.deltaToUpper != 0:
   586  		return false
   587  	case c.deltaToLower != 1:
   588  		return false
   589  	case c.deltaToTitle != 0:
   590  		return false
   591  	}
   592  	return true
   593  }
   594  
   595  // Does this character start a LowerUpper sequence?
   596  func (c *caseState) isLowerUpper() bool {
   597  	// for an Upper Lower sequence the deltas have to be in order
   598  	//	c: -1 0 -1
   599  	switch {
   600  	case c.deltaToUpper != -1:
   601  		return false
   602  	case c.deltaToLower != 0:
   603  		return false
   604  	case c.deltaToTitle != -1:
   605  		return false
   606  	}
   607  	return true
   608  }
   609  
   610  func getCaseState(i rune) (c *caseState) {
   611  	c = &caseState{point: i, _case: CaseNone}
   612  	ch := &chars[i]
   613  	switch ch.codePoint {
   614  	case 0:
   615  		c._case = CaseMissing // Will get NUL wrong but that doesn't matter
   616  		return
   617  	case ch.upperCase:
   618  		c._case = CaseUpper
   619  	case ch.lowerCase:
   620  		c._case = CaseLower
   621  	case ch.titleCase:
   622  		c._case = CaseTitle
   623  	}
   624  	// Some things such as roman numeral U+2161 don't describe themselves
   625  	// as upper case, but have a lower case. Second-guess them.
   626  	if c._case == CaseNone && ch.lowerCase != 0 {
   627  		c._case = CaseUpper
   628  	}
   629  	// Same in the other direction.
   630  	if c._case == CaseNone && ch.upperCase != 0 {
   631  		c._case = CaseLower
   632  	}
   633  
   634  	if ch.upperCase != 0 {
   635  		c.deltaToUpper = ch.upperCase - i
   636  	}
   637  	if ch.lowerCase != 0 {
   638  		c.deltaToLower = ch.lowerCase - i
   639  	}
   640  	if ch.titleCase != 0 {
   641  		c.deltaToTitle = ch.titleCase - i
   642  	}
   643  	return
   644  }
   645  
   646  func printCases() {
   647  	if *test {
   648  		fullCaseTest()
   649  		return
   650  	}
   651  	printf(
   652  		"// CaseRanges is the table describing case mappings for all letters with\n" +
   653  			"// non-self mappings.\n" +
   654  			"var CaseRanges = _CaseRanges\n" +
   655  			"var _CaseRanges = []CaseRange {\n")
   656  
   657  	var startState *caseState    // the start of a run; nil for not active
   658  	var prevState = &caseState{} // the state of the previous character
   659  	for i := range chars {
   660  		state := getCaseState(rune(i))
   661  		if state.adjacent(prevState) {
   662  			prevState = state
   663  			continue
   664  		}
   665  		// end of run (possibly)
   666  		printCaseRange(startState, prevState)
   667  		startState = nil
   668  		if state._case != CaseMissing && state._case != CaseNone {
   669  			startState = state
   670  		}
   671  		prevState = state
   672  	}
   673  	print("}\n")
   674  }
   675  
   676  func printCaseRange(lo, hi *caseState) {
   677  	if lo == nil {
   678  		return
   679  	}
   680  	if lo.deltaToUpper == 0 && lo.deltaToLower == 0 && lo.deltaToTitle == 0 {
   681  		// character represents itself in all cases - no need to mention it
   682  		return
   683  	}
   684  	switch {
   685  	case hi.point > lo.point && lo.isUpperLower():
   686  		printf("\t{0x%04X, 0x%04X, d{UpperLower, UpperLower, UpperLower}},\n",
   687  			lo.point, hi.point)
   688  	case hi.point > lo.point && lo.isLowerUpper():
   689  		logger.Fatalf("LowerUpper sequence: should not happen: %U.  If it's real, need to fix To()", lo.point)
   690  		printf("\t{0x%04X, 0x%04X, d{LowerUpper, LowerUpper, LowerUpper}},\n",
   691  			lo.point, hi.point)
   692  	default:
   693  		printf("\t{0x%04X, 0x%04X, d{%d, %d, %d}},\n",
   694  			lo.point, hi.point,
   695  			lo.deltaToUpper, lo.deltaToLower, lo.deltaToTitle)
   696  	}
   697  }
   698  
   699  // If the cased value in the Char is 0, it means use the rune itself.
   700  func caseIt(r, cased rune) rune {
   701  	if cased == 0 {
   702  		return r
   703  	}
   704  	return cased
   705  }
   706  
   707  func fullCaseTest() {
   708  	for j, c := range chars {
   709  		i := rune(j)
   710  		lower := unicode.ToLower(i)
   711  		want := caseIt(i, c.lowerCase)
   712  		if lower != want {
   713  			fmt.Fprintf(os.Stderr, "lower %U should be %U is %U\n", i, want, lower)
   714  		}
   715  		upper := unicode.ToUpper(i)
   716  		want = caseIt(i, c.upperCase)
   717  		if upper != want {
   718  			fmt.Fprintf(os.Stderr, "upper %U should be %U is %U\n", i, want, upper)
   719  		}
   720  		title := unicode.ToTitle(i)
   721  		want = caseIt(i, c.titleCase)
   722  		if title != want {
   723  			fmt.Fprintf(os.Stderr, "title %U should be %U is %U\n", i, want, title)
   724  		}
   725  	}
   726  }
   727  
   728  func printLatinProperties() {
   729  	if *test {
   730  		return
   731  	}
   732  	println("var properties = [MaxLatin1+1]uint8{")
   733  	for code := 0; code <= unicode.MaxLatin1; code++ {
   734  		var property string
   735  		switch chars[code].category {
   736  		case "Cc", "": // NUL has no category.
   737  			property = "pC"
   738  		case "Cf": // soft hyphen, unique category, not printable.
   739  			property = "0"
   740  		case "Ll":
   741  			property = "pLl | pp"
   742  		case "Lo":
   743  			property = "pLo | pp"
   744  		case "Lu":
   745  			property = "pLu | pp"
   746  		case "Nd", "No":
   747  			property = "pN | pp"
   748  		case "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps":
   749  			property = "pP | pp"
   750  		case "Sc", "Sk", "Sm", "So":
   751  			property = "pS | pp"
   752  		case "Zs":
   753  			property = "pZ"
   754  		default:
   755  			logger.Fatalf("%U has unknown category %q", code, chars[code].category)
   756  		}
   757  		// Special case
   758  		if code == ' ' {
   759  			property = "pZ | pp"
   760  		}
   761  		printf("\t0x%02X: %s, // %q\n", code, property, code)
   762  	}
   763  	printf("}\n\n")
   764  }
   765  
   766  func printCasefold() {
   767  	// Build list of case-folding groups attached to each canonical folded char (typically lower case).
   768  	var caseOrbit = make([][]rune, MaxChar+1)
   769  	for j := range chars {
   770  		i := rune(j)
   771  		c := &chars[i]
   772  		if c.foldCase == 0 {
   773  			continue
   774  		}
   775  		orb := caseOrbit[c.foldCase]
   776  		if orb == nil {
   777  			orb = append(orb, c.foldCase)
   778  		}
   779  		caseOrbit[c.foldCase] = append(orb, i)
   780  	}
   781  
   782  	// Insert explicit 1-element groups when assuming [lower, upper] would be wrong.
   783  	for j := range chars {
   784  		i := rune(j)
   785  		c := &chars[i]
   786  		f := c.foldCase
   787  		if f == 0 {
   788  			f = i
   789  		}
   790  		orb := caseOrbit[f]
   791  		if orb == nil && (c.upperCase != 0 && c.upperCase != i || c.lowerCase != 0 && c.lowerCase != i) {
   792  			// Default assumption of [upper, lower] is wrong.
   793  			caseOrbit[i] = []rune{i}
   794  		}
   795  	}
   796  
   797  	// Delete the groups for which assuming [lower, upper] or [upper, lower] is right.
   798  	for i, orb := range caseOrbit {
   799  		if len(orb) == 2 && chars[orb[0]].upperCase == orb[1] && chars[orb[1]].lowerCase == orb[0] {
   800  			caseOrbit[i] = nil
   801  		}
   802  		if len(orb) == 2 && chars[orb[1]].upperCase == orb[0] && chars[orb[0]].lowerCase == orb[1] {
   803  			caseOrbit[i] = nil
   804  		}
   805  	}
   806  
   807  	// Record orbit information in chars.
   808  	for _, orb := range caseOrbit {
   809  		if orb == nil {
   810  			continue
   811  		}
   812  		sort.Slice(orb, func(i, j int) bool {
   813  			return orb[i] < orb[j]
   814  		})
   815  		c := orb[len(orb)-1]
   816  		for _, d := range orb {
   817  			chars[c].caseOrbit = d
   818  			c = d
   819  		}
   820  	}
   821  
   822  	printAsciiFold()
   823  	printCaseOrbit()
   824  
   825  	// Tables of category and script folding exceptions: code points
   826  	// that must be added when interpreting a particular category/script
   827  	// in a case-folding context.
   828  	cat := make(map[string]map[rune]bool)
   829  	for name := range category {
   830  		if x := foldExceptions(inCategory(name)); len(x) > 0 {
   831  			cat[name] = x
   832  		}
   833  	}
   834  
   835  	scr := make(map[string]map[rune]bool)
   836  	for name := range scripts {
   837  		if x := foldExceptions(scripts[name]); len(x) > 0 {
   838  			scr[name] = x
   839  		}
   840  	}
   841  
   842  	printCatFold("FoldCategory", cat)
   843  	printCatFold("FoldScript", scr)
   844  }
   845  
   846  // inCategory returns a list of all the runes in the category.
   847  func inCategory(name string) []rune {
   848  	var x []rune
   849  	for j := range chars {
   850  		i := rune(j)
   851  		c := &chars[i]
   852  		if c.category == name || len(name) == 1 && len(c.category) > 1 && c.category[0] == name[0] {
   853  			x = append(x, i)
   854  		}
   855  	}
   856  	return x
   857  }
   858  
   859  // foldExceptions returns a list of all the runes fold-equivalent
   860  // to runes in class but not in class themselves.
   861  func foldExceptions(class []rune) map[rune]bool {
   862  	// Create map containing class and all fold-equivalent chars.
   863  	m := make(map[rune]bool)
   864  	for _, r := range class {
   865  		c := &chars[r]
   866  		if c.caseOrbit == 0 {
   867  			// Just upper and lower.
   868  			if u := c.upperCase; u != 0 {
   869  				m[u] = true
   870  			}
   871  			if l := c.lowerCase; l != 0 {
   872  				m[l] = true
   873  			}
   874  			m[r] = true
   875  			continue
   876  		}
   877  		// Otherwise walk orbit.
   878  		r0 := r
   879  		for {
   880  			m[r] = true
   881  			r = chars[r].caseOrbit
   882  			if r == r0 {
   883  				break
   884  			}
   885  		}
   886  	}
   887  
   888  	// Remove class itself.
   889  	for _, r := range class {
   890  		delete(m, r)
   891  	}
   892  
   893  	// What's left is the exceptions.
   894  	return m
   895  }
   896  
   897  var comment = map[string]string{
   898  	"FoldCategory": "// FoldCategory maps a category name to a table of\n" +
   899  		"// code points outside the category that are equivalent under\n" +
   900  		"// simple case folding to code points inside the category.\n" +
   901  		"// If there is no entry for a category name, there are no such points.\n",
   902  
   903  	"FoldScript": "// FoldScript maps a script name to a table of\n" +
   904  		"// code points outside the script that are equivalent under\n" +
   905  		"// simple case folding to code points inside the script.\n" +
   906  		"// If there is no entry for a script name, there are no such points.\n",
   907  }
   908  
   909  func printAsciiFold() {
   910  	printf("var asciiFold = [MaxASCII + 1]uint16{\n")
   911  	for i := rune(0); i <= unicode.MaxASCII; i++ {
   912  		c := chars[i]
   913  		f := c.caseOrbit
   914  		if f == 0 {
   915  			if c.lowerCase != i && c.lowerCase != 0 {
   916  				f = c.lowerCase
   917  			} else if c.upperCase != i && c.upperCase != 0 {
   918  				f = c.upperCase
   919  			} else {
   920  				f = i
   921  			}
   922  		}
   923  		printf("\t0x%04X,\n", f)
   924  	}
   925  	printf("}\n\n")
   926  }
   927  
   928  func printCaseOrbit() {
   929  	if *test {
   930  		for j := range chars {
   931  			i := rune(j)
   932  			c := &chars[i]
   933  			f := c.caseOrbit
   934  			if f == 0 {
   935  				if c.lowerCase != i && c.lowerCase != 0 {
   936  					f = c.lowerCase
   937  				} else if c.upperCase != i && c.upperCase != 0 {
   938  					f = c.upperCase
   939  				} else {
   940  					f = i
   941  				}
   942  			}
   943  			if g := unicode.SimpleFold(i); g != f {
   944  				fmt.Fprintf(os.Stderr, "unicode.SimpleFold(%#U) = %#U, want %#U\n", i, g, f)
   945  			}
   946  		}
   947  		return
   948  	}
   949  
   950  	printf("var caseOrbit = []foldPair{\n")
   951  	for i := range chars {
   952  		c := &chars[i]
   953  		if c.caseOrbit != 0 {
   954  			printf("\t{0x%04X, 0x%04X},\n", i, c.caseOrbit)
   955  			foldPairCount++
   956  		}
   957  	}
   958  	printf("}\n\n")
   959  }
   960  
   961  func printCatFold(name string, m map[string]map[rune]bool) {
   962  	if *test {
   963  		var pkgMap map[string]*unicode.RangeTable
   964  		if name == "FoldCategory" {
   965  			pkgMap = unicode.FoldCategory
   966  		} else {
   967  			pkgMap = unicode.FoldScript
   968  		}
   969  		if len(pkgMap) != len(m) {
   970  			fmt.Fprintf(os.Stderr, "unicode.%s has %d elements, want %d\n", name, len(pkgMap), len(m))
   971  			return
   972  		}
   973  		for k, v := range m {
   974  			t, ok := pkgMap[k]
   975  			if !ok {
   976  				fmt.Fprintf(os.Stderr, "unicode.%s[%q] missing\n", name, k)
   977  				continue
   978  			}
   979  			n := 0
   980  			for _, r := range t.R16 {
   981  				for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) {
   982  					if !v[c] {
   983  						fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c)
   984  					}
   985  					n++
   986  				}
   987  			}
   988  			for _, r := range t.R32 {
   989  				for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) {
   990  					if !v[c] {
   991  						fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c)
   992  					}
   993  					n++
   994  				}
   995  			}
   996  			if n != len(v) {
   997  				fmt.Fprintf(os.Stderr, "unicode.%s[%q] has %d code points, want %d\n", name, k, n, len(v))
   998  			}
   999  		}
  1000  		return
  1001  	}
  1002  
  1003  	print(comment[name])
  1004  	printf("var %s = map[string]*RangeTable{\n", name)
  1005  	for _, name := range allCatFold(m) {
  1006  		printf("\t%q: fold%s,\n", name, name)
  1007  	}
  1008  	printf("}\n\n")
  1009  	for _, name := range allCatFold(m) {
  1010  		class := m[name]
  1011  		dumpRange("fold"+name, func(code rune) bool { return class[code] })
  1012  	}
  1013  }
  1014  
  1015  var range16Count = 0  // Number of entries in the 16-bit range tables.
  1016  var range32Count = 0  // Number of entries in the 32-bit range tables.
  1017  var foldPairCount = 0 // Number of fold pairs in the exception tables.
  1018  
  1019  func printSizes() {
  1020  	if *test {
  1021  		return
  1022  	}
  1023  	println()
  1024  	printf("// Range entries: %d 16-bit, %d 32-bit, %d total.\n", range16Count, range32Count, range16Count+range32Count)
  1025  	range16Bytes := range16Count * 3 * 2
  1026  	range32Bytes := range32Count * 3 * 4
  1027  	printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", range16Bytes, range32Bytes, range16Bytes+range32Bytes)
  1028  	println()
  1029  	printf("// Fold orbit bytes: %d pairs, %d bytes\n", foldPairCount, foldPairCount*2*2)
  1030  }