github.com/go-xe2/third@v1.0.3/golang.org/x/text/unicode/norm/maketables.go (about)

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build ignore
     6  
     7  // Normalization table generator.
     8  // Data read from the web.
     9  // See forminfo.go for a description of the trie values associated with each rune.
    10  
    11  package main
    12  
    13  import (
    14  	"bytes"
    15  	"flag"
    16  	"fmt"
    17  	"io"
    18  	"log"
    19  	"sort"
    20  	"strconv"
    21  	"strings"
    22  
    23  	"github.com/go-xe2/third/golang.org/x/text/internal/gen"
    24  	"github.com/go-xe2/third/golang.org/x/text/internal/triegen"
    25  	"github.com/go-xe2/third/golang.org/x/text/internal/ucd"
    26  )
    27  
    28  func main() {
    29  	gen.Init()
    30  	loadUnicodeData()
    31  	compactCCC()
    32  	loadCompositionExclusions()
    33  	completeCharFields(FCanonical)
    34  	completeCharFields(FCompatibility)
    35  	computeNonStarterCounts()
    36  	verifyComputed()
    37  	printChars()
    38  	testDerived()
    39  	printTestdata()
    40  	makeTables()
    41  }
    42  
    43  var (
    44  	tablelist = flag.String("tables",
    45  		"all",
    46  		"comma-separated list of which tables to generate; "+
    47  			"can be 'decomp', 'recomp', 'info' and 'all'")
    48  	test = flag.Bool("test",
    49  		false,
    50  		"test existing tables against DerivedNormalizationProps and generate test data for regression testing")
    51  	verbose = flag.Bool("verbose",
    52  		false,
    53  		"write data to stdout as it is parsed")
    54  )
    55  
    56  const MaxChar = 0x10FFFF // anything above this shouldn't exist
    57  
    58  // Quick Check properties of runes allow us to quickly
    59  // determine whether a rune may occur in a normal form.
    60  // For a given normal form, a rune may be guaranteed to occur
    61  // verbatim (QC=Yes), may or may not combine with another
    62  // rune (QC=Maybe), or may not occur (QC=No).
    63  type QCResult int
    64  
    65  const (
    66  	QCUnknown QCResult = iota
    67  	QCYes
    68  	QCNo
    69  	QCMaybe
    70  )
    71  
    72  func (r QCResult) String() string {
    73  	switch r {
    74  	case QCYes:
    75  		return "Yes"
    76  	case QCNo:
    77  		return "No"
    78  	case QCMaybe:
    79  		return "Maybe"
    80  	}
    81  	return "***UNKNOWN***"
    82  }
    83  
    84  const (
    85  	FCanonical     = iota // NFC or NFD
    86  	FCompatibility        // NFKC or NFKD
    87  	FNumberOfFormTypes
    88  )
    89  
    90  const (
    91  	MComposed   = iota // NFC or NFKC
    92  	MDecomposed        // NFD or NFKD
    93  	MNumberOfModes
    94  )
    95  
    96  // This contains only the properties we're interested in.
    97  type Char struct {
    98  	name          string
    99  	codePoint     rune  // if zero, this index is not a valid code point.
   100  	ccc           uint8 // canonical combining class
   101  	origCCC       uint8
   102  	excludeInComp bool // from CompositionExclusions.txt
   103  	compatDecomp  bool // it has a compatibility expansion
   104  
   105  	nTrailingNonStarters uint8
   106  	nLeadingNonStarters  uint8 // must be equal to trailing if non-zero
   107  
   108  	forms [FNumberOfFormTypes]FormInfo // For FCanonical and FCompatibility
   109  
   110  	state State
   111  }
   112  
   113  var chars = make([]Char, MaxChar+1)
   114  var cccMap = make(map[uint8]uint8)
   115  
   116  func (c Char) String() string {
   117  	buf := new(bytes.Buffer)
   118  
   119  	fmt.Fprintf(buf, "%U [%s]:\n", c.codePoint, c.name)
   120  	fmt.Fprintf(buf, "  ccc: %v\n", c.ccc)
   121  	fmt.Fprintf(buf, "  excludeInComp: %v\n", c.excludeInComp)
   122  	fmt.Fprintf(buf, "  compatDecomp: %v\n", c.compatDecomp)
   123  	fmt.Fprintf(buf, "  state: %v\n", c.state)
   124  	fmt.Fprintf(buf, "  NFC:\n")
   125  	fmt.Fprint(buf, c.forms[FCanonical])
   126  	fmt.Fprintf(buf, "  NFKC:\n")
   127  	fmt.Fprint(buf, c.forms[FCompatibility])
   128  
   129  	return buf.String()
   130  }
   131  
   132  // In UnicodeData.txt, some ranges are marked like this:
   133  //	3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
   134  //	4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
   135  // parseCharacter keeps a state variable indicating the weirdness.
   136  type State int
   137  
   138  const (
   139  	SNormal State = iota // known to be zero for the type
   140  	SFirst
   141  	SLast
   142  	SMissing
   143  )
   144  
   145  var lastChar = rune('\u0000')
   146  
   147  func (c Char) isValid() bool {
   148  	return c.codePoint != 0 && c.state != SMissing
   149  }
   150  
   151  type FormInfo struct {
   152  	quickCheck [MNumberOfModes]QCResult // index: MComposed or MDecomposed
   153  	verified   [MNumberOfModes]bool     // index: MComposed or MDecomposed
   154  
   155  	combinesForward  bool // May combine with rune on the right
   156  	combinesBackward bool // May combine with rune on the left
   157  	isOneWay         bool // Never appears in result
   158  	inDecomp         bool // Some decompositions result in this char.
   159  	decomp           Decomposition
   160  	expandedDecomp   Decomposition
   161  }
   162  
   163  func (f FormInfo) String() string {
   164  	buf := bytes.NewBuffer(make([]byte, 0))
   165  
   166  	fmt.Fprintf(buf, "    quickCheck[C]: %v\n", f.quickCheck[MComposed])
   167  	fmt.Fprintf(buf, "    quickCheck[D]: %v\n", f.quickCheck[MDecomposed])
   168  	fmt.Fprintf(buf, "    cmbForward: %v\n", f.combinesForward)
   169  	fmt.Fprintf(buf, "    cmbBackward: %v\n", f.combinesBackward)
   170  	fmt.Fprintf(buf, "    isOneWay: %v\n", f.isOneWay)
   171  	fmt.Fprintf(buf, "    inDecomp: %v\n", f.inDecomp)
   172  	fmt.Fprintf(buf, "    decomposition: %X\n", f.decomp)
   173  	fmt.Fprintf(buf, "    expandedDecomp: %X\n", f.expandedDecomp)
   174  
   175  	return buf.String()
   176  }
   177  
   178  type Decomposition []rune
   179  
   180  func parseDecomposition(s string, skipfirst bool) (a []rune, err error) {
   181  	decomp := strings.Split(s, " ")
   182  	if len(decomp) > 0 && skipfirst {
   183  		decomp = decomp[1:]
   184  	}
   185  	for _, d := range decomp {
   186  		point, err := strconv.ParseUint(d, 16, 64)
   187  		if err != nil {
   188  			return a, err
   189  		}
   190  		a = append(a, rune(point))
   191  	}
   192  	return a, nil
   193  }
   194  
   195  func loadUnicodeData() {
   196  	f := gen.OpenUCDFile("UnicodeData.txt")
   197  	defer f.Close()
   198  	p := ucd.New(f)
   199  	for p.Next() {
   200  		r := p.Rune(ucd.CodePoint)
   201  		char := &chars[r]
   202  
   203  		char.ccc = uint8(p.Uint(ucd.CanonicalCombiningClass))
   204  		decmap := p.String(ucd.DecompMapping)
   205  
   206  		exp, err := parseDecomposition(decmap, false)
   207  		isCompat := false
   208  		if err != nil {
   209  			if len(decmap) > 0 {
   210  				exp, err = parseDecomposition(decmap, true)
   211  				if err != nil {
   212  					log.Fatalf(`%U: bad decomp |%v|: "%s"`, r, decmap, err)
   213  				}
   214  				isCompat = true
   215  			}
   216  		}
   217  
   218  		char.name = p.String(ucd.Name)
   219  		char.codePoint = r
   220  		char.forms[FCompatibility].decomp = exp
   221  		if !isCompat {
   222  			char.forms[FCanonical].decomp = exp
   223  		} else {
   224  			char.compatDecomp = true
   225  		}
   226  		if len(decmap) > 0 {
   227  			char.forms[FCompatibility].decomp = exp
   228  		}
   229  	}
   230  	if err := p.Err(); err != nil {
   231  		log.Fatal(err)
   232  	}
   233  }
   234  
   235  // compactCCC converts the sparse set of CCC values to a continguous one,
   236  // reducing the number of bits needed from 8 to 6.
   237  func compactCCC() {
   238  	m := make(map[uint8]uint8)
   239  	for i := range chars {
   240  		c := &chars[i]
   241  		m[c.ccc] = 0
   242  	}
   243  	cccs := []int{}
   244  	for v, _ := range m {
   245  		cccs = append(cccs, int(v))
   246  	}
   247  	sort.Ints(cccs)
   248  	for i, c := range cccs {
   249  		cccMap[uint8(i)] = uint8(c)
   250  		m[uint8(c)] = uint8(i)
   251  	}
   252  	for i := range chars {
   253  		c := &chars[i]
   254  		c.origCCC = c.ccc
   255  		c.ccc = m[c.ccc]
   256  	}
   257  	if len(m) >= 1<<6 {
   258  		log.Fatalf("too many difference CCC values: %d >= 64", len(m))
   259  	}
   260  }
   261  
   262  // CompositionExclusions.txt has form:
   263  // 0958    # ...
   264  // See http://unicode.org/reports/tr44/ for full explanation
   265  func loadCompositionExclusions() {
   266  	f := gen.OpenUCDFile("CompositionExclusions.txt")
   267  	defer f.Close()
   268  	p := ucd.New(f)
   269  	for p.Next() {
   270  		c := &chars[p.Rune(0)]
   271  		if c.excludeInComp {
   272  			log.Fatalf("%U: Duplicate entry in exclusions.", c.codePoint)
   273  		}
   274  		c.excludeInComp = true
   275  	}
   276  	if e := p.Err(); e != nil {
   277  		log.Fatal(e)
   278  	}
   279  }
   280  
   281  // hasCompatDecomp returns true if any of the recursive
   282  // decompositions contains a compatibility expansion.
   283  // In this case, the character may not occur in NFK*.
   284  func hasCompatDecomp(r rune) bool {
   285  	c := &chars[r]
   286  	if c.compatDecomp {
   287  		return true
   288  	}
   289  	for _, d := range c.forms[FCompatibility].decomp {
   290  		if hasCompatDecomp(d) {
   291  			return true
   292  		}
   293  	}
   294  	return false
   295  }
   296  
   297  // Hangul related constants.
   298  const (
   299  	HangulBase = 0xAC00
   300  	HangulEnd  = 0xD7A4 // hangulBase + Jamo combinations (19 * 21 * 28)
   301  
   302  	JamoLBase = 0x1100
   303  	JamoLEnd  = 0x1113
   304  	JamoVBase = 0x1161
   305  	JamoVEnd  = 0x1176
   306  	JamoTBase = 0x11A8
   307  	JamoTEnd  = 0x11C3
   308  
   309  	JamoLVTCount = 19 * 21 * 28
   310  	JamoTCount   = 28
   311  )
   312  
   313  func isHangul(r rune) bool {
   314  	return HangulBase <= r && r < HangulEnd
   315  }
   316  
   317  func isHangulWithoutJamoT(r rune) bool {
   318  	if !isHangul(r) {
   319  		return false
   320  	}
   321  	r -= HangulBase
   322  	return r < JamoLVTCount && r%JamoTCount == 0
   323  }
   324  
   325  func ccc(r rune) uint8 {
   326  	return chars[r].ccc
   327  }
   328  
   329  // Insert a rune in a buffer, ordered by Canonical Combining Class.
   330  func insertOrdered(b Decomposition, r rune) Decomposition {
   331  	n := len(b)
   332  	b = append(b, 0)
   333  	cc := ccc(r)
   334  	if cc > 0 {
   335  		// Use bubble sort.
   336  		for ; n > 0; n-- {
   337  			if ccc(b[n-1]) <= cc {
   338  				break
   339  			}
   340  			b[n] = b[n-1]
   341  		}
   342  	}
   343  	b[n] = r
   344  	return b
   345  }
   346  
   347  // Recursively decompose.
   348  func decomposeRecursive(form int, r rune, d Decomposition) Decomposition {
   349  	dcomp := chars[r].forms[form].decomp
   350  	if len(dcomp) == 0 {
   351  		return insertOrdered(d, r)
   352  	}
   353  	for _, c := range dcomp {
   354  		d = decomposeRecursive(form, c, d)
   355  	}
   356  	return d
   357  }
   358  
   359  func completeCharFields(form int) {
   360  	// Phase 0: pre-expand decomposition.
   361  	for i := range chars {
   362  		f := &chars[i].forms[form]
   363  		if len(f.decomp) == 0 {
   364  			continue
   365  		}
   366  		exp := make(Decomposition, 0)
   367  		for _, c := range f.decomp {
   368  			exp = decomposeRecursive(form, c, exp)
   369  		}
   370  		f.expandedDecomp = exp
   371  	}
   372  
   373  	// Phase 1: composition exclusion, mark decomposition.
   374  	for i := range chars {
   375  		c := &chars[i]
   376  		f := &c.forms[form]
   377  
   378  		// Marks script-specific exclusions and version restricted.
   379  		f.isOneWay = c.excludeInComp
   380  
   381  		// Singletons
   382  		f.isOneWay = f.isOneWay || len(f.decomp) == 1
   383  
   384  		// Non-starter decompositions
   385  		if len(f.decomp) > 1 {
   386  			chk := c.ccc != 0 || chars[f.decomp[0]].ccc != 0
   387  			f.isOneWay = f.isOneWay || chk
   388  		}
   389  
   390  		// Runes that decompose into more than two runes.
   391  		f.isOneWay = f.isOneWay || len(f.decomp) > 2
   392  
   393  		if form == FCompatibility {
   394  			f.isOneWay = f.isOneWay || hasCompatDecomp(c.codePoint)
   395  		}
   396  
   397  		for _, r := range f.decomp {
   398  			chars[r].forms[form].inDecomp = true
   399  		}
   400  	}
   401  
   402  	// Phase 2: forward and backward combining.
   403  	for i := range chars {
   404  		c := &chars[i]
   405  		f := &c.forms[form]
   406  
   407  		if !f.isOneWay && len(f.decomp) == 2 {
   408  			f0 := &chars[f.decomp[0]].forms[form]
   409  			f1 := &chars[f.decomp[1]].forms[form]
   410  			if !f0.isOneWay {
   411  				f0.combinesForward = true
   412  			}
   413  			if !f1.isOneWay {
   414  				f1.combinesBackward = true
   415  			}
   416  		}
   417  		if isHangulWithoutJamoT(rune(i)) {
   418  			f.combinesForward = true
   419  		}
   420  	}
   421  
   422  	// Phase 3: quick check values.
   423  	for i := range chars {
   424  		c := &chars[i]
   425  		f := &c.forms[form]
   426  
   427  		switch {
   428  		case len(f.decomp) > 0:
   429  			f.quickCheck[MDecomposed] = QCNo
   430  		case isHangul(rune(i)):
   431  			f.quickCheck[MDecomposed] = QCNo
   432  		default:
   433  			f.quickCheck[MDecomposed] = QCYes
   434  		}
   435  		switch {
   436  		case f.isOneWay:
   437  			f.quickCheck[MComposed] = QCNo
   438  		case (i & 0xffff00) == JamoLBase:
   439  			f.quickCheck[MComposed] = QCYes
   440  			if JamoLBase <= i && i < JamoLEnd {
   441  				f.combinesForward = true
   442  			}
   443  			if JamoVBase <= i && i < JamoVEnd {
   444  				f.quickCheck[MComposed] = QCMaybe
   445  				f.combinesBackward = true
   446  				f.combinesForward = true
   447  			}
   448  			if JamoTBase <= i && i < JamoTEnd {
   449  				f.quickCheck[MComposed] = QCMaybe
   450  				f.combinesBackward = true
   451  			}
   452  		case !f.combinesBackward:
   453  			f.quickCheck[MComposed] = QCYes
   454  		default:
   455  			f.quickCheck[MComposed] = QCMaybe
   456  		}
   457  	}
   458  }
   459  
   460  func computeNonStarterCounts() {
   461  	// Phase 4: leading and trailing non-starter count
   462  	for i := range chars {
   463  		c := &chars[i]
   464  
   465  		runes := []rune{rune(i)}
   466  		// We always use FCompatibility so that the CGJ insertion points do not
   467  		// change for repeated normalizations with different forms.
   468  		if exp := c.forms[FCompatibility].expandedDecomp; len(exp) > 0 {
   469  			runes = exp
   470  		}
   471  		// We consider runes that combine backwards to be non-starters for the
   472  		// purpose of Stream-Safe Text Processing.
   473  		for _, r := range runes {
   474  			if cr := &chars[r]; cr.ccc == 0 && !cr.forms[FCompatibility].combinesBackward {
   475  				break
   476  			}
   477  			c.nLeadingNonStarters++
   478  		}
   479  		for i := len(runes) - 1; i >= 0; i-- {
   480  			if cr := &chars[runes[i]]; cr.ccc == 0 && !cr.forms[FCompatibility].combinesBackward {
   481  				break
   482  			}
   483  			c.nTrailingNonStarters++
   484  		}
   485  		if c.nTrailingNonStarters > 3 {
   486  			log.Fatalf("%U: Decomposition with more than 3 (%d) trailing modifiers (%U)", i, c.nTrailingNonStarters, runes)
   487  		}
   488  
   489  		if isHangul(rune(i)) {
   490  			c.nTrailingNonStarters = 2
   491  			if isHangulWithoutJamoT(rune(i)) {
   492  				c.nTrailingNonStarters = 1
   493  			}
   494  		}
   495  
   496  		if l, t := c.nLeadingNonStarters, c.nTrailingNonStarters; l > 0 && l != t {
   497  			log.Fatalf("%U: number of leading and trailing non-starters should be equal (%d vs %d)", i, l, t)
   498  		}
   499  		if t := c.nTrailingNonStarters; t > 3 {
   500  			log.Fatalf("%U: number of trailing non-starters is %d > 3", t)
   501  		}
   502  	}
   503  }
   504  
   505  func printBytes(w io.Writer, b []byte, name string) {
   506  	fmt.Fprintf(w, "// %s: %d bytes\n", name, len(b))
   507  	fmt.Fprintf(w, "var %s = [...]byte {", name)
   508  	for i, c := range b {
   509  		switch {
   510  		case i%64 == 0:
   511  			fmt.Fprintf(w, "\n// Bytes %x - %x\n", i, i+63)
   512  		case i%8 == 0:
   513  			fmt.Fprintf(w, "\n")
   514  		}
   515  		fmt.Fprintf(w, "0x%.2X, ", c)
   516  	}
   517  	fmt.Fprint(w, "\n}\n\n")
   518  }
   519  
   520  // See forminfo.go for format.
   521  func makeEntry(f *FormInfo, c *Char) uint16 {
   522  	e := uint16(0)
   523  	if r := c.codePoint; HangulBase <= r && r < HangulEnd {
   524  		e |= 0x40
   525  	}
   526  	if f.combinesForward {
   527  		e |= 0x20
   528  	}
   529  	if f.quickCheck[MDecomposed] == QCNo {
   530  		e |= 0x4
   531  	}
   532  	switch f.quickCheck[MComposed] {
   533  	case QCYes:
   534  	case QCNo:
   535  		e |= 0x10
   536  	case QCMaybe:
   537  		e |= 0x18
   538  	default:
   539  		log.Fatalf("Illegal quickcheck value %v.", f.quickCheck[MComposed])
   540  	}
   541  	e |= uint16(c.nTrailingNonStarters)
   542  	return e
   543  }
   544  
   545  // decompSet keeps track of unique decompositions, grouped by whether
   546  // the decomposition is followed by a trailing and/or leading CCC.
   547  type decompSet [7]map[string]bool
   548  
   549  const (
   550  	normalDecomp = iota
   551  	firstMulti
   552  	firstCCC
   553  	endMulti
   554  	firstLeadingCCC
   555  	firstCCCZeroExcept
   556  	firstStarterWithNLead
   557  	lastDecomp
   558  )
   559  
   560  var cname = []string{"firstMulti", "firstCCC", "endMulti", "firstLeadingCCC", "firstCCCZeroExcept", "firstStarterWithNLead", "lastDecomp"}
   561  
   562  func makeDecompSet() decompSet {
   563  	m := decompSet{}
   564  	for i := range m {
   565  		m[i] = make(map[string]bool)
   566  	}
   567  	return m
   568  }
   569  func (m *decompSet) insert(key int, s string) {
   570  	m[key][s] = true
   571  }
   572  
   573  func printCharInfoTables(w io.Writer) int {
   574  	mkstr := func(r rune, f *FormInfo) (int, string) {
   575  		d := f.expandedDecomp
   576  		s := string([]rune(d))
   577  		if max := 1 << 6; len(s) >= max {
   578  			const msg = "%U: too many bytes in decomposition: %d >= %d"
   579  			log.Fatalf(msg, r, len(s), max)
   580  		}
   581  		head := uint8(len(s))
   582  		if f.quickCheck[MComposed] != QCYes {
   583  			head |= 0x40
   584  		}
   585  		if f.combinesForward {
   586  			head |= 0x80
   587  		}
   588  		s = string([]byte{head}) + s
   589  
   590  		lccc := ccc(d[0])
   591  		tccc := ccc(d[len(d)-1])
   592  		cc := ccc(r)
   593  		if cc != 0 && lccc == 0 && tccc == 0 {
   594  			log.Fatalf("%U: trailing and leading ccc are 0 for non-zero ccc %d", r, cc)
   595  		}
   596  		if tccc < lccc && lccc != 0 {
   597  			const msg = "%U: lccc (%d) must be <= tcc (%d)"
   598  			log.Fatalf(msg, r, lccc, tccc)
   599  		}
   600  		index := normalDecomp
   601  		nTrail := chars[r].nTrailingNonStarters
   602  		nLead := chars[r].nLeadingNonStarters
   603  		if tccc > 0 || lccc > 0 || nTrail > 0 {
   604  			tccc <<= 2
   605  			tccc |= nTrail
   606  			s += string([]byte{tccc})
   607  			index = endMulti
   608  			for _, r := range d[1:] {
   609  				if ccc(r) == 0 {
   610  					index = firstCCC
   611  				}
   612  			}
   613  			if lccc > 0 || nLead > 0 {
   614  				s += string([]byte{lccc})
   615  				if index == firstCCC {
   616  					log.Fatalf("%U: multi-segment decomposition not supported for decompositions with leading CCC != 0", r)
   617  				}
   618  				index = firstLeadingCCC
   619  			}
   620  			if cc != lccc {
   621  				if cc != 0 {
   622  					log.Fatalf("%U: for lccc != ccc, expected ccc to be 0; was %d", r, cc)
   623  				}
   624  				index = firstCCCZeroExcept
   625  			}
   626  		} else if len(d) > 1 {
   627  			index = firstMulti
   628  		}
   629  		return index, s
   630  	}
   631  
   632  	decompSet := makeDecompSet()
   633  	const nLeadStr = "\x00\x01" // 0-byte length and tccc with nTrail.
   634  	decompSet.insert(firstStarterWithNLead, nLeadStr)
   635  
   636  	// Store the uniqued decompositions in a byte buffer,
   637  	// preceded by their byte length.
   638  	for _, c := range chars {
   639  		for _, f := range c.forms {
   640  			if len(f.expandedDecomp) == 0 {
   641  				continue
   642  			}
   643  			if f.combinesBackward {
   644  				log.Fatalf("%U: combinesBackward and decompose", c.codePoint)
   645  			}
   646  			index, s := mkstr(c.codePoint, &f)
   647  			decompSet.insert(index, s)
   648  		}
   649  	}
   650  
   651  	decompositions := bytes.NewBuffer(make([]byte, 0, 10000))
   652  	size := 0
   653  	positionMap := make(map[string]uint16)
   654  	decompositions.WriteString("\000")
   655  	fmt.Fprintln(w, "const (")
   656  	for i, m := range decompSet {
   657  		sa := []string{}
   658  		for s := range m {
   659  			sa = append(sa, s)
   660  		}
   661  		sort.Strings(sa)
   662  		for _, s := range sa {
   663  			p := decompositions.Len()
   664  			decompositions.WriteString(s)
   665  			positionMap[s] = uint16(p)
   666  		}
   667  		if cname[i] != "" {
   668  			fmt.Fprintf(w, "%s = 0x%X\n", cname[i], decompositions.Len())
   669  		}
   670  	}
   671  	fmt.Fprintln(w, "maxDecomp = 0x8000")
   672  	fmt.Fprintln(w, ")")
   673  	b := decompositions.Bytes()
   674  	printBytes(w, b, "decomps")
   675  	size += len(b)
   676  
   677  	varnames := []string{"nfc", "nfkc"}
   678  	for i := 0; i < FNumberOfFormTypes; i++ {
   679  		trie := triegen.NewTrie(varnames[i])
   680  
   681  		for r, c := range chars {
   682  			f := c.forms[i]
   683  			d := f.expandedDecomp
   684  			if len(d) != 0 {
   685  				_, key := mkstr(c.codePoint, &f)
   686  				trie.Insert(rune(r), uint64(positionMap[key]))
   687  				if c.ccc != ccc(d[0]) {
   688  					// We assume the lead ccc of a decomposition !=0 in this case.
   689  					if ccc(d[0]) == 0 {
   690  						log.Fatalf("Expected leading CCC to be non-zero; ccc is %d", c.ccc)
   691  					}
   692  				}
   693  			} else if c.nLeadingNonStarters > 0 && len(f.expandedDecomp) == 0 && c.ccc == 0 && !f.combinesBackward {
   694  				// Handle cases where it can't be detected that the nLead should be equal
   695  				// to nTrail.
   696  				trie.Insert(c.codePoint, uint64(positionMap[nLeadStr]))
   697  			} else if v := makeEntry(&f, &c)<<8 | uint16(c.ccc); v != 0 {
   698  				trie.Insert(c.codePoint, uint64(0x8000|v))
   699  			}
   700  		}
   701  		sz, err := trie.Gen(w, triegen.Compact(&normCompacter{name: varnames[i]}))
   702  		if err != nil {
   703  			log.Fatal(err)
   704  		}
   705  		size += sz
   706  	}
   707  	return size
   708  }
   709  
   710  func contains(sa []string, s string) bool {
   711  	for _, a := range sa {
   712  		if a == s {
   713  			return true
   714  		}
   715  	}
   716  	return false
   717  }
   718  
   719  func makeTables() {
   720  	w := &bytes.Buffer{}
   721  
   722  	size := 0
   723  	if *tablelist == "" {
   724  		return
   725  	}
   726  	list := strings.Split(*tablelist, ",")
   727  	if *tablelist == "all" {
   728  		list = []string{"recomp", "info"}
   729  	}
   730  
   731  	// Compute maximum decomposition size.
   732  	max := 0
   733  	for _, c := range chars {
   734  		if n := len(string(c.forms[FCompatibility].expandedDecomp)); n > max {
   735  			max = n
   736  		}
   737  	}
   738  
   739  	fmt.Fprintln(w, "const (")
   740  	fmt.Fprintln(w, "\t// Version is the Unicode edition from which the tables are derived.")
   741  	fmt.Fprintf(w, "\tVersion = %q\n", gen.UnicodeVersion())
   742  	fmt.Fprintln(w)
   743  	fmt.Fprintln(w, "\t// MaxTransformChunkSize indicates the maximum number of bytes that Transform")
   744  	fmt.Fprintln(w, "\t// may need to write atomically for any Form. Making a destination buffer at")
   745  	fmt.Fprintln(w, "\t// least this size ensures that Transform can always make progress and that")
   746  	fmt.Fprintln(w, "\t// the user does not need to grow the buffer on an ErrShortDst.")
   747  	fmt.Fprintf(w, "\tMaxTransformChunkSize = %d+maxNonStarters*4\n", len(string(0x034F))+max)
   748  	fmt.Fprintln(w, ")\n")
   749  
   750  	// Print the CCC remap table.
   751  	size += len(cccMap)
   752  	fmt.Fprintf(w, "var ccc = [%d]uint8{", len(cccMap))
   753  	for i := 0; i < len(cccMap); i++ {
   754  		if i%8 == 0 {
   755  			fmt.Fprintln(w)
   756  		}
   757  		fmt.Fprintf(w, "%3d, ", cccMap[uint8(i)])
   758  	}
   759  	fmt.Fprintln(w, "\n}\n")
   760  
   761  	if contains(list, "info") {
   762  		size += printCharInfoTables(w)
   763  	}
   764  
   765  	if contains(list, "recomp") {
   766  		// Note that we use 32 bit keys, instead of 64 bit.
   767  		// This clips the bits of three entries, but we know
   768  		// this won't cause a collision. The compiler will catch
   769  		// any changes made to UnicodeData.txt that introduces
   770  		// a collision.
   771  		// Note that the recomposition map for NFC and NFKC
   772  		// are identical.
   773  
   774  		// Recomposition map
   775  		nrentries := 0
   776  		for _, c := range chars {
   777  			f := c.forms[FCanonical]
   778  			if !f.isOneWay && len(f.decomp) > 0 {
   779  				nrentries++
   780  			}
   781  		}
   782  		sz := nrentries * 8
   783  		size += sz
   784  		fmt.Fprintf(w, "// recompMap: %d bytes (entries only)\n", sz)
   785  		fmt.Fprintln(w, "var recompMap = map[uint32]rune{")
   786  		for i, c := range chars {
   787  			f := c.forms[FCanonical]
   788  			d := f.decomp
   789  			if !f.isOneWay && len(d) > 0 {
   790  				key := uint32(uint16(d[0]))<<16 + uint32(uint16(d[1]))
   791  				fmt.Fprintf(w, "0x%.8X: 0x%.4X,\n", key, i)
   792  			}
   793  		}
   794  		fmt.Fprintf(w, "}\n\n")
   795  	}
   796  
   797  	fmt.Fprintf(w, "// Total size of tables: %dKB (%d bytes)\n", (size+512)/1024, size)
   798  	gen.WriteGoFile("tables.go", "norm", w.Bytes())
   799  }
   800  
   801  func printChars() {
   802  	if *verbose {
   803  		for _, c := range chars {
   804  			if !c.isValid() || c.state == SMissing {
   805  				continue
   806  			}
   807  			fmt.Println(c)
   808  		}
   809  	}
   810  }
   811  
   812  // verifyComputed does various consistency tests.
   813  func verifyComputed() {
   814  	for i, c := range chars {
   815  		for _, f := range c.forms {
   816  			isNo := (f.quickCheck[MDecomposed] == QCNo)
   817  			if (len(f.decomp) > 0) != isNo && !isHangul(rune(i)) {
   818  				log.Fatalf("%U: NF*D QC must be No if rune decomposes", i)
   819  			}
   820  
   821  			isMaybe := f.quickCheck[MComposed] == QCMaybe
   822  			if f.combinesBackward != isMaybe {
   823  				log.Fatalf("%U: NF*C QC must be Maybe if combinesBackward", i)
   824  			}
   825  			if len(f.decomp) > 0 && f.combinesForward && isMaybe {
   826  				log.Fatalf("%U: NF*C QC must be Yes or No if combinesForward and decomposes", i)
   827  			}
   828  
   829  			if len(f.expandedDecomp) != 0 {
   830  				continue
   831  			}
   832  			if a, b := c.nLeadingNonStarters > 0, (c.ccc > 0 || f.combinesBackward); a != b {
   833  				// We accept these runes to be treated differently (it only affects
   834  				// segment breaking in iteration, most likely on improper use), but
   835  				// reconsider if more characters are added.
   836  				// U+FF9E HALFWIDTH KATAKANA VOICED SOUND MARK;Lm;0;L;<narrow> 3099;;;;N;;;;;
   837  				// U+FF9F HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK;Lm;0;L;<narrow> 309A;;;;N;;;;;
   838  				// U+3133 HANGUL LETTER KIYEOK-SIOS;Lo;0;L;<compat> 11AA;;;;N;HANGUL LETTER GIYEOG SIOS;;;;
   839  				// U+318E HANGUL LETTER ARAEAE;Lo;0;L;<compat> 11A1;;;;N;HANGUL LETTER ALAE AE;;;;
   840  				// U+FFA3 HALFWIDTH HANGUL LETTER KIYEOK-SIOS;Lo;0;L;<narrow> 3133;;;;N;HALFWIDTH HANGUL LETTER GIYEOG SIOS;;;;
   841  				// U+FFDC HALFWIDTH HANGUL LETTER I;Lo;0;L;<narrow> 3163;;;;N;;;;;
   842  				if i != 0xFF9E && i != 0xFF9F && !(0x3133 <= i && i <= 0x318E) && !(0xFFA3 <= i && i <= 0xFFDC) {
   843  					log.Fatalf("%U: nLead was %v; want %v", i, a, b)
   844  				}
   845  			}
   846  		}
   847  		nfc := c.forms[FCanonical]
   848  		nfkc := c.forms[FCompatibility]
   849  		if nfc.combinesBackward != nfkc.combinesBackward {
   850  			log.Fatalf("%U: Cannot combine combinesBackward\n", c.codePoint)
   851  		}
   852  	}
   853  }
   854  
   855  // Use values in DerivedNormalizationProps.txt to compare against the
   856  // values we computed.
   857  // DerivedNormalizationProps.txt has form:
   858  // 00C0..00C5    ; NFD_QC; N # ...
   859  // 0374          ; NFD_QC; N # ...
   860  // See http://unicode.org/reports/tr44/ for full explanation
   861  func testDerived() {
   862  	f := gen.OpenUCDFile("DerivedNormalizationProps.txt")
   863  	defer f.Close()
   864  	p := ucd.New(f)
   865  	for p.Next() {
   866  		r := p.Rune(0)
   867  		c := &chars[r]
   868  
   869  		var ftype, mode int
   870  		qt := p.String(1)
   871  		switch qt {
   872  		case "NFC_QC":
   873  			ftype, mode = FCanonical, MComposed
   874  		case "NFD_QC":
   875  			ftype, mode = FCanonical, MDecomposed
   876  		case "NFKC_QC":
   877  			ftype, mode = FCompatibility, MComposed
   878  		case "NFKD_QC":
   879  			ftype, mode = FCompatibility, MDecomposed
   880  		default:
   881  			continue
   882  		}
   883  		var qr QCResult
   884  		switch p.String(2) {
   885  		case "Y":
   886  			qr = QCYes
   887  		case "N":
   888  			qr = QCNo
   889  		case "M":
   890  			qr = QCMaybe
   891  		default:
   892  			log.Fatalf(`Unexpected quick check value "%s"`, p.String(2))
   893  		}
   894  		if got := c.forms[ftype].quickCheck[mode]; got != qr {
   895  			log.Printf("%U: FAILED %s (was %v need %v)\n", r, qt, got, qr)
   896  		}
   897  		c.forms[ftype].verified[mode] = true
   898  	}
   899  	if err := p.Err(); err != nil {
   900  		log.Fatal(err)
   901  	}
   902  	// Any unspecified value must be QCYes. Verify this.
   903  	for i, c := range chars {
   904  		for j, fd := range c.forms {
   905  			for k, qr := range fd.quickCheck {
   906  				if !fd.verified[k] && qr != QCYes {
   907  					m := "%U: FAIL F:%d M:%d (was %v need Yes) %s\n"
   908  					log.Printf(m, i, j, k, qr, c.name)
   909  				}
   910  			}
   911  		}
   912  	}
   913  }
   914  
   915  var testHeader = `const (
   916  	Yes = iota
   917  	No
   918  	Maybe
   919  )
   920  
   921  type formData struct {
   922  	qc              uint8
   923  	combinesForward bool
   924  	decomposition   string
   925  }
   926  
   927  type runeData struct {
   928  	r      rune
   929  	ccc    uint8
   930  	nLead  uint8
   931  	nTrail uint8
   932  	f      [2]formData // 0: canonical; 1: compatibility
   933  }
   934  
   935  func f(qc uint8, cf bool, dec string) [2]formData {
   936  	return [2]formData{{qc, cf, dec}, {qc, cf, dec}}
   937  }
   938  
   939  func g(qc, qck uint8, cf, cfk bool, d, dk string) [2]formData {
   940  	return [2]formData{{qc, cf, d}, {qck, cfk, dk}}
   941  }
   942  
   943  var testData = []runeData{
   944  `
   945  
   946  func printTestdata() {
   947  	type lastInfo struct {
   948  		ccc    uint8
   949  		nLead  uint8
   950  		nTrail uint8
   951  		f      string
   952  	}
   953  
   954  	last := lastInfo{}
   955  	w := &bytes.Buffer{}
   956  	fmt.Fprintf(w, testHeader)
   957  	for r, c := range chars {
   958  		f := c.forms[FCanonical]
   959  		qc, cf, d := f.quickCheck[MComposed], f.combinesForward, string(f.expandedDecomp)
   960  		f = c.forms[FCompatibility]
   961  		qck, cfk, dk := f.quickCheck[MComposed], f.combinesForward, string(f.expandedDecomp)
   962  		s := ""
   963  		if d == dk && qc == qck && cf == cfk {
   964  			s = fmt.Sprintf("f(%s, %v, %q)", qc, cf, d)
   965  		} else {
   966  			s = fmt.Sprintf("g(%s, %s, %v, %v, %q, %q)", qc, qck, cf, cfk, d, dk)
   967  		}
   968  		current := lastInfo{c.ccc, c.nLeadingNonStarters, c.nTrailingNonStarters, s}
   969  		if last != current {
   970  			fmt.Fprintf(w, "\t{0x%x, %d, %d, %d, %s},\n", r, c.origCCC, c.nLeadingNonStarters, c.nTrailingNonStarters, s)
   971  			last = current
   972  		}
   973  	}
   974  	fmt.Fprintln(w, "}")
   975  	gen.WriteGoFile("data_test.go", "norm", w.Bytes())
   976  }