github.com/insionng/yougam@v0.0.0-20170714101924-2bc18d833463/libraries/golang/text/cases/gen.go (about)

     1  // Copyright 2014 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build ignore
     6  
     7  // This program generates the trie for casing operations. The Unicode casing
     8  // algorithm requires the lookup of various properties and mappings for each
     9  // rune. The table generated by this generator combines several of the most
    10  // frequently used of these into a single trie so that they can be accessed
    11  // with a single lookup.
    12  package main
    13  
    14  import (
    15  	"bytes"
    16  	"fmt"
    17  	"io"
    18  	"io/ioutil"
    19  	"log"
    20  	"reflect"
    21  	"strconv"
    22  	"strings"
    23  	"unicode"
    24  
    25  	"github.com/insionng/yougam/libraries/x/text/internal/gen"
    26  	"github.com/insionng/yougam/libraries/x/text/internal/triegen"
    27  	"github.com/insionng/yougam/libraries/x/text/internal/ucd"
    28  	"github.com/insionng/yougam/libraries/x/text/unicode/norm"
    29  )
    30  
// main drives the generator: it initializes the gen helper package, writes the
// casing tables (tables.go) and the test tables (tables_test.go), and finally
// repackages gen_trieval.go as trieval.go in package cases.
func main() {
	gen.Init()
	genTables()
	genTablesTest()
	gen.Repackage("gen_trieval.go", "trieval.go", "cases")
}
    37  
// runeInfo contains all information for a rune that we care about for casing
// operations. The slice returned by parseUCD holds one runeInfo per code
// point, indexed by the rune value.
type runeInfo struct {
	// Rune is the code point this record describes. It is filled in lazily
	// by the get helper in parseUCD the first time a record is touched.
	Rune rune

	entry info // trie value for this rune; computed by makeEntry.

	// CaseMode is cTitle for runes of general category Lt and cLower/cUpper
	// for runes with the derived Lowercase/Uppercase property; otherwise it
	// keeps its zero value.
	CaseMode info

	// Simple case mappings, indexed by case mode (cLower, cUpper, cTitle).
	Simple [1 + maxCaseMode][]rune

	// Special casing
	HasSpecial  bool                    // has an unconditional SpecialCasing.txt entry.
	Conditional bool                    // has a conditional SpecialCasing.txt entry (mapping dropped).
	Special     [1 + maxCaseMode][]rune // unconditional special mappings, indexed by case mode.

	// Folding
	FoldSimple  rune   // CaseFolding.txt types C and S.
	FoldSpecial rune   // CaseFolding.txt type T.
	FoldFull    []rune // CaseFolding.txt types C and F.

	// TODO: FC_NFKC, or equivalent data.

	// Properties
	SoftDotted     bool          // PropList.txt Soft_Dotted.
	CaseIgnorable  bool          // DerivedCoreProperties.txt Case_Ignorable.
	Cased          bool          // DerivedCoreProperties.txt Cased.
	DecomposeGreek bool          // NOTE(review): not set anywhere in this file.
	BreakType      string        // raw value from auxiliary/WordBreakProperty.txt.
	BreakCat       breakCategory // BreakType collapsed onto the categories we need.

	// We care mostly about 0, Above, and IotaSubscript.
	CCC byte
}
    73  
// breakCategory is the word-break property of a rune collapsed onto the few
// categories the casing tables distinguish.
type breakCategory int

const (
	// breakBreak is the zero value and therefore the category of every rune
	// that parseUCD does not explicitly classify as letter or ignored.
	breakBreak breakCategory = iota
	// breakLetter is assigned to the ALetter, Hebrew_Letter, Numeric, Extend
	// and ExtendNumLet word-break types.
	breakLetter
	// breakIgnored is assigned to the Format, MidLetter, MidNumLet and
	// Single_Quote word-break types.
	breakIgnored
)
    81  
    82  // mapping returns the case mapping for the given case type.
    83  func (r *runeInfo) mapping(c info) string {
    84  	if r.HasSpecial {
    85  		return string(r.Special[c])
    86  	}
    87  	if len(r.Simple[c]) != 0 {
    88  		return string(r.Simple[c])
    89  	}
    90  	return string(r.Rune)
    91  }
    92  
    93  func parse(file string, f func(p *ucd.Parser)) {
    94  	ucd.Parse(gen.OpenUCDFile(file), f)
    95  }
    96  
    97  func parseUCD() []runeInfo {
    98  	chars := make([]runeInfo, unicode.MaxRune)
    99  
   100  	get := func(r rune) *runeInfo {
   101  		c := &chars[r]
   102  		c.Rune = r
   103  		return c
   104  	}
   105  
   106  	parse("UnicodeData.txt", func(p *ucd.Parser) {
   107  		ri := get(p.Rune(0))
   108  		ri.CCC = byte(p.Int(ucd.CanonicalCombiningClass))
   109  		ri.Simple[cLower] = p.Runes(ucd.SimpleLowercaseMapping)
   110  		ri.Simple[cUpper] = p.Runes(ucd.SimpleUppercaseMapping)
   111  		ri.Simple[cTitle] = p.Runes(ucd.SimpleTitlecaseMapping)
   112  		if p.String(ucd.GeneralCategory) == "Lt" {
   113  			ri.CaseMode = cTitle
   114  		}
   115  	})
   116  
   117  	// <code>; <property>
   118  	parse("PropList.txt", func(p *ucd.Parser) {
   119  		if p.String(1) == "Soft_Dotted" {
   120  			chars[p.Rune(0)].SoftDotted = true
   121  		}
   122  	})
   123  
   124  	// <code>; <word break type>
   125  	parse("DerivedCoreProperties.txt", func(p *ucd.Parser) {
   126  		ri := get(p.Rune(0))
   127  		switch p.String(1) {
   128  		case "Case_Ignorable":
   129  			ri.CaseIgnorable = true
   130  		case "Cased":
   131  			ri.Cased = true
   132  		case "Lowercase":
   133  			ri.CaseMode = cLower
   134  		case "Uppercase":
   135  			ri.CaseMode = cUpper
   136  		}
   137  	})
   138  
   139  	// <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)?
   140  	parse("SpecialCasing.txt", func(p *ucd.Parser) {
   141  		// We drop all conditional special casing and deal with them manually in
   142  		// the language-specific case mappers. Rune 0x03A3 is the only one with
   143  		// a conditional formatting that is not language-specific. However,
   144  		// dealing with this letter is tricky, especially in a streaming
   145  		// context, so we deal with it in the Caser for Greek specifically.
   146  		ri := get(p.Rune(0))
   147  		if p.String(4) == "" {
   148  			ri.HasSpecial = true
   149  			ri.Special[cLower] = p.Runes(1)
   150  			ri.Special[cTitle] = p.Runes(2)
   151  			ri.Special[cUpper] = p.Runes(3)
   152  		} else {
   153  			ri.Conditional = true
   154  		}
   155  	})
   156  
   157  	// TODO: Use text breaking according to UAX #29.
   158  	// <code>; <word break type>
   159  	parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) {
   160  		ri := get(p.Rune(0))
   161  		ri.BreakType = p.String(1)
   162  
   163  		// We collapse the word breaking properties onto the categories we need.
   164  		switch p.String(1) { // TODO: officially we need to canonicalize.
   165  		case "Format", "MidLetter", "MidNumLet", "Single_Quote":
   166  			ri.BreakCat = breakIgnored
   167  		case "ALetter", "Hebrew_Letter", "Numeric", "Extend", "ExtendNumLet":
   168  			ri.BreakCat = breakLetter
   169  		}
   170  	})
   171  
   172  	// <code>; <type>; <mapping>
   173  	parse("CaseFolding.txt", func(p *ucd.Parser) {
   174  		ri := get(p.Rune(0))
   175  		switch p.String(1) {
   176  		case "C":
   177  			ri.FoldSimple = p.Rune(2)
   178  			ri.FoldFull = p.Runes(2)
   179  		case "S":
   180  			ri.FoldSimple = p.Rune(2)
   181  		case "T":
   182  			ri.FoldSpecial = p.Rune(2)
   183  		case "F":
   184  			ri.FoldFull = p.Runes(2)
   185  		default:
   186  			log.Fatalf("%U: unknown type: %s", p.Rune(0), p.String(1))
   187  		}
   188  	})
   189  
   190  	return chars
   191  }
   192  
   193  func genTables() {
   194  	chars := parseUCD()
   195  	verifyProperties(chars)
   196  
   197  	t := triegen.NewTrie("case")
   198  	for i := range chars {
   199  		c := &chars[i]
   200  		makeEntry(c)
   201  		t.Insert(rune(i), uint64(c.entry))
   202  	}
   203  
   204  	w := gen.NewCodeWriter()
   205  	defer w.WriteGoFile("tables.go", "cases")
   206  
   207  	gen.WriteUnicodeVersion(w)
   208  
   209  	// TODO: write CLDR version after adding a mechanism to detect that the
   210  	// tables on which the manually created locale-sensitive casing code is
   211  	// based hasn't changed.
   212  
   213  	w.WriteVar("xorData", string(xorData))
   214  	w.WriteVar("exceptions", string(exceptionData))
   215  
   216  	sz, err := t.Gen(w, triegen.Compact(&sparseCompacter{}))
   217  	if err != nil {
   218  		log.Fatal(err)
   219  	}
   220  	w.Size += sz
   221  }
   222  
// makeEntry computes the trie value for ri and stores it in ri.entry.
//
// The value combines the case type, a collapsed combining-class/break category
// and, for cased runes, a description of how to obtain the opposite case:
// either an inline XOR pattern, an index into xorData, or (through
// makeException) an offset into exceptionData. As a side effect it may append
// to the package-level xorData, xorCache and exceptionData.
func makeEntry(ri *runeInfo) {
	// Case-ignorable runes get a dedicated case type; everything else uses the
	// case mode derived from the UCD.
	if ri.CaseIgnorable {
		if ri.Cased {
			ri.entry = cIgnorableCased
		} else {
			ri.entry = cIgnorableUncased
		}
	} else {
		ri.entry = ri.CaseMode
	}

	// TODO: handle soft-dotted.

	// Collapse the canonical combining class onto zero/above/other. A rune in
	// the break category overrides this with cccBreak.
	ccc := cccOther
	switch ri.CCC {
	case 0: // Not_Reordered
		ccc = cccZero
	case above: // Above
		ccc = cccAbove
	}
	if ri.BreakCat == breakBreak {
		ccc = cccBreak
	}

	ri.entry |= ccc

	// Uncased runes need no mapping information.
	if ri.CaseMode == cUncased {
		return
	}

	// Need to do something special.
	if ri.CaseMode == cTitle || ri.HasSpecial || ri.mapping(cTitle) != ri.mapping(cUpper) {
		makeException(ri)
		return
	}
	// A full case folding that is neither the upper nor the lower mapping
	// cannot be derived from the XOR pattern and must be stored explicitly.
	if f := string(ri.FoldFull); len(f) > 0 && f != ri.mapping(cUpper) && f != ri.mapping(cLower) {
		makeException(ri)
		return
	}

	// Rune is either lowercase or uppercase.

	// mapped is the UTF-8 encoding of the opposite case.
	orig := string(ri.Rune)
	mapped := ""
	if ri.CaseMode == cUpper {
		mapped = ri.mapping(cLower)
	} else {
		mapped = ri.mapping(cUpper)
	}

	// A byte-wise XOR only works when both encodings have the same length.
	if len(orig) != len(mapped) {
		makeException(ri)
		return
	}

	// Mark runes whose full folding equals their uppercase mapping.
	if string(ri.FoldFull) == ri.mapping(cUpper) {
		ri.entry |= inverseFoldBit
	}

	n := len(orig)

	// Create per-byte XOR mask.
	var b []byte
	for i := 0; i < n; i++ {
		b = append(b, orig[i]^mapped[i])
	}

	// Remove leading 0 bytes, but keep at least one byte.
	for ; len(b) > 1 && b[0] == 0; b = b[1:] {
	}

	// A single-byte pattern with the two top bits clear is stored inline in
	// the entry itself.
	if len(b) == 1 && b[0]&0xc0 == 0 {
		ri.entry |= info(b[0]) << xorShift
		return
	}

	// Otherwise the pattern is stored in xorData (once per distinct pattern)
	// and the entry holds the index of the pattern's last byte.
	key := string(b)
	x, ok := xorCache[key]
	if !ok {
		xorData = append(xorData, 0) // for detecting start of sequence
		xorData = append(xorData, b...)

		x = len(xorData) - 1
		xorCache[key] = x
	}
	ri.entry |= info(x<<xorShift) | xorIndexBit
}
   310  
// xorCache maps an XOR pattern (as a string) to the index of its last byte in
// xorData, so that makeEntry stores each distinct pattern only once.
var xorCache = map[string]int{}

// xorData contains byte-wise XOR data for the least significant bytes of a
// UTF-8 encoded rune. An index points to the last byte of a pattern. Each
// pattern is preceded by a zero byte that marks its start.
var xorData = []byte{}

// See the comments in gen_trieval.go re "the exceptions slice".
// The slice starts out with a single zero byte, so the first exception written
// by makeException is at offset 1.
var exceptionData = []byte{0}
   320  
// makeException encodes case mappings that cannot be expressed in a simple
// XOR diff.
//
// It rewrites ri.entry to hold the case type, the exception bit and the offset
// of a new record appended to exceptionData. The record consists of:
//
//	byte 0:  the ccc bits of the original entry or-ed with len(full fold)
//	byte 1:  3-bit lengths of the mappings that follow, packed in the order
//	         in which they are added
//	bytes:   the full case folding (may be empty)
//	bytes:   the lower, upper and title mappings, skipping the one for the
//	         rune's own case mode and any mapping equal to the rune itself
//
// The function aborts the program if the record cannot be represented.
func makeException(ri *runeInfo) {
	// Remember the ccc bits; they are moved from the entry into byte 0.
	ccc := ri.entry & cccMask
	// Set exception bit and retain case type.
	ri.entry &= 0x0007
	ri.entry |= exceptionBit

	if len(exceptionData) >= 1<<numExceptionBits {
		log.Fatalf("%U:exceptionData too large %x > %d bits", ri.Rune, len(exceptionData), numExceptionBits)
	}

	// Set the offset in the exceptionData array.
	ri.entry |= info(len(exceptionData) << exceptionShift)

	orig := string(ri.Rune)
	tc := ri.mapping(cTitle)
	uc := ri.mapping(cUpper)
	lc := ri.mapping(cLower)
	ff := string(ri.FoldFull)

	// addString sets the length of a string and adds it to the expansions array.
	// The length byte *b is shifted left by three bits on every call; a
	// mapping equal to the original rune is recorded as length 0 and not
	// stored. *b is updated before the append, which may move exceptionData.
	addString := func(s string, b *byte) {
		if len(s) == 0 {
			// Zero-length mappings exist, but only for conditional casing,
			// which we are representing outside of this table.
			log.Fatalf("%U: has zero-length mapping.", ri.Rune)
		}
		*b <<= 3
		if s != orig {
			n := len(s)
			if n > 7 {
				log.Fatalf("%U: mapping larger than 7 (%d)", ri.Rune, n)
			}
			*b |= byte(n)
			exceptionData = append(exceptionData, s...)
		}
	}

	// byte 0:
	exceptionData = append(exceptionData, byte(ccc)|byte(len(ff)))

	// byte 1:
	// p is the index of the packed-lengths byte, filled in by addString below.
	p := len(exceptionData)
	exceptionData = append(exceptionData, 0)

	if len(ff) > 7 { // May be zero-length.
		log.Fatalf("%U: fold string larger than 7 (%d)", ri.Rune, len(ff))
	}
	exceptionData = append(exceptionData, ff...)
	// The mapping for the rune's own case mode is omitted. The address of
	// byte 1 is taken afresh for every call because the preceding append may
	// have reallocated exceptionData.
	ct := ri.CaseMode
	if ct != cLower {
		addString(lc, &exceptionData[p])
	}
	if ct != cUpper {
		addString(uc, &exceptionData[p])
	}
	if ct != cTitle {
		// If title is the same as upper, we set it to the original string so
		// that it will be marked as not present. This implies title case is
		// the same as upper case.
		if tc == uc {
			tc = orig
		}
		addString(tc, &exceptionData[p])
	}
}
   388  
// sparseCompacter is a trie value block Compacter. There are many cases where
// successive runes alternate between lower- and upper-case. This Compacter
// exploits this by adding a special case type where the case value is obtained
// from or-ing it with the least-significant bit of the rune, creating large
// ranges of equal case values that compress well.
type sparseCompacter struct {
	// sparseBlocks holds the (possibly rewritten) values of every stored
	// block, as returned by makeSparse.
	sparseBlocks [][]uint16
	// sparseOffsets holds, per stored block, the value of sparseCount at the
	// time the block was stored. Print appends a final entry holding the
	// total count.
	sparseOffsets []uint16
	// sparseCount is the running total of value ranges over all stored blocks.
	sparseCount int
}
   399  
   400  // makeSparse returns the number of elements that compact block would contain
   401  // as well as the modified values.
   402  func makeSparse(vals []uint64) ([]uint16, int) {
   403  	// Copy the values.
   404  	values := make([]uint16, len(vals))
   405  	for i, v := range vals {
   406  		values[i] = uint16(v)
   407  	}
   408  
   409  	alt := func(i int, v uint16) uint16 {
   410  		if cm := info(v & fullCasedMask); cm == cUpper || cm == cLower {
   411  			// Convert cLower or cUpper to cXORCase value, which has the form 11x.
   412  			xor := v
   413  			xor &^= 1
   414  			xor |= uint16(i&1) ^ (v & 1)
   415  			xor |= 0x4
   416  			return xor
   417  		}
   418  		return v
   419  	}
   420  
   421  	var count int
   422  	var previous uint16
   423  	for i, v := range values {
   424  		if v != 0 {
   425  			// Try if the unmodified value is equal to the previous.
   426  			if v == previous {
   427  				continue
   428  			}
   429  
   430  			// Try if the xor-ed value is equal to the previous value.
   431  			a := alt(i, v)
   432  			if a == previous {
   433  				values[i] = a
   434  				continue
   435  			}
   436  
   437  			// This is a new value.
   438  			count++
   439  
   440  			// Use the xor-ed value if it will be identical to the next value.
   441  			if p := i + 1; p < len(values) && alt(p, values[p]) == a {
   442  				values[i] = a
   443  				v = a
   444  			}
   445  		}
   446  		previous = v
   447  	}
   448  	return values, count
   449  }
   450  
   451  func (s *sparseCompacter) Size(v []uint64) (int, bool) {
   452  	_, n := makeSparse(v)
   453  
   454  	// We limit using this method to having 16 entries.
   455  	if n > 16 {
   456  		return 0, false
   457  	}
   458  
   459  	return 2 + int(reflect.TypeOf(valueRange{}).Size())*n, true
   460  }
   461  
   462  func (s *sparseCompacter) Store(v []uint64) uint32 {
   463  	h := uint32(len(s.sparseOffsets))
   464  	values, sz := makeSparse(v)
   465  	s.sparseBlocks = append(s.sparseBlocks, values)
   466  	s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount))
   467  	s.sparseCount += sz
   468  	return h
   469  }
   470  
// Handler returns the name of the lookup function for blocks stored by this
// compacter.
func (s *sparseCompacter) Handler() string {
	// The sparse global variable and its lookup method is defined in gen_trieval.go.
	return "sparse.lookup"
}
   475  
   476  func (s *sparseCompacter) Print(w io.Writer) (retErr error) {
   477  	p := func(format string, args ...interface{}) {
   478  		_, err := fmt.Fprintf(w, format, args...)
   479  		if retErr == nil && err != nil {
   480  			retErr = err
   481  		}
   482  	}
   483  
   484  	ls := len(s.sparseBlocks)
   485  	if ls == len(s.sparseOffsets) {
   486  		s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount))
   487  	}
   488  	p("// sparseOffsets: %d entries, %d bytes\n", ls+1, (ls+1)*2)
   489  	p("var sparseOffsets = %#v\n\n", s.sparseOffsets)
   490  
   491  	ns := s.sparseCount
   492  	p("// sparseValues: %d entries, %d bytes\n", ns, ns*4)
   493  	p("var sparseValues = [%d]valueRange {", ns)
   494  	for i, values := range s.sparseBlocks {
   495  		p("\n// Block %#x, offset %#x", i, s.sparseOffsets[i])
   496  		var v uint16
   497  		for i, nv := range values {
   498  			if nv != v {
   499  				if v != 0 {
   500  					p(",hi:%#02x},", 0x80+i-1)
   501  				}
   502  				if nv != 0 {
   503  					p("\n{value:%#04x,lo:%#02x", nv, 0x80+i)
   504  				}
   505  			}
   506  			v = nv
   507  		}
   508  		if v != 0 {
   509  			p(",hi:%#02x},", 0x80+len(values)-1)
   510  		}
   511  	}
   512  	p("\n}\n\n")
   513  	return
   514  }
   515  
   516  // verifyProperties that properties of the runes that are relied upon in the
   517  // implementation. Each property is marked with an identifier that is referred
   518  // to in the places where it is used.
   519  func verifyProperties(chars []runeInfo) {
   520  	for i, c := range chars {
   521  		r := rune(i)
   522  
   523  		// Rune properties.
   524  
   525  		// A.1: modifier never changes on lowercase. [ltLower]
   526  		if c.CCC > 0 && unicode.ToLower(r) != r {
   527  			log.Fatalf("%U: non-starter changes when lowercased", r)
   528  		}
   529  
   530  		// A.2: properties of decompositions starting with I or J. [ltLower]
   531  		d := norm.NFD.PropertiesString(string(r)).Decomposition()
   532  		if len(d) > 0 {
   533  			if d[0] == 'I' || d[0] == 'J' {
   534  				// A.2.1: we expect at least an ASCII character and a modifier.
   535  				if len(d) < 3 {
   536  					log.Fatalf("%U: length of decomposition was %d; want >= 3", r, len(d))
   537  				}
   538  
   539  				// All subsequent runes are modifiers and all have the same CCC.
   540  				runes := []rune(string(d[1:]))
   541  				ccc := chars[runes[0]].CCC
   542  
   543  				for _, mr := range runes[1:] {
   544  					mc := chars[mr]
   545  
   546  					// A.2.2: all modifiers have a CCC of Above or less.
   547  					if ccc == 0 || ccc > above {
   548  						log.Fatalf("%U: CCC of successive rune (%U) was %d; want (0,230]", r, mr, ccc)
   549  					}
   550  
   551  					// A.2.3: a sequence of modifiers all have the same CCC.
   552  					if mc.CCC != ccc {
   553  						log.Fatalf("%U: CCC of follow-up modifier (%U) was %d; want %d", r, mr, mc.CCC, ccc)
   554  					}
   555  
   556  					// A.2.4: for each trailing r, r in [0x300, 0x311] <=> CCC == Above.
   557  					if (ccc == above) != (0x300 <= mr && mr <= 0x311) {
   558  						log.Fatalf("%U: modifier %U in [U+0300, U+0311] != ccc(%U) == 230", r, mr, mr)
   559  					}
   560  
   561  					if i += len(string(mr)); i >= len(d) {
   562  						break
   563  					}
   564  				}
   565  			}
   566  		}
   567  
   568  		// A.3: no U+0307 in decomposition of Soft-Dotted rune. [ltUpper]
   569  		if unicode.Is(unicode.Soft_Dotted, r) && strings.Contains(string(d), "\u0307") {
   570  			log.Fatalf("%U: decomposition of soft-dotted rune may not contain U+0307", r)
   571  		}
   572  
   573  		// A.4: only rune U+0345 may be of CCC Iota_Subscript. [elUpper]
   574  		if c.CCC == iotaSubscript && r != 0x0345 {
   575  			log.Fatalf("%U: only rune U+0345 may have CCC Iota_Subscript", r)
   576  		}
   577  
   578  		// A.5: soft-dotted runes do not have exceptions.
   579  		if c.SoftDotted && c.entry&exceptionBit != 0 {
   580  			log.Fatalf("%U: soft-dotted has exception", r)
   581  		}
   582  
   583  		// A.6: Greek decomposition. [elUpper]
   584  		if unicode.Is(unicode.Greek, r) {
   585  			if b := norm.NFD.PropertiesString(string(r)).Decomposition(); b != nil {
   586  				runes := []rune(string(b))
   587  				// A.6.1: If a Greek rune decomposes and the first rune of the
   588  				// decomposition is greater than U+00FF, the rune is always
   589  				// great and not a modifier.
   590  				if f := runes[0]; unicode.IsMark(f) || f > 0xFF && !unicode.Is(unicode.Greek, f) {
   591  					log.Fatalf("%U: expeced first rune of Greek decomposition to be letter, found %U", r, f)
   592  				}
   593  				// A.6.2: Any follow-up rune in a Greek decomposition is a
   594  				// modifier of which the first should be gobbled in
   595  				// decomposition.
   596  				for _, m := range runes[1:] {
   597  					switch m {
   598  					case 0x0313, 0x0314, 0x0301, 0x0300, 0x0306, 0x0342, 0x0308, 0x0304, 0x345:
   599  					default:
   600  						log.Fatalf("%U: modifier %U is outside of expeced Greek modifier set", r, m)
   601  					}
   602  				}
   603  			}
   604  		}
   605  
   606  		// Breaking properties.
   607  
   608  		// B.1: all runes with CCC > 0 are of break type Extend.
   609  		if c.CCC > 0 && c.BreakType != "Extend" {
   610  			log.Fatalf("%U: CCC == %d, but got break type %s; want Extend", r, c.CCC, c.BreakType)
   611  		}
   612  
   613  		// B.2: all cased runes with c.CCC == 0 are of break type ALetter.
   614  		if c.CCC == 0 && c.Cased && c.BreakType != "ALetter" {
   615  			log.Fatalf("%U: cased, but got break type %s; want ALetter", r, c.BreakType)
   616  		}
   617  
   618  		// B.3: letter category.
   619  		if c.CCC == 0 && c.BreakCat != breakBreak && !c.CaseIgnorable {
   620  			if c.BreakCat != breakLetter {
   621  				log.Fatalf("%U: check for letter break type gave %d; want %d", r, c.BreakCat, breakLetter)
   622  			}
   623  		}
   624  	}
   625  }
   626  
   627  func genTablesTest() {
   628  	w := &bytes.Buffer{}
   629  
   630  	fmt.Fprintln(w, "var (")
   631  	printProperties(w, "DerivedCoreProperties.txt", "Case_Ignorable", verifyIgnore)
   632  
   633  	// We discard the output as we know we have perfect functions. We run them
   634  	// just to verify the properties are correct.
   635  	n := printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Cased", verifyCased)
   636  	n += printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Lowercase", verifyLower)
   637  	n += printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Uppercase", verifyUpper)
   638  	if n > 0 {
   639  		log.Fatalf("One of the discarded properties does not have a perfect filter.")
   640  	}
   641  
   642  	// <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)?
   643  	fmt.Fprintln(w, "\tspecial = map[rune]struct{ toLower, toTitle, toUpper string }{")
   644  	parse("SpecialCasing.txt", func(p *ucd.Parser) {
   645  		// Skip conditional entries.
   646  		if p.String(4) != "" {
   647  			return
   648  		}
   649  		r := p.Rune(0)
   650  		fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n",
   651  			r, string(p.Runes(1)), string(p.Runes(2)), string(p.Runes(3)))
   652  	})
   653  	fmt.Fprint(w, "\t}\n\n")
   654  
   655  	// <code>; <type>; <runes>
   656  	table := map[rune]struct{ simple, full, special string }{}
   657  	parse("CaseFolding.txt", func(p *ucd.Parser) {
   658  		r := p.Rune(0)
   659  		t := p.String(1)
   660  		v := string(p.Runes(2))
   661  		if t != "T" && v == string(unicode.ToLower(r)) {
   662  			return
   663  		}
   664  		x := table[r]
   665  		switch t {
   666  		case "C":
   667  			x.full = v
   668  			x.simple = v
   669  		case "S":
   670  			x.simple = v
   671  		case "F":
   672  			x.full = v
   673  		case "T":
   674  			x.special = v
   675  		}
   676  		table[r] = x
   677  	})
   678  	fmt.Fprintln(w, "\tfoldMap = map[rune]struct{ simple, full, special string }{")
   679  	for r := rune(0); r < 0x10FFFF; r++ {
   680  		x, ok := table[r]
   681  		if !ok {
   682  			continue
   683  		}
   684  		fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n", r, x.simple, x.full, x.special)
   685  	}
   686  	fmt.Fprint(w, "\t}\n\n")
   687  
   688  	// Break property
   689  	notBreak := map[rune]bool{}
   690  	parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) {
   691  		switch p.String(1) {
   692  		case "Extend", "Format", "MidLetter", "MidNumLet", "Single_Quote",
   693  			"ALetter", "Hebrew_Letter", "Numeric", "ExtendNumLet":
   694  			notBreak[p.Rune(0)] = true
   695  		}
   696  	})
   697  
   698  	fmt.Fprintln(w, "\tbreakProp = []struct{ lo, hi rune }{")
   699  	inBreak := false
   700  	for r := rune(0); r <= lastRuneForTesting; r++ {
   701  		if isBreak := !notBreak[r]; isBreak != inBreak {
   702  			if isBreak {
   703  				fmt.Fprintf(w, "\t\t{0x%x, ", r)
   704  			} else {
   705  				fmt.Fprintf(w, "0x%x},\n", r-1)
   706  			}
   707  			inBreak = isBreak
   708  		}
   709  	}
   710  	if inBreak {
   711  		fmt.Fprintf(w, "0x%x},\n", lastRuneForTesting)
   712  	}
   713  	fmt.Fprint(w, "\t}\n\n")
   714  
   715  	// Word break test
   716  	// Filter out all samples that do not contain cased characters.
   717  	cased := map[rune]bool{}
   718  	parse("DerivedCoreProperties.txt", func(p *ucd.Parser) {
   719  		if p.String(1) == "Cased" {
   720  			cased[p.Rune(0)] = true
   721  		}
   722  	})
   723  
   724  	fmt.Fprintln(w, "\tbreakTest = []string{")
   725  	parse("auxiliary/WordBreakTest.txt", func(p *ucd.Parser) {
   726  		c := strings.Split(p.String(0), " ")
   727  
   728  		const sep = '|'
   729  		numCased := 0
   730  		test := ""
   731  		for ; len(c) >= 2; c = c[2:] {
   732  			if c[0] == "รท" && test != "" {
   733  				test += string(sep)
   734  			}
   735  			i, err := strconv.ParseUint(c[1], 16, 32)
   736  			r := rune(i)
   737  			if err != nil {
   738  				log.Fatalf("Invalid rune %q.", c[1])
   739  			}
   740  			if r == sep {
   741  				log.Fatalf("Separator %q not allowed in test data. Pick another one.", sep)
   742  			}
   743  			if cased[r] {
   744  				numCased++
   745  			}
   746  			test += string(r)
   747  		}
   748  		if numCased > 1 {
   749  			fmt.Fprintf(w, "\t\t%q,\n", test)
   750  		}
   751  	})
   752  	fmt.Fprintln(w, "\t}")
   753  
   754  	fmt.Fprintln(w, ")")
   755  
   756  	gen.WriteGoFile("tables_test.go", "cases", w.Bytes())
   757  }
   758  
   759  // These functions are just used for verification that their definition have not
   760  // changed in the Unicode Standard.
   761  
   762  func verifyCased(r rune) bool {
   763  	return verifyLower(r) || verifyUpper(r) || unicode.IsTitle(r)
   764  }
   765  
   766  func verifyLower(r rune) bool {
   767  	return unicode.IsLower(r) || unicode.Is(unicode.Other_Lowercase, r)
   768  }
   769  
   770  func verifyUpper(r rune) bool {
   771  	return unicode.IsUpper(r) || unicode.Is(unicode.Other_Uppercase, r)
   772  }
   773  
   774  // verifyIgnore is an approximation of the Case_Ignorable property using the
   775  // core unicode package. It is used to reduce the size of the test data.
   776  func verifyIgnore(r rune) bool {
   777  	props := []*unicode.RangeTable{
   778  		unicode.Mn,
   779  		unicode.Me,
   780  		unicode.Cf,
   781  		unicode.Lm,
   782  		unicode.Sk,
   783  	}
   784  	for _, p := range props {
   785  		if unicode.Is(p, r) {
   786  			return true
   787  		}
   788  	}
   789  	return false
   790  }
   791  
   792  // printProperties prints tables of rune properties from the given UCD file.
   793  // A filter func f can be given to exclude certain values. A rune r will have
   794  // the indicated property if it is in the generated table or if f(r).
   795  func printProperties(w io.Writer, file, property string, f func(r rune) bool) int {
   796  	verify := map[rune]bool{}
   797  	n := 0
   798  	varNameParts := strings.Split(property, "_")
   799  	varNameParts[0] = strings.ToLower(varNameParts[0])
   800  	fmt.Fprintf(w, "\t%s = map[rune]bool{\n", strings.Join(varNameParts, ""))
   801  	parse(file, func(p *ucd.Parser) {
   802  		if p.String(1) == property {
   803  			r := p.Rune(0)
   804  			verify[r] = true
   805  			if !f(r) {
   806  				n++
   807  				fmt.Fprintf(w, "\t\t0x%.4x: true,\n", r)
   808  			}
   809  		}
   810  	})
   811  	fmt.Fprint(w, "\t}\n\n")
   812  
   813  	// Verify that f is correct, that is, it represents a subset of the property.
   814  	for r := rune(0); r <= lastRuneForTesting; r++ {
   815  		if !verify[r] && f(r) {
   816  			log.Fatalf("Incorrect filter func for property %q.", property)
   817  		}
   818  	}
   819  	return n
   820  }
   821  
// The newCaseTrie, sparseValues and sparseOffsets definitions below are
// placeholders referred to by gen_trieval.go. The real definitions are
// generated by this program and written to tables.go.

// newCaseTrie is a placeholder that ignores its argument and returns 0.
func newCaseTrie(int) int { return 0 }

// Empty placeholder tables.
var (
	sparseValues  [0]valueRange
	sparseOffsets [0]uint16
)
   831  )