// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build ignore

// This program generates the trie for casing operations. The Unicode casing
// algorithm requires the lookup of various properties and mappings for each
// rune. The table generated by this generator combines several of the most
// frequently used of these into a single trie so that they can be accessed
// with a single lookup.
package main

import (
	"bytes"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"reflect"
	"strconv"
	"strings"
	"unicode"

	"golang.org/x/text/internal/gen"
	"golang.org/x/text/internal/triegen"
	"golang.org/x/text/internal/ucd"
	"golang.org/x/text/unicode/norm"
)

// main runs the generator: it initializes the gen package, writes the casing
// tables (genTables) and the test tables (genTablesTest), and finally calls
// gen.Repackage for gen_trieval.go -> trieval.go in package "cases".
func main() {
	gen.Init()
	genTables()
	genTablesTest()
	gen.Repackage("gen_trieval.go", "trieval.go", "cases")
}

// runeInfo contains all information for a rune that we care about for casing
// operations.
type runeInfo struct {
	Rune rune

	entry info // trie value for this rune.

	// CaseMode is the case type of the rune itself. parseUCD sets it to
	// cTitle, cLower or cUpper; the zero value is left for all other runes.
	CaseMode info

	// Simple case mappings, indexed by case type (cLower, cUpper, cTitle),
	// as read from UnicodeData.txt.
	Simple [1 + maxCaseMode][]rune

	// Special casing
	// HasSpecial is set for runes with an unconditional SpecialCasing.txt
	// entry, in which case Special holds the mappings indexed by case type.
	// Conditional is set for runes with a conditional entry; those mappings
	// are not recorded here.
	HasSpecial  bool
	Conditional bool
	Special     [1 + maxCaseMode][]rune

	// Folding (TODO)
	FoldSimple  rune
	FoldSpecial rune
	FoldFull    []rune

	// TODO: FC_NFKC, or equivalent data.

	// Properties
	SoftDotted     bool          // Soft_Dotted in PropList.txt
	CaseIgnorable  bool          // Case_Ignorable in DerivedCoreProperties.txt
	Cased          bool          // Cased in DerivedCoreProperties.txt
	DecomposeGreek bool          // not set anywhere in this file
	BreakType      string        // raw value from auxiliary/WordBreakProperty.txt
	BreakCat       breakCategory // BreakType collapsed onto the categories we need

	// We care mostly about 0, Above, and IotaSubscript.
	CCC byte
}

// breakCategory is the coarse word-break classification derived from the
// Word_Break property in parseUCD.
type breakCategory int

const (
	// breakBreak is the zero value; it is what a rune keeps when parseUCD
	// does not assign one of the other categories.
	breakBreak breakCategory = iota
	// breakLetter is assigned to ALetter, Hebrew_Letter, Numeric, Extend and
	// ExtendNumLet.
	breakLetter
	// breakIgnored is assigned to Format, MidLetter, MidNumLet and
	// Single_Quote.
	breakIgnored
)

// mapping returns the case mapping for the given case type.
83 func (r *runeInfo) mapping(c info) string { 84 if r.HasSpecial { 85 return string(r.Special[c]) 86 } 87 if len(r.Simple[c]) != 0 { 88 return string(r.Simple[c]) 89 } 90 return string(r.Rune) 91 } 92 93 func parse(file string, f func(p *ucd.Parser)) { 94 ucd.Parse(gen.OpenUCDFile(file), f) 95 } 96 97 func parseUCD() []runeInfo { 98 chars := make([]runeInfo, unicode.MaxRune) 99 100 get := func(r rune) *runeInfo { 101 c := &chars[r] 102 c.Rune = r 103 return c 104 } 105 106 parse("UnicodeData.txt", func(p *ucd.Parser) { 107 ri := get(p.Rune(0)) 108 ri.CCC = byte(p.Int(ucd.CanonicalCombiningClass)) 109 ri.Simple[cLower] = p.Runes(ucd.SimpleLowercaseMapping) 110 ri.Simple[cUpper] = p.Runes(ucd.SimpleUppercaseMapping) 111 ri.Simple[cTitle] = p.Runes(ucd.SimpleTitlecaseMapping) 112 if p.String(ucd.GeneralCategory) == "Lt" { 113 ri.CaseMode = cTitle 114 } 115 }) 116 117 // <code>; <property> 118 parse("PropList.txt", func(p *ucd.Parser) { 119 if p.String(1) == "Soft_Dotted" { 120 chars[p.Rune(0)].SoftDotted = true 121 } 122 }) 123 124 // <code>; <word break type> 125 parse("DerivedCoreProperties.txt", func(p *ucd.Parser) { 126 ri := get(p.Rune(0)) 127 switch p.String(1) { 128 case "Case_Ignorable": 129 ri.CaseIgnorable = true 130 case "Cased": 131 ri.Cased = true 132 case "Lowercase": 133 ri.CaseMode = cLower 134 case "Uppercase": 135 ri.CaseMode = cUpper 136 } 137 }) 138 139 // <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)? 140 parse("SpecialCasing.txt", func(p *ucd.Parser) { 141 // We drop all conditional special casing and deal with them manually in 142 // the language-specific case mappers. Rune 0x03A3 is the only one with 143 // a conditional formatting that is not language-specific. However, 144 // dealing with this letter is tricky, especially in a streaming 145 // context, so we deal with it in the Caser for Greek specifically. 
146 ri := get(p.Rune(0)) 147 if p.String(4) == "" { 148 ri.HasSpecial = true 149 ri.Special[cLower] = p.Runes(1) 150 ri.Special[cTitle] = p.Runes(2) 151 ri.Special[cUpper] = p.Runes(3) 152 } else { 153 ri.Conditional = true 154 } 155 }) 156 157 // TODO: Use text breaking according to UAX #29. 158 // <code>; <word break type> 159 parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) { 160 ri := get(p.Rune(0)) 161 ri.BreakType = p.String(1) 162 163 // We collapse the word breaking properties onto the categories we need. 164 switch p.String(1) { // TODO: officially we need to canonicalize. 165 case "Format", "MidLetter", "MidNumLet", "Single_Quote": 166 ri.BreakCat = breakIgnored 167 case "ALetter", "Hebrew_Letter", "Numeric", "Extend", "ExtendNumLet": 168 ri.BreakCat = breakLetter 169 } 170 }) 171 172 // TODO: Support case folding. 173 // // <code>; <status>; <mapping>; 174 // parse("CaseFolding.txt", func (p *ucd.Parser) { 175 // ri := get(p.Rune(0)) 176 // switch p.String(1) { 177 // case "C": 178 // ri.FoldSimple = p.Rune(2) 179 // ri.FoldFull = p.Runes(2) 180 // case "S": 181 // ri.FoldSimple = p.Rune(2) 182 // case "T": 183 // ri.FoldSpecial = p.Rune(2) 184 // case "F": 185 // ri.FoldFull = p.Runes(2) 186 // } 187 // }) 188 189 return chars 190 } 191 192 func genTables() { 193 chars := parseUCD() 194 verifyProperties(chars) 195 196 t := triegen.NewTrie("case") 197 for i := range chars { 198 c := &chars[i] 199 makeEntry(c) 200 t.Insert(rune(i), uint64(c.entry)) 201 } 202 203 w := &bytes.Buffer{} 204 205 sz, err := t.Gen(w, triegen.Compact(&sparseCompacter{})) 206 if err != nil { 207 log.Fatal(err) 208 } 209 210 gen.WriteUnicodeVersion(w) 211 // TODO: write CLDR version after adding a mechanism to detect that the 212 // tables on which the manually created locale-sensitive casing code is 213 // based hasn't changed. 
214 215 fmt.Fprintf(w, "// xorData: %d bytes\n", len(xorData)) 216 fmt.Fprintf(w, "var xorData = %+q\n\n", string(xorData)) 217 218 fmt.Fprintf(w, "// exceptions: %d bytes\n", len(exceptionData)) 219 fmt.Fprintf(w, "var exceptions = %q\n\n", string(exceptionData)) 220 221 sz += len(exceptionData) 222 fmt.Fprintf(w, "// Total table size %d bytes (%dKiB)\n", sz, sz/1024) 223 224 gen.WriteGoFile("tables.go", "cases", w.Bytes()) 225 } 226 227 func makeEntry(ri *runeInfo) { 228 if ri.CaseIgnorable { 229 if ri.Cased { 230 ri.entry = cIgnorableCased 231 } else { 232 ri.entry = cIgnorableUncased 233 } 234 } else { 235 ri.entry = ri.CaseMode 236 } 237 238 // TODO: handle soft-dotted. 239 240 ccc := cccOther 241 switch ri.CCC { 242 case 0: // Not_Reordered 243 ccc = cccZero 244 case above: // Above 245 ccc = cccAbove 246 } 247 if ri.BreakCat == breakBreak { 248 ccc = cccBreak 249 } 250 251 ri.entry |= ccc 252 253 if ri.CaseMode == cUncased { 254 return 255 } 256 257 // Need to do something special. 258 if ri.CaseMode == cTitle || ri.HasSpecial || ri.mapping(cTitle) != ri.mapping(cUpper) { 259 makeException(ri) 260 return 261 } 262 263 // Rune is either lowercase or uppercase. 264 265 orig := string(ri.Rune) 266 mapped := "" 267 if ri.CaseMode == cUpper { 268 mapped = ri.mapping(cLower) 269 } else { 270 mapped = ri.mapping(cUpper) 271 } 272 273 if len(orig) != len(mapped) { 274 makeException(ri) 275 return 276 } 277 278 n := len(orig) 279 280 // Create per-byte XOR mask. 281 var b []byte 282 for i := 0; i < n; i++ { 283 b = append(b, orig[i]^mapped[i]) 284 } 285 286 // Remove leading 0 bytes, but keep at least one byte. 287 for ; len(b) > 1 && b[0] == 0; b = b[1:] { 288 } 289 290 if len(b) == 1 && b[0]&0xc0 == 0 { 291 ri.entry |= info(b[0]) << xorShift 292 return 293 } 294 295 key := string(b) 296 x, ok := xorCache[key] 297 if !ok { 298 xorData = append(xorData, 0) // for detecting start of sequence 299 xorData = append(xorData, b...) 
300 301 x = len(xorData) - 1 302 xorCache[key] = x 303 } 304 ri.entry |= info(x<<xorShift) | xorIndexBit 305 } 306 307 var xorCache = map[string]int{} 308 309 // xorData contains byte-wise XOR data for the least significant bytes of a 310 // UTF-8 encoded rune. An index points to the last byte. The sequence starts 311 // with a zero terminator. 312 var xorData = []byte{} 313 314 // See the comments in gen_trieval.go re "the exceptions slice". 315 var exceptionData = []byte{0} 316 317 // makeException encodes case mappings that cannot be expressed in a simple 318 // XOR diff. 319 func makeException(ri *runeInfo) { 320 ri.entry |= exceptionBit 321 322 if ccc := ri.entry & cccMask; ccc != cccZero { 323 log.Fatalf("%U:CCC type was %d; want %d", ri.Rune, ccc, cccZero) 324 } 325 326 if len(exceptionData) >= 1<<numExceptionBits { 327 log.Fatalf("%U:exceptionData too large %x > %d bits", ri.Rune, len(exceptionData), numExceptionBits) 328 } 329 330 // Set the offset in the exceptionData array. 331 ri.entry |= info(len(exceptionData) << exceptionShift) 332 333 orig := string(ri.Rune) 334 tc := ri.mapping(cTitle) 335 uc := ri.mapping(cUpper) 336 lc := ri.mapping(cLower) 337 338 // addString sets the length of a string and adds it to the expansions array. 339 addString := func(s string, b *byte) { 340 if len(s) == 0 { 341 // Zero-length mappings exist, but only for conditional casing, 342 // which we are representing outside of this table. 343 log.Fatalf("%U: has zero-length mapping.", ri.Rune) 344 } 345 *b <<= 3 346 if s != orig { 347 n := len(s) 348 if n > 7 { 349 log.Fatalf("%U: mapping larger than 7 (%d)", ri.Rune, n) 350 } 351 *b |= byte(n) 352 exceptionData = append(exceptionData, s...) 
353 } 354 } 355 356 // byte 0: 357 exceptionData = append(exceptionData, 0) 358 359 // byte 1: 360 p := len(exceptionData) 361 exceptionData = append(exceptionData, 0) 362 363 ct := ri.CaseMode 364 if ct != cLower { 365 addString(lc, &exceptionData[p]) 366 } 367 if ct != cUpper { 368 addString(uc, &exceptionData[p]) 369 } 370 if ct != cTitle { 371 // If title is the same as upper, we set it to the original string so 372 // that it will be marked as not present. This implies title case is 373 // the same as upper case. 374 if tc == uc { 375 tc = orig 376 } 377 addString(tc, &exceptionData[p]) 378 } 379 } 380 381 // sparseCompacter is a trie value block Compacter. There are many cases where 382 // successive runes alternate between lower- and upper-case. This Compacter 383 // exploits this by adding a special case type where the case value is obtained 384 // from or-ing it with the least-significant bit of the rune, creating large 385 // ranges of equal case values that compress well. 386 type sparseCompacter struct { 387 sparseBlocks [][]uint16 388 sparseOffsets []uint16 389 sparseCount int 390 } 391 392 // makeSparse returns the number of elements that compact block would contain 393 // as well as the modified values. 394 func makeSparse(vals []uint64) ([]uint16, int) { 395 // Copy the values. 396 values := make([]uint16, len(vals)) 397 for i, v := range vals { 398 values[i] = uint16(v) 399 } 400 401 alt := func(i int, v uint16) uint16 { 402 if cm := info(v & fullCasedMask); cm == cUpper || cm == cLower { 403 // Convert cLower or cUpper to cXORCase value, which has the form 11x. 404 xor := v 405 xor &^= 1 406 xor |= uint16(i&1) ^ (v & 1) 407 xor |= 0x4 408 return xor 409 } 410 return v 411 } 412 413 var count int 414 var previous uint16 415 for i, v := range values { 416 if v != 0 { 417 // Try if the unmodified value is equal to the previous. 418 if v == previous { 419 continue 420 } 421 422 // Try if the xor-ed value is equal to the previous value. 
423 a := alt(i, v) 424 if a == previous { 425 values[i] = a 426 continue 427 } 428 429 // This is a new value. 430 count++ 431 432 // Use the xor-ed value if it will be identical to the next value. 433 if p := i + 1; p < len(values) && alt(p, values[p]) == a { 434 values[i] = a 435 v = a 436 } 437 } 438 previous = v 439 } 440 return values, count 441 } 442 443 func (s *sparseCompacter) Size(v []uint64) (int, bool) { 444 _, n := makeSparse(v) 445 446 // We limit using this method to having 16 entries. 447 if n > 16 { 448 return 0, false 449 } 450 451 return 2 + int(reflect.TypeOf(valueRange{}).Size())*n, true 452 } 453 454 func (s *sparseCompacter) Store(v []uint64) uint32 { 455 h := uint32(len(s.sparseOffsets)) 456 values, sz := makeSparse(v) 457 s.sparseBlocks = append(s.sparseBlocks, values) 458 s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount)) 459 s.sparseCount += sz 460 return h 461 } 462 463 func (s *sparseCompacter) Handler() string { 464 // The sparse global variable and its lookup method is defined in gen_trieval.go. 465 return "sparse.lookup" 466 } 467 468 func (s *sparseCompacter) Print(w io.Writer) (retErr error) { 469 p := func(format string, args ...interface{}) { 470 _, err := fmt.Fprintf(w, format, args...) 
471 if retErr == nil && err != nil { 472 retErr = err 473 } 474 } 475 476 ls := len(s.sparseBlocks) 477 if ls == len(s.sparseOffsets) { 478 s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount)) 479 } 480 p("// sparseOffsets: %d entries, %d bytes\n", ls+1, (ls+1)*2) 481 p("var sparseOffsets = %#v\n\n", s.sparseOffsets) 482 483 ns := s.sparseCount 484 p("// sparseValues: %d entries, %d bytes\n", ns, ns*4) 485 p("var sparseValues = [%d]valueRange {", ns) 486 for i, values := range s.sparseBlocks { 487 p("\n// Block %#x, offset %#x", i, s.sparseOffsets[i]) 488 var v uint16 489 for i, nv := range values { 490 if nv != v { 491 if v != 0 { 492 p(",hi:%#02x},", 0x80+i-1) 493 } 494 if nv != 0 { 495 p("\n{value:%#04x,lo:%#02x", nv, 0x80+i) 496 } 497 } 498 v = nv 499 } 500 if v != 0 { 501 p(",hi:%#02x},", 0x80+len(values)-1) 502 } 503 } 504 p("\n}\n\n") 505 return 506 } 507 508 // verifyProperties that properties of the runes that are relied upon in the 509 // implementation. Each property is marked with an identifier that is referred 510 // to in the places where it is used. 511 func verifyProperties(chars []runeInfo) { 512 for i, c := range chars { 513 r := rune(i) 514 515 // Rune properties. 516 517 // A.1: modifier never changes on lowercase. [ltLower] 518 if c.CCC > 0 && unicode.ToLower(r) != r { 519 log.Fatalf("%U: non-starter changes when lowercased", r) 520 } 521 522 // A.2: properties of decompositions starting with I or J. [ltLower] 523 d := norm.NFD.PropertiesString(string(r)).Decomposition() 524 if len(d) > 0 { 525 if d[0] == 'I' || d[0] == 'J' { 526 // A.2.1: we expect at least an ASCII character and a modifier. 527 if len(d) < 3 { 528 log.Fatalf("%U: length of decomposition was %d; want >= 3", r, len(d)) 529 } 530 531 // All subsequent runes are modifiers and all have the same CCC. 
532 runes := []rune(string(d[1:])) 533 ccc := chars[runes[0]].CCC 534 535 for _, mr := range runes[1:] { 536 mc := chars[mr] 537 538 // A.2.2: all modifiers have a CCC of Above or less. 539 if ccc == 0 || ccc > above { 540 log.Fatalf("%U: CCC of successive rune (%U) was %d; want (0,230]", r, mr, ccc) 541 } 542 543 // A.2.3: a sequence of modifiers all have the same CCC. 544 if mc.CCC != ccc { 545 log.Fatalf("%U: CCC of follow-up modifier (%U) was %d; want %d", r, mr, mc.CCC, ccc) 546 } 547 548 // A.2.4: for each trailing r, r in [0x300, 0x311] <=> CCC == Above. 549 if (ccc == above) != (0x300 <= mr && mr <= 0x311) { 550 log.Fatalf("%U: modifier %U in [U+0300, U+0311] != ccc(%U) == 230", r, mr, mr) 551 } 552 553 if i += len(string(mr)); i >= len(d) { 554 break 555 } 556 } 557 } 558 } 559 560 // A.3: no U+0307 in decomposition of Soft-Dotted rune. [ltUpper] 561 if unicode.Is(unicode.Soft_Dotted, r) && strings.Contains(string(d), "\u0307") { 562 log.Fatalf("%U: decomposition of soft-dotted rune may not contain U+0307", r) 563 } 564 565 // A.4: only rune U+0345 may be of CCC Iota_Subscript. [elUpper] 566 if c.CCC == iotaSubscript && r != 0x0345 { 567 log.Fatalf("%U: only rune U+0345 may have CCC Iota_Subscript", r) 568 } 569 570 // A.5: soft-dotted runes do not have exceptions. 571 if c.SoftDotted && c.entry&exceptionBit != 0 { 572 log.Fatalf("%U: soft-dotted has exception", r) 573 } 574 575 // A.6: Greek decomposition. [elUpper] 576 if unicode.Is(unicode.Greek, r) { 577 if b := norm.NFD.PropertiesString(string(r)).Decomposition(); b != nil { 578 runes := []rune(string(b)) 579 // A.6.1: If a Greek rune decomposes and the first rune of the 580 // decomposition is greater than U+00FF, the rune is always 581 // great and not a modifier. 
582 if f := runes[0]; unicode.IsMark(f) || f > 0xFF && !unicode.Is(unicode.Greek, f) { 583 log.Fatalf("%U: expeced first rune of Greek decomposition to be letter, found %U", r, f) 584 } 585 // A.6.2: Any follow-up rune in a Greek decomposition is a 586 // modifier of which the first should be gobbled in 587 // decomposition. 588 for _, m := range runes[1:] { 589 switch m { 590 case 0x0313, 0x0314, 0x0301, 0x0300, 0x0306, 0x0342, 0x0308, 0x0304, 0x345: 591 default: 592 log.Fatalf("%U: modifier %U is outside of expeced Greek modifier set", r, m) 593 } 594 } 595 } 596 } 597 598 // Breaking properties. 599 600 // B.1: all runes with CCC > 0 are of break type Extend. 601 if c.CCC > 0 && c.BreakType != "Extend" { 602 log.Fatalf("%U: CCC == %d, but got break type %s; want Extend", r, c.CCC, c.BreakType) 603 } 604 605 // B.2: all cased runes with c.CCC == 0 are of break type ALetter. 606 if c.CCC == 0 && c.Cased && c.BreakType != "ALetter" { 607 log.Fatalf("%U: cased, but got break type %s; want ALetter", r, c.BreakType) 608 } 609 610 // B.3: letter category. 611 if c.CCC == 0 && c.BreakCat != breakBreak && !c.CaseIgnorable { 612 if c.BreakCat != breakLetter { 613 log.Fatalf("%U: check for letter break type gave %d; want %d", r, c.BreakCat, breakLetter) 614 } 615 } 616 } 617 } 618 619 func genTablesTest() { 620 w := &bytes.Buffer{} 621 622 fmt.Fprintln(w, "var (") 623 printProperties(w, "DerivedCoreProperties.txt", "Case_Ignorable", verifyIgnore) 624 625 // We discard the output as we know we have perfect functions. We run them 626 // just to verify the properties are correct. 
627 n := printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Cased", verifyCased) 628 n += printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Lowercase", verifyLower) 629 n += printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Uppercase", verifyUpper) 630 if n > 0 { 631 log.Fatalf("One of the discarded properties does not have a perfect filter.") 632 } 633 634 // <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)? 635 fmt.Fprintln(w, "\tspecial = map[rune]struct{ toLower, toTitle, toUpper string }{") 636 parse("SpecialCasing.txt", func(p *ucd.Parser) { 637 // Skip conditional entries. 638 if p.String(4) != "" { 639 return 640 } 641 r := p.Rune(0) 642 fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n", 643 r, string(p.Runes(1)), string(p.Runes(2)), string(p.Runes(3))) 644 }) 645 fmt.Fprint(w, "\t}\n\n") 646 647 // Break property 648 notBreak := map[rune]bool{} 649 parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) { 650 switch p.String(1) { 651 case "Extend", "Format", "MidLetter", "MidNumLet", "Single_Quote", 652 "ALetter", "Hebrew_Letter", "Numeric", "ExtendNumLet": 653 notBreak[p.Rune(0)] = true 654 } 655 }) 656 657 fmt.Fprintln(w, "\tbreakProp = []struct{ lo, hi rune }{") 658 inBreak := false 659 for r := rune(0); r <= lastRuneForTesting; r++ { 660 if isBreak := !notBreak[r]; isBreak != inBreak { 661 if isBreak { 662 fmt.Fprintf(w, "\t\t{0x%x, ", r) 663 } else { 664 fmt.Fprintf(w, "0x%x},\n", r-1) 665 } 666 inBreak = isBreak 667 } 668 } 669 if inBreak { 670 fmt.Fprintf(w, "0x%x},\n", lastRuneForTesting) 671 } 672 fmt.Fprint(w, "\t}\n\n") 673 674 // Word break test 675 // Filter out all samples that do not contain cased characters. 
676 cased := map[rune]bool{} 677 parse("DerivedCoreProperties.txt", func(p *ucd.Parser) { 678 if p.String(1) == "Cased" { 679 cased[p.Rune(0)] = true 680 } 681 }) 682 683 fmt.Fprintln(w, "\tbreakTest = []string{") 684 parse("auxiliary/WordBreakTest.txt", func(p *ucd.Parser) { 685 c := strings.Split(p.String(0), " ") 686 687 const sep = '|' 688 numCased := 0 689 test := "" 690 for ; len(c) >= 2; c = c[2:] { 691 if c[0] == "รท" && test != "" { 692 test += string(sep) 693 } 694 i, err := strconv.ParseUint(c[1], 16, 32) 695 r := rune(i) 696 if err != nil { 697 log.Fatalf("Invalid rune %q.", c[1]) 698 } 699 if r == sep { 700 log.Fatalf("Separator %q not allowed in test data. Pick another one.", sep) 701 } 702 if cased[r] { 703 numCased++ 704 } 705 test += string(r) 706 } 707 if numCased > 1 { 708 fmt.Fprintf(w, "\t\t%q,\n", test) 709 } 710 }) 711 fmt.Fprintln(w, "\t}") 712 713 fmt.Fprintln(w, ")") 714 715 gen.WriteGoFile("tables_test.go", "cases", w.Bytes()) 716 } 717 718 // These functions are just used for verification that their definition have not 719 // changed in the Unicode Standard. 720 721 func verifyCased(r rune) bool { 722 return verifyLower(r) || verifyUpper(r) || unicode.IsTitle(r) 723 } 724 725 func verifyLower(r rune) bool { 726 return unicode.IsLower(r) || unicode.Is(unicode.Other_Lowercase, r) 727 } 728 729 func verifyUpper(r rune) bool { 730 return unicode.IsUpper(r) || unicode.Is(unicode.Other_Uppercase, r) 731 } 732 733 // verifyIgnore is an approximation of the Case_Ignorable property using the 734 // core unicode package. It is used to reduce the size of the test data. 735 func verifyIgnore(r rune) bool { 736 props := []*unicode.RangeTable{ 737 unicode.Mn, 738 unicode.Me, 739 unicode.Cf, 740 unicode.Lm, 741 unicode.Sk, 742 } 743 for _, p := range props { 744 if unicode.Is(p, r) { 745 return true 746 } 747 } 748 return false 749 } 750 751 // printProperties prints tables of rune properties from the given UCD file. 
752 // A filter func f can be given to exclude certain values. A rune r will have 753 // the indicated property if it is in the generated table or if f(r). 754 func printProperties(w io.Writer, file, property string, f func(r rune) bool) int { 755 verify := map[rune]bool{} 756 n := 0 757 varNameParts := strings.Split(property, "_") 758 varNameParts[0] = strings.ToLower(varNameParts[0]) 759 fmt.Fprintf(w, "\t%s = map[rune]bool{\n", strings.Join(varNameParts, "")) 760 parse(file, func(p *ucd.Parser) { 761 if p.String(1) == property { 762 r := p.Rune(0) 763 verify[r] = true 764 if !f(r) { 765 n++ 766 fmt.Fprintf(w, "\t\t0x%.4x: true,\n", r) 767 } 768 } 769 }) 770 fmt.Fprint(w, "\t}\n\n") 771 772 // Verify that f is correct, that is, it represents a subset of the property. 773 for r := rune(0); r <= lastRuneForTesting; r++ { 774 if !verify[r] && f(r) { 775 log.Fatalf("Incorrect filter func for property %q.", property) 776 } 777 } 778 return n 779 } 780 781 // The newCaseTrie, sparseValues and sparseOffsets definitions below are 782 // placeholders referred to by gen_trieval.go. The real definitions are 783 // generated by this program and written to tables.go. 784 785 func newCaseTrie(int) int { return 0 } 786 787 var ( 788 sparseValues [0]valueRange 789 sparseOffsets [0]uint16 790 )