// Origin: golang.org/x/text@v0.14.0/cases/gen.go

// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ignore

// This program generates the trie for casing operations. The Unicode casing
// algorithm requires the lookup of various properties and mappings for each
// rune. The table generated by this generator combines several of the most
// frequently used of these into a single trie so that they can be accessed
// with a single lookup.
package main

import (
	"bytes"
	"fmt"
	"io"
	"log"
	"reflect"
	"strconv"
	"strings"
	"unicode"

	"golang.org/x/text/internal/gen"
	"golang.org/x/text/internal/triegen"
	"golang.org/x/text/internal/ucd"
	"golang.org/x/text/unicode/norm"
)

// main initializes the gen package, generates the casing tables and their
// test data, and finally repackages gen_trieval.go as trieval.go in package
// cases.
func main() {
	gen.Init()
	genTables()
	genTablesTest()
	gen.Repackage("gen_trieval.go", "trieval.go", "cases")
}

// runeInfo contains all information for a rune that we care about for casing
// operations.
type runeInfo struct {
	Rune rune

	entry info // trie value for this rune.

	// CaseMode is the case type of the rune (cLower, cUpper, cTitle, or the
	// zero value when none of these was assigned).
	CaseMode info

	// Simple case mappings, indexed by case type.
	Simple [1 + maxCaseMode][]rune

	// Special casing
	HasSpecial  bool // rune has an unconditional SpecialCasing.txt entry.
	Conditional bool // rune has a conditional SpecialCasing.txt entry.
	Special     [1 + maxCaseMode][]rune

	// Folding
	FoldSimple  rune   // CaseFolding.txt types C and S.
	FoldSpecial rune   // CaseFolding.txt type T.
	FoldFull    []rune // CaseFolding.txt types C and F.

	// TODO: FC_NFKC, or equivalent data.

	// Properties
	SoftDotted     bool
	CaseIgnorable  bool
	Cased          bool
	DecomposeGreek bool
	BreakType      string        // raw value from WordBreakProperty.txt.
	BreakCat       breakCategory // BreakType collapsed onto the categories we need.

	// We care mostly about 0, Above, and IotaSubscript.
	CCC byte
}

// breakCategory is the word break property of a rune collapsed onto the few
// categories the casing code distinguishes.
type breakCategory int

const (
	// breakBreak is the zero value: runes that were not assigned one of the
	// other categories.
	breakBreak breakCategory = iota
	breakLetter
	breakMid
)

// mapping returns the case mapping for the given case type.
82 func (r *runeInfo) mapping(c info) string { 83 if r.HasSpecial { 84 return string(r.Special[c]) 85 } 86 if len(r.Simple[c]) != 0 { 87 return string(r.Simple[c]) 88 } 89 return string(r.Rune) 90 } 91 92 func parse(file string, f func(p *ucd.Parser)) { 93 ucd.Parse(gen.OpenUCDFile(file), f) 94 } 95 96 func parseUCD() []runeInfo { 97 chars := make([]runeInfo, unicode.MaxRune) 98 99 get := func(r rune) *runeInfo { 100 c := &chars[r] 101 c.Rune = r 102 return c 103 } 104 105 parse("UnicodeData.txt", func(p *ucd.Parser) { 106 ri := get(p.Rune(0)) 107 ri.CCC = byte(p.Int(ucd.CanonicalCombiningClass)) 108 ri.Simple[cLower] = p.Runes(ucd.SimpleLowercaseMapping) 109 ri.Simple[cUpper] = p.Runes(ucd.SimpleUppercaseMapping) 110 ri.Simple[cTitle] = p.Runes(ucd.SimpleTitlecaseMapping) 111 if p.String(ucd.GeneralCategory) == "Lt" { 112 ri.CaseMode = cTitle 113 } 114 }) 115 116 // <code>; <property> 117 parse("PropList.txt", func(p *ucd.Parser) { 118 if p.String(1) == "Soft_Dotted" { 119 chars[p.Rune(0)].SoftDotted = true 120 } 121 }) 122 123 // <code>; <word break type> 124 parse("DerivedCoreProperties.txt", func(p *ucd.Parser) { 125 ri := get(p.Rune(0)) 126 switch p.String(1) { 127 case "Case_Ignorable": 128 ri.CaseIgnorable = true 129 case "Cased": 130 ri.Cased = true 131 case "Lowercase": 132 ri.CaseMode = cLower 133 case "Uppercase": 134 ri.CaseMode = cUpper 135 } 136 }) 137 138 // <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)? 139 parse("SpecialCasing.txt", func(p *ucd.Parser) { 140 // We drop all conditional special casing and deal with them manually in 141 // the language-specific case mappers. Rune 0x03A3 is the only one with 142 // a conditional formatting that is not language-specific. However, 143 // dealing with this letter is tricky, especially in a streaming 144 // context, so we deal with it in the Caser for Greek specifically. 
145 ri := get(p.Rune(0)) 146 if p.String(4) == "" { 147 ri.HasSpecial = true 148 ri.Special[cLower] = p.Runes(1) 149 ri.Special[cTitle] = p.Runes(2) 150 ri.Special[cUpper] = p.Runes(3) 151 } else { 152 ri.Conditional = true 153 } 154 }) 155 156 // TODO: Use text breaking according to UAX #29. 157 // <code>; <word break type> 158 parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) { 159 ri := get(p.Rune(0)) 160 ri.BreakType = p.String(1) 161 162 // We collapse the word breaking properties onto the categories we need. 163 switch p.String(1) { // TODO: officially we need to canonicalize. 164 case "MidLetter", "MidNumLet", "Single_Quote": 165 ri.BreakCat = breakMid 166 if !ri.CaseIgnorable { 167 // finalSigma relies on the fact that all breakMid runes are 168 // also a Case_Ignorable. Revisit this code when this changes. 169 log.Fatalf("Rune %U, which has a break category mid, is not a case ignorable", ri) 170 } 171 case "ALetter", "Hebrew_Letter", "Numeric", "Extend", "ExtendNumLet", "Format", "ZWJ": 172 ri.BreakCat = breakLetter 173 } 174 }) 175 176 // <code>; <type>; <mapping> 177 parse("CaseFolding.txt", func(p *ucd.Parser) { 178 ri := get(p.Rune(0)) 179 switch p.String(1) { 180 case "C": 181 ri.FoldSimple = p.Rune(2) 182 ri.FoldFull = p.Runes(2) 183 case "S": 184 ri.FoldSimple = p.Rune(2) 185 case "T": 186 ri.FoldSpecial = p.Rune(2) 187 case "F": 188 ri.FoldFull = p.Runes(2) 189 default: 190 log.Fatalf("%U: unknown type: %s", p.Rune(0), p.String(1)) 191 } 192 }) 193 194 return chars 195 } 196 197 func genTables() { 198 chars := parseUCD() 199 verifyProperties(chars) 200 201 t := triegen.NewTrie("case") 202 for i := range chars { 203 c := &chars[i] 204 makeEntry(c) 205 t.Insert(rune(i), uint64(c.entry)) 206 } 207 208 w := gen.NewCodeWriter() 209 defer w.WriteVersionedGoFile("tables.go", "cases") 210 211 gen.WriteUnicodeVersion(w) 212 213 // TODO: write CLDR version after adding a mechanism to detect that the 214 // tables on which the manually created 
locale-sensitive casing code is 215 // based hasn't changed. 216 217 w.WriteVar("xorData", string(xorData)) 218 w.WriteVar("exceptions", string(exceptionData)) 219 220 sz, err := t.Gen(w, triegen.Compact(&sparseCompacter{})) 221 if err != nil { 222 log.Fatal(err) 223 } 224 w.Size += sz 225 } 226 227 func makeEntry(ri *runeInfo) { 228 if ri.CaseIgnorable { 229 if ri.Cased { 230 ri.entry = cIgnorableCased 231 } else { 232 ri.entry = cIgnorableUncased 233 } 234 } else { 235 ri.entry = ri.CaseMode 236 } 237 238 // TODO: handle soft-dotted. 239 240 ccc := cccOther 241 switch ri.CCC { 242 case 0: // Not_Reordered 243 ccc = cccZero 244 case above: // Above 245 ccc = cccAbove 246 } 247 switch ri.BreakCat { 248 case breakBreak: 249 ccc = cccBreak 250 case breakMid: 251 ri.entry |= isMidBit 252 } 253 254 ri.entry |= ccc 255 256 if ri.CaseMode == cUncased { 257 return 258 } 259 260 // Need to do something special. 261 if ri.CaseMode == cTitle || ri.HasSpecial || ri.mapping(cTitle) != ri.mapping(cUpper) { 262 makeException(ri) 263 return 264 } 265 if f := string(ri.FoldFull); len(f) > 0 && f != ri.mapping(cUpper) && f != ri.mapping(cLower) { 266 makeException(ri) 267 return 268 } 269 270 // Rune is either lowercase or uppercase. 271 272 orig := string(ri.Rune) 273 mapped := "" 274 if ri.CaseMode == cUpper { 275 mapped = ri.mapping(cLower) 276 } else { 277 mapped = ri.mapping(cUpper) 278 } 279 280 if len(orig) != len(mapped) { 281 makeException(ri) 282 return 283 } 284 285 if string(ri.FoldFull) == ri.mapping(cUpper) { 286 ri.entry |= inverseFoldBit 287 } 288 289 n := len(orig) 290 291 // Create per-byte XOR mask. 292 var b []byte 293 for i := 0; i < n; i++ { 294 b = append(b, orig[i]^mapped[i]) 295 } 296 297 // Remove leading 0 bytes, but keep at least one byte. 
298 for ; len(b) > 1 && b[0] == 0; b = b[1:] { 299 } 300 301 if len(b) == 1 && b[0]&0xc0 == 0 { 302 ri.entry |= info(b[0]) << xorShift 303 return 304 } 305 306 key := string(b) 307 x, ok := xorCache[key] 308 if !ok { 309 xorData = append(xorData, 0) // for detecting start of sequence 310 xorData = append(xorData, b...) 311 312 x = len(xorData) - 1 313 xorCache[key] = x 314 } 315 ri.entry |= info(x<<xorShift) | xorIndexBit 316 } 317 318 var xorCache = map[string]int{} 319 320 // xorData contains byte-wise XOR data for the least significant bytes of a 321 // UTF-8 encoded rune. An index points to the last byte. The sequence starts 322 // with a zero terminator. 323 var xorData = []byte{} 324 325 // See the comments in gen_trieval.go re "the exceptions slice". 326 var exceptionData = []byte{0} 327 328 // makeException encodes case mappings that cannot be expressed in a simple 329 // XOR diff. 330 func makeException(ri *runeInfo) { 331 ccc := ri.entry & cccMask 332 // Set exception bit and retain case type. 333 ri.entry &= 0x0007 334 ri.entry |= exceptionBit 335 336 if len(exceptionData) >= 1<<numExceptionBits { 337 log.Fatalf("%U:exceptionData too large %#x > %d bits", ri.Rune, len(exceptionData), numExceptionBits) 338 } 339 340 // Set the offset in the exceptionData array. 341 ri.entry |= info(len(exceptionData) << exceptionShift) 342 343 orig := string(ri.Rune) 344 tc := ri.mapping(cTitle) 345 uc := ri.mapping(cUpper) 346 lc := ri.mapping(cLower) 347 ff := string(ri.FoldFull) 348 349 // addString sets the length of a string and adds it to the expansions array. 350 addString := func(s string, b *byte) { 351 if len(s) == 0 { 352 // Zero-length mappings exist, but only for conditional casing, 353 // which we are representing outside of this table. 
354 log.Fatalf("%U: has zero-length mapping.", ri.Rune) 355 } 356 *b <<= 3 357 if s != orig || ri.CaseMode == cLower { 358 n := len(s) 359 if n > 7 { 360 log.Fatalf("%U: mapping larger than 7 (%d)", ri.Rune, n) 361 } 362 *b |= byte(n) 363 exceptionData = append(exceptionData, s...) 364 } 365 } 366 367 // byte 0: 368 exceptionData = append(exceptionData, byte(ccc)|byte(len(ff))) 369 370 // byte 1: 371 p := len(exceptionData) 372 exceptionData = append(exceptionData, 0) 373 374 if len(ff) > 7 { // May be zero-length. 375 log.Fatalf("%U: fold string larger than 7 (%d)", ri.Rune, len(ff)) 376 } 377 exceptionData = append(exceptionData, ff...) 378 ct := ri.CaseMode 379 if ct != cLower { 380 addString(lc, &exceptionData[p]) 381 } 382 if ct != cUpper { 383 addString(uc, &exceptionData[p]) 384 } 385 if ct != cTitle { 386 addString(tc, &exceptionData[p]) 387 } 388 } 389 390 // sparseCompacter is a trie value block Compacter. There are many cases where 391 // successive runes alternate between lower- and upper-case. This Compacter 392 // exploits this by adding a special case type where the case value is obtained 393 // from or-ing it with the least-significant bit of the rune, creating large 394 // ranges of equal case values that compress well. 395 type sparseCompacter struct { 396 sparseBlocks [][]uint16 397 sparseOffsets []uint16 398 sparseCount int 399 } 400 401 // makeSparse returns the number of elements that compact block would contain 402 // as well as the modified values. 403 func makeSparse(vals []uint64) ([]uint16, int) { 404 // Copy the values. 405 values := make([]uint16, len(vals)) 406 for i, v := range vals { 407 values[i] = uint16(v) 408 } 409 410 alt := func(i int, v uint16) uint16 { 411 if cm := info(v & fullCasedMask); cm == cUpper || cm == cLower { 412 // Convert cLower or cUpper to cXORCase value, which has the form 11x. 
413 xor := v 414 xor &^= 1 415 xor |= uint16(i&1) ^ (v & 1) 416 xor |= 0x4 417 return xor 418 } 419 return v 420 } 421 422 var count int 423 var previous uint16 424 for i, v := range values { 425 if v != 0 { 426 // Try if the unmodified value is equal to the previous. 427 if v == previous { 428 continue 429 } 430 431 // Try if the xor-ed value is equal to the previous value. 432 a := alt(i, v) 433 if a == previous { 434 values[i] = a 435 continue 436 } 437 438 // This is a new value. 439 count++ 440 441 // Use the xor-ed value if it will be identical to the next value. 442 if p := i + 1; p < len(values) && alt(p, values[p]) == a { 443 values[i] = a 444 v = a 445 } 446 } 447 previous = v 448 } 449 return values, count 450 } 451 452 func (s *sparseCompacter) Size(v []uint64) (int, bool) { 453 _, n := makeSparse(v) 454 455 // We limit using this method to having 16 entries. 456 if n > 16 { 457 return 0, false 458 } 459 460 return 2 + int(reflect.TypeOf(valueRange{}).Size())*n, true 461 } 462 463 func (s *sparseCompacter) Store(v []uint64) uint32 { 464 h := uint32(len(s.sparseOffsets)) 465 values, sz := makeSparse(v) 466 s.sparseBlocks = append(s.sparseBlocks, values) 467 s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount)) 468 s.sparseCount += sz 469 return h 470 } 471 472 func (s *sparseCompacter) Handler() string { 473 // The sparse global variable and its lookup method is defined in gen_trieval.go. 474 return "sparse.lookup" 475 } 476 477 func (s *sparseCompacter) Print(w io.Writer) (retErr error) { 478 p := func(format string, args ...interface{}) { 479 _, err := fmt.Fprintf(w, format, args...) 
480 if retErr == nil && err != nil { 481 retErr = err 482 } 483 } 484 485 ls := len(s.sparseBlocks) 486 if ls == len(s.sparseOffsets) { 487 s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount)) 488 } 489 p("// sparseOffsets: %d entries, %d bytes\n", ls+1, (ls+1)*2) 490 p("var sparseOffsets = %#v\n\n", s.sparseOffsets) 491 492 ns := s.sparseCount 493 p("// sparseValues: %d entries, %d bytes\n", ns, ns*4) 494 p("var sparseValues = [%d]valueRange {", ns) 495 for i, values := range s.sparseBlocks { 496 p("\n// Block %#x, offset %#x", i, s.sparseOffsets[i]) 497 var v uint16 498 for i, nv := range values { 499 if nv != v { 500 if v != 0 { 501 p(",hi:%#02x},", 0x80+i-1) 502 } 503 if nv != 0 { 504 p("\n{value:%#04x,lo:%#02x", nv, 0x80+i) 505 } 506 } 507 v = nv 508 } 509 if v != 0 { 510 p(",hi:%#02x},", 0x80+len(values)-1) 511 } 512 } 513 p("\n}\n\n") 514 return 515 } 516 517 // verifyProperties that properties of the runes that are relied upon in the 518 // implementation. Each property is marked with an identifier that is referred 519 // to in the places where it is used. 520 func verifyProperties(chars []runeInfo) { 521 for i, c := range chars { 522 r := rune(i) 523 524 // Rune properties. 525 526 // A.1: modifier never changes on lowercase. [ltLower] 527 if c.CCC > 0 && unicode.ToLower(r) != r { 528 log.Fatalf("%U: non-starter changes when lowercased", r) 529 } 530 531 // A.2: properties of decompositions starting with I or J. [ltLower] 532 d := norm.NFD.PropertiesString(string(r)).Decomposition() 533 if len(d) > 0 { 534 if d[0] == 'I' || d[0] == 'J' { 535 // A.2.1: we expect at least an ASCII character and a modifier. 536 if len(d) < 3 { 537 log.Fatalf("%U: length of decomposition was %d; want >= 3", r, len(d)) 538 } 539 540 // All subsequent runes are modifiers and all have the same CCC. 
541 runes := []rune(string(d[1:])) 542 ccc := chars[runes[0]].CCC 543 544 for _, mr := range runes[1:] { 545 mc := chars[mr] 546 547 // A.2.2: all modifiers have a CCC of Above or less. 548 if ccc == 0 || ccc > above { 549 log.Fatalf("%U: CCC of successive rune (%U) was %d; want (0,230]", r, mr, ccc) 550 } 551 552 // A.2.3: a sequence of modifiers all have the same CCC. 553 if mc.CCC != ccc { 554 log.Fatalf("%U: CCC of follow-up modifier (%U) was %d; want %d", r, mr, mc.CCC, ccc) 555 } 556 557 // A.2.4: for each trailing r, r in [0x300, 0x311] <=> CCC == Above. 558 if (ccc == above) != (0x300 <= mr && mr <= 0x311) { 559 log.Fatalf("%U: modifier %U in [U+0300, U+0311] != ccc(%U) == 230", r, mr, mr) 560 } 561 562 if i += len(string(mr)); i >= len(d) { 563 break 564 } 565 } 566 } 567 } 568 569 // A.3: no U+0307 in decomposition of Soft-Dotted rune. [ltUpper] 570 if unicode.Is(unicode.Soft_Dotted, r) && strings.Contains(string(d), "\u0307") { 571 log.Fatalf("%U: decomposition of soft-dotted rune may not contain U+0307", r) 572 } 573 574 // A.4: only rune U+0345 may be of CCC Iota_Subscript. [elUpper] 575 if c.CCC == iotaSubscript && r != 0x0345 { 576 log.Fatalf("%U: only rune U+0345 may have CCC Iota_Subscript", r) 577 } 578 579 // A.5: soft-dotted runes do not have exceptions. 580 if c.SoftDotted && c.entry&exceptionBit != 0 { 581 log.Fatalf("%U: soft-dotted has exception", r) 582 } 583 584 // A.6: Greek decomposition. [elUpper] 585 if unicode.Is(unicode.Greek, r) { 586 if b := norm.NFD.PropertiesString(string(r)).Decomposition(); b != nil { 587 runes := []rune(string(b)) 588 // A.6.1: If a Greek rune decomposes and the first rune of the 589 // decomposition is greater than U+00FF, the rune is always 590 // great and not a modifier. 
591 if f := runes[0]; unicode.IsMark(f) || f > 0xFF && !unicode.Is(unicode.Greek, f) { 592 log.Fatalf("%U: expected first rune of Greek decomposition to be letter, found %U", r, f) 593 } 594 // A.6.2: Any follow-up rune in a Greek decomposition is a 595 // modifier of which the first should be gobbled in 596 // decomposition. 597 for _, m := range runes[1:] { 598 switch m { 599 case 0x0313, 0x0314, 0x0301, 0x0300, 0x0306, 0x0342, 0x0308, 0x0304, 0x345: 600 default: 601 log.Fatalf("%U: modifier %U is outside of expected Greek modifier set", r, m) 602 } 603 } 604 } 605 } 606 607 // Breaking properties. 608 609 // B.1: all runes with CCC > 0 are of break type Extend. 610 if c.CCC > 0 && c.BreakType != "Extend" { 611 log.Fatalf("%U: CCC == %d, but got break type %s; want Extend", r, c.CCC, c.BreakType) 612 } 613 614 // B.2: all cased runes with c.CCC == 0 are of break type ALetter. 615 if c.CCC == 0 && c.Cased && c.BreakType != "ALetter" { 616 log.Fatalf("%U: cased, but got break type %s; want ALetter", r, c.BreakType) 617 } 618 619 // B.3: letter category. 620 if c.CCC == 0 && c.BreakCat != breakBreak && !c.CaseIgnorable { 621 if c.BreakCat != breakLetter { 622 log.Fatalf("%U: check for letter break type gave %d; want %d", r, c.BreakCat, breakLetter) 623 } 624 } 625 } 626 } 627 628 func genTablesTest() { 629 w := &bytes.Buffer{} 630 631 fmt.Fprintln(w, "var (") 632 printProperties(w, "DerivedCoreProperties.txt", "Case_Ignorable", verifyIgnore) 633 634 // We discard the output as we know we have perfect functions. We run them 635 // just to verify the properties are correct. 
636 n := printProperties(io.Discard, "DerivedCoreProperties.txt", "Cased", verifyCased) 637 n += printProperties(io.Discard, "DerivedCoreProperties.txt", "Lowercase", verifyLower) 638 n += printProperties(io.Discard, "DerivedCoreProperties.txt", "Uppercase", verifyUpper) 639 if n > 0 { 640 log.Fatalf("One of the discarded properties does not have a perfect filter.") 641 } 642 643 // <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)? 644 fmt.Fprintln(w, "\tspecial = map[rune]struct{ toLower, toTitle, toUpper string }{") 645 parse("SpecialCasing.txt", func(p *ucd.Parser) { 646 // Skip conditional entries. 647 if p.String(4) != "" { 648 return 649 } 650 r := p.Rune(0) 651 fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n", 652 r, string(p.Runes(1)), string(p.Runes(2)), string(p.Runes(3))) 653 }) 654 fmt.Fprint(w, "\t}\n\n") 655 656 // <code>; <type>; <runes> 657 table := map[rune]struct{ simple, full, special string }{} 658 parse("CaseFolding.txt", func(p *ucd.Parser) { 659 r := p.Rune(0) 660 t := p.String(1) 661 v := string(p.Runes(2)) 662 if t != "T" && v == string(unicode.ToLower(r)) { 663 return 664 } 665 x := table[r] 666 switch t { 667 case "C": 668 x.full = v 669 x.simple = v 670 case "S": 671 x.simple = v 672 case "F": 673 x.full = v 674 case "T": 675 x.special = v 676 } 677 table[r] = x 678 }) 679 fmt.Fprintln(w, "\tfoldMap = map[rune]struct{ simple, full, special string }{") 680 for r := rune(0); r < 0x10FFFF; r++ { 681 x, ok := table[r] 682 if !ok { 683 continue 684 } 685 fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n", r, x.simple, x.full, x.special) 686 } 687 fmt.Fprint(w, "\t}\n\n") 688 689 // Break property 690 notBreak := map[rune]bool{} 691 parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) { 692 switch p.String(1) { 693 case "Extend", "Format", "MidLetter", "MidNumLet", "Single_Quote", 694 "ALetter", "Hebrew_Letter", "Numeric", "ExtendNumLet", "ZWJ": 695 notBreak[p.Rune(0)] = true 696 } 697 }) 698 699 fmt.Fprintln(w, "\tbreakProp = 
[]struct{ lo, hi rune }{") 700 inBreak := false 701 for r := rune(0); r <= lastRuneForTesting; r++ { 702 if isBreak := !notBreak[r]; isBreak != inBreak { 703 if isBreak { 704 fmt.Fprintf(w, "\t\t{0x%x, ", r) 705 } else { 706 fmt.Fprintf(w, "0x%x},\n", r-1) 707 } 708 inBreak = isBreak 709 } 710 } 711 if inBreak { 712 fmt.Fprintf(w, "0x%x},\n", lastRuneForTesting) 713 } 714 fmt.Fprint(w, "\t}\n\n") 715 716 // Word break test 717 // Filter out all samples that do not contain cased characters. 718 cased := map[rune]bool{} 719 parse("DerivedCoreProperties.txt", func(p *ucd.Parser) { 720 if p.String(1) == "Cased" { 721 cased[p.Rune(0)] = true 722 } 723 }) 724 725 fmt.Fprintln(w, "\tbreakTest = []string{") 726 parse("auxiliary/WordBreakTest.txt", func(p *ucd.Parser) { 727 c := strings.Split(p.String(0), " ") 728 729 const sep = '|' 730 numCased := 0 731 test := "" 732 for ; len(c) >= 2; c = c[2:] { 733 if c[0] == "รท" && test != "" { 734 test += string(sep) 735 } 736 i, err := strconv.ParseUint(c[1], 16, 32) 737 r := rune(i) 738 if err != nil { 739 log.Fatalf("Invalid rune %q.", c[1]) 740 } 741 if r == sep { 742 log.Fatalf("Separator %q not allowed in test data. Pick another one.", sep) 743 } 744 if cased[r] { 745 numCased++ 746 } 747 test += string(r) 748 } 749 if numCased > 1 { 750 fmt.Fprintf(w, "\t\t%q,\n", test) 751 } 752 }) 753 fmt.Fprintln(w, "\t}") 754 755 fmt.Fprintln(w, ")") 756 757 gen.WriteVersionedGoFile("tables_test.go", "cases", w.Bytes()) 758 } 759 760 // These functions are just used for verification that their definition have not 761 // changed in the Unicode Standard. 
762 763 func verifyCased(r rune) bool { 764 return verifyLower(r) || verifyUpper(r) || unicode.IsTitle(r) 765 } 766 767 func verifyLower(r rune) bool { 768 return unicode.IsLower(r) || unicode.Is(unicode.Other_Lowercase, r) 769 } 770 771 func verifyUpper(r rune) bool { 772 return unicode.IsUpper(r) || unicode.Is(unicode.Other_Uppercase, r) 773 } 774 775 // verifyIgnore is an approximation of the Case_Ignorable property using the 776 // core unicode package. It is used to reduce the size of the test data. 777 func verifyIgnore(r rune) bool { 778 props := []*unicode.RangeTable{ 779 unicode.Mn, 780 unicode.Me, 781 unicode.Cf, 782 unicode.Lm, 783 unicode.Sk, 784 } 785 for _, p := range props { 786 if unicode.Is(p, r) { 787 return true 788 } 789 } 790 return false 791 } 792 793 // printProperties prints tables of rune properties from the given UCD file. 794 // A filter func f can be given to exclude certain values. A rune r will have 795 // the indicated property if it is in the generated table or if f(r). 796 func printProperties(w io.Writer, file, property string, f func(r rune) bool) int { 797 verify := map[rune]bool{} 798 n := 0 799 varNameParts := strings.Split(property, "_") 800 varNameParts[0] = strings.ToLower(varNameParts[0]) 801 fmt.Fprintf(w, "\t%s = map[rune]bool{\n", strings.Join(varNameParts, "")) 802 parse(file, func(p *ucd.Parser) { 803 if p.String(1) == property { 804 r := p.Rune(0) 805 verify[r] = true 806 if !f(r) { 807 n++ 808 fmt.Fprintf(w, "\t\t0x%.4x: true,\n", r) 809 } 810 } 811 }) 812 fmt.Fprint(w, "\t}\n\n") 813 814 // Verify that f is correct, that is, it represents a subset of the property. 815 for r := rune(0); r <= lastRuneForTesting; r++ { 816 if !verify[r] && f(r) { 817 log.Fatalf("Incorrect filter func for property %q.", property) 818 } 819 } 820 return n 821 } 822 823 // The newCaseTrie, sparseValues and sparseOffsets definitions below are 824 // placeholders referred to by gen_trieval.go. 
The real definitions are 825 // generated by this program and written to tables.go. 826 827 func newCaseTrie(int) int { return 0 } 828 829 var ( 830 sparseValues [0]valueRange 831 sparseOffsets [0]uint16 832 )