github.com/sean-/go@v0.0.0-20151219100004-97f854cd7bb6/src/unicode/maketables.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // +build ignore 6 7 // Unicode table generator. 8 // Data read from the web. 9 10 package main 11 12 import ( 13 "bufio" 14 "flag" 15 "fmt" 16 "io" 17 "log" 18 "net/http" 19 "os" 20 "os/exec" 21 "path/filepath" 22 "regexp" 23 "sort" 24 "strconv" 25 "strings" 26 "unicode" 27 ) 28 29 func main() { 30 flag.Parse() 31 setupOutput() 32 loadChars() // always needed 33 loadCasefold() 34 printCategories() 35 printScriptOrProperty(false) 36 printScriptOrProperty(true) 37 printCases() 38 printLatinProperties() 39 printCasefold() 40 printSizes() 41 flushOutput() 42 } 43 44 var dataURL = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt") 45 var casefoldingURL = flag.String("casefolding", "", "full URL for CaseFolding.txt; defaults to --url/CaseFolding.txt") 46 var url = flag.String("url", 47 "http://www.unicode.org/Public/8.0.0/ucd/", 48 "URL of Unicode database directory") 49 var tablelist = flag.String("tables", 50 "all", 51 "comma-separated list of which tables to generate; can be letter") 52 var scriptlist = flag.String("scripts", 53 "all", 54 "comma-separated list of which script tables to generate") 55 var proplist = flag.String("props", 56 "all", 57 "comma-separated list of which property tables to generate") 58 var cases = flag.Bool("cases", 59 true, 60 "generate case tables") 61 var test = flag.Bool("test", 62 false, 63 "test existing tables; can be used to compare web data with package data") 64 var localFiles = flag.Bool("local", 65 false, 66 "data files have been copied to current directory; for debugging only") 67 var outputFile = flag.String("output", 68 "", 69 "output file for generated tables; default stdout") 70 71 var scriptRe = regexp.MustCompile(`^([0-9A-F]+)(\.\.[0-9A-F]+)? *; ([A-Za-z_]+)$`) 72 var logger = log.New(os.Stderr, "", log.Lshortfile) 73 74 var output *bufio.Writer // points to os.Stdout or to "gofmt > outputFile" 75 76 func setupOutput() { 77 output = bufio.NewWriter(startGofmt()) 78 } 79 80 // startGofmt connects output to a gofmt process if -output is set. 81 func startGofmt() io.Writer { 82 if *outputFile == "" { 83 return os.Stdout 84 } 85 stdout, err := os.Create(*outputFile) 86 if err != nil { 87 logger.Fatal(err) 88 } 89 // Pipe output to gofmt. 90 gofmt := exec.Command("gofmt") 91 fd, err := gofmt.StdinPipe() 92 if err != nil { 93 logger.Fatal(err) 94 } 95 gofmt.Stdout = stdout 96 gofmt.Stderr = os.Stderr 97 err = gofmt.Start() 98 if err != nil { 99 logger.Fatal(err) 100 } 101 return fd 102 } 103 104 func flushOutput() { 105 err := output.Flush() 106 if err != nil { 107 logger.Fatal(err) 108 } 109 } 110 111 func printf(format string, args ...interface{}) { 112 fmt.Fprintf(output, format, args...) 113 } 114 115 func print(args ...interface{}) { 116 fmt.Fprint(output, args...) 117 } 118 119 func println(args ...interface{}) { 120 fmt.Fprintln(output, args...) 121 } 122 123 type reader struct { 124 *bufio.Reader 125 fd *os.File 126 resp *http.Response 127 } 128 129 func open(url string) *reader { 130 file := filepath.Base(url) 131 if *localFiles { 132 fd, err := os.Open(file) 133 if err != nil { 134 logger.Fatal(err) 135 } 136 return &reader{bufio.NewReader(fd), fd, nil} 137 } 138 resp, err := http.Get(url) 139 if err != nil { 140 logger.Fatal(err) 141 } 142 if resp.StatusCode != 200 { 143 logger.Fatalf("bad GET status for %s: %d", file, resp.Status) 144 } 145 return &reader{bufio.NewReader(resp.Body), nil, resp} 146 147 } 148 149 func (r *reader) close() { 150 if r.fd != nil { 151 r.fd.Close() 152 } else { 153 r.resp.Body.Close() 154 } 155 } 156 157 var category = map[string]bool{ 158 // Nd Lu etc. 159 // We use one-character names to identify merged categories 160 "L": true, // Lu Ll Lt Lm Lo 161 "P": true, // Pc Pd Ps Pe Pu Pf Po 162 "M": true, // Mn Mc Me 163 "N": true, // Nd Nl No 164 "S": true, // Sm Sc Sk So 165 "Z": true, // Zs Zl Zp 166 "C": true, // Cc Cf Cs Co Cn 167 } 168 169 // UnicodeData.txt has form: 170 // 0037;DIGIT SEVEN;Nd;0;EN;;7;7;7;N;;;;; 171 // 007A;LATIN SMALL LETTER Z;Ll;0;L;;;;;N;;;005A;;005A 172 // See http://www.unicode.org/reports/tr44/ for a full explanation 173 // The fields: 174 const ( 175 FCodePoint = iota 176 FName 177 FGeneralCategory 178 FCanonicalCombiningClass 179 FBidiClass 180 FDecompositionTypeAndMapping 181 FNumericType 182 FNumericDigit // If a decimal digit. 183 FNumericValue // Includes non-decimal, e.g. U+2155=1/5 184 FBidiMirrored 185 FUnicode1Name 186 FISOComment 187 FSimpleUppercaseMapping 188 FSimpleLowercaseMapping 189 FSimpleTitlecaseMapping 190 NumField 191 192 MaxChar = 0x10FFFF // anything above this shouldn't exist 193 ) 194 195 var fieldName = []string{ 196 FCodePoint: "CodePoint", 197 FName: "Name", 198 FGeneralCategory: "GeneralCategory", 199 FCanonicalCombiningClass: "CanonicalCombiningClass", 200 FBidiClass: "BidiClass", 201 FDecompositionTypeAndMapping: "DecompositionTypeAndMapping", 202 FNumericType: "NumericType", 203 FNumericDigit: "NumericDigit", 204 FNumericValue: "NumericValue", 205 FBidiMirrored: "BidiMirrored", 206 FUnicode1Name: "Unicode1Name", 207 FISOComment: "ISOComment", 208 FSimpleUppercaseMapping: "SimpleUppercaseMapping", 209 FSimpleLowercaseMapping: "SimpleLowercaseMapping", 210 FSimpleTitlecaseMapping: "SimpleTitlecaseMapping", 211 } 212 213 // This contains only the properties we're interested in. 214 type Char struct { 215 field []string // debugging only; could be deleted if we take out char.dump() 216 codePoint rune // if zero, this index is not a valid code point. 217 category string 218 upperCase rune 219 lowerCase rune 220 titleCase rune 221 foldCase rune // simple case folding 222 caseOrbit rune // next in simple case folding orbit 223 } 224 225 // Scripts.txt has form: 226 // A673 ; Cyrillic # Po SLAVONIC ASTERISK 227 // A67C..A67D ; Cyrillic # Mn [2] COMBINING CYRILLIC KAVYKA..COMBINING CYRILLIC PAYEROK 228 // See http://www.unicode.org/Public/5.1.0/ucd/UCD.html for full explanation 229 230 type Script struct { 231 lo, hi uint32 // range of code points 232 script string 233 } 234 235 var chars = make([]Char, MaxChar+1) 236 var scripts = make(map[string][]Script) 237 var props = make(map[string][]Script) // a property looks like a script; can share the format 238 239 var lastChar rune = 0 240 241 // In UnicodeData.txt, some ranges are marked like this: 242 // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; 243 // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;; 244 // parseCategory returns a state variable indicating the weirdness. 245 type State int 246 247 const ( 248 SNormal State = iota // known to be zero for the type 249 SFirst 250 SLast 251 SMissing 252 ) 253 254 func parseCategory(line string) (state State) { 255 field := strings.Split(line, ";") 256 if len(field) != NumField { 257 logger.Fatalf("%5s: %d fields (expected %d)\n", line, len(field), NumField) 258 } 259 point, err := strconv.ParseUint(field[FCodePoint], 16, 64) 260 if err != nil { 261 logger.Fatalf("%.5s...: %s", line, err) 262 } 263 lastChar = rune(point) 264 if point > MaxChar { 265 return 266 } 267 char := &chars[point] 268 char.field = field 269 if char.codePoint != 0 { 270 logger.Fatalf("point %U reused", point) 271 } 272 char.codePoint = lastChar 273 char.category = field[FGeneralCategory] 274 category[char.category] = true 275 switch char.category { 276 case "Nd": 277 // Decimal digit 278 _, err := strconv.Atoi(field[FNumericValue]) 279 if err != nil { 280 logger.Fatalf("%U: bad numeric field: %s", point, err) 281 } 282 case "Lu": 283 char.letter(field[FCodePoint], field[FSimpleLowercaseMapping], field[FSimpleTitlecaseMapping]) 284 case "Ll": 285 char.letter(field[FSimpleUppercaseMapping], field[FCodePoint], field[FSimpleTitlecaseMapping]) 286 case "Lt": 287 char.letter(field[FSimpleUppercaseMapping], field[FSimpleLowercaseMapping], field[FCodePoint]) 288 default: 289 char.letter(field[FSimpleUppercaseMapping], field[FSimpleLowercaseMapping], field[FSimpleTitlecaseMapping]) 290 } 291 switch { 292 case strings.Index(field[FName], ", First>") > 0: 293 state = SFirst 294 case strings.Index(field[FName], ", Last>") > 0: 295 state = SLast 296 } 297 return 298 } 299 300 func (char *Char) dump(s string) { 301 print(s, " ") 302 for i := 0; i < len(char.field); i++ { 303 printf("%s:%q ", fieldName[i], char.field[i]) 304 } 305 print("\n") 306 } 307 308 func (char *Char) letter(u, l, t string) { 309 char.upperCase = char.letterValue(u, "U") 310 char.lowerCase = char.letterValue(l, "L") 311 char.titleCase = char.letterValue(t, "T") 312 } 313 314 func (char *Char) letterValue(s string, cas string) rune { 315 if s == "" { 316 return 0 317 } 318 v, err := strconv.ParseUint(s, 16, 64) 319 if err != nil { 320 char.dump(cas) 321 logger.Fatalf("%U: bad letter(%s): %s", char.codePoint, s, err) 322 } 323 return rune(v) 324 } 325 326 func allCategories() []string { 327 a := make([]string, 0, len(category)) 328 for k := range category { 329 a = append(a, k) 330 } 331 sort.Strings(a) 332 return a 333 } 334 335 func all(scripts map[string][]Script) []string { 336 a := make([]string, 0, len(scripts)) 337 for k := range scripts { 338 a = append(a, k) 339 } 340 sort.Strings(a) 341 return a 342 } 343 344 func allCatFold(m map[string]map[rune]bool) []string { 345 a := make([]string, 0, len(m)) 346 for k := range m { 347 a = append(a, k) 348 } 349 sort.Strings(a) 350 return a 351 } 352 353 // Extract the version number from the URL 354 func version() string { 355 // Break on slashes and look for the first numeric field 356 fields := strings.Split(*url, "/") 357 for _, f := range fields { 358 if len(f) > 0 && '0' <= f[0] && f[0] <= '9' { 359 return f 360 } 361 } 362 logger.Fatal("unknown version") 363 return "Unknown" 364 } 365 366 func categoryOp(code rune, class uint8) bool { 367 category := chars[code].category 368 return len(category) > 0 && category[0] == class 369 } 370 371 func loadChars() { 372 if *dataURL == "" { 373 flag.Set("data", *url+"UnicodeData.txt") 374 } 375 input := open(*dataURL) 376 defer input.close() 377 scanner := bufio.NewScanner(input) 378 var first rune = 0 379 for scanner.Scan() { 380 switch parseCategory(scanner.Text()) { 381 case SNormal: 382 if first != 0 { 383 logger.Fatalf("bad state normal at %U", lastChar) 384 } 385 case SFirst: 386 if first != 0 { 387 logger.Fatalf("bad state first at %U", lastChar) 388 } 389 first = lastChar 390 case SLast: 391 if first == 0 { 392 logger.Fatalf("bad state last at %U", lastChar) 393 } 394 for i := first + 1; i <= lastChar; i++ { 395 chars[i] = chars[first] 396 chars[i].codePoint = i 397 } 398 first = 0 399 } 400 } 401 if scanner.Err() != nil { 402 logger.Fatal(scanner.Err()) 403 } 404 } 405 406 func loadCasefold() { 407 if *casefoldingURL == "" { 408 flag.Set("casefolding", *url+"CaseFolding.txt") 409 } 410 input := open(*casefoldingURL) 411 defer input.close() 412 scanner := bufio.NewScanner(input) 413 for scanner.Scan() { 414 line := scanner.Text() 415 if len(line) == 0 || line[0] == '#' || len(strings.TrimSpace(line)) == 0 { 416 continue 417 } 418 field := strings.Split(line, "; ") 419 if len(field) != 4 { 420 logger.Fatalf("CaseFolding.txt %.5s...: %d fields (expected %d)\n", line, len(field), 4) 421 } 422 kind := field[1] 423 if kind != "C" && kind != "S" { 424 // Only care about 'common' and 'simple' foldings. 425 continue 426 } 427 p1, err := strconv.ParseUint(field[0], 16, 64) 428 if err != nil { 429 logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err) 430 } 431 p2, err := strconv.ParseUint(field[2], 16, 64) 432 if err != nil { 433 logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err) 434 } 435 chars[p1].foldCase = rune(p2) 436 } 437 if scanner.Err() != nil { 438 logger.Fatal(scanner.Err()) 439 } 440 } 441 442 const progHeader = `// Copyright 2013 The Go Authors. All rights reserved. 443 // Use of this source code is governed by a BSD-style 444 // license that can be found in the LICENSE file. 445 446 // Generated by running 447 // maketables --tables=%s --data=%s --casefolding=%s 448 // DO NOT EDIT 449 450 package unicode 451 452 ` 453 454 func printCategories() { 455 if *tablelist == "" { 456 return 457 } 458 // Find out which categories to dump 459 list := strings.Split(*tablelist, ",") 460 if *tablelist == "all" { 461 list = allCategories() 462 } 463 if *test { 464 fullCategoryTest(list) 465 return 466 } 467 printf(progHeader, *tablelist, *dataURL, *casefoldingURL) 468 469 println("// Version is the Unicode edition from which the tables are derived.") 470 printf("const Version = %q\n\n", version()) 471 472 if *tablelist == "all" { 473 println("// Categories is the set of Unicode category tables.") 474 println("var Categories = map[string] *RangeTable {") 475 for _, k := range allCategories() { 476 printf("\t%q: %s,\n", k, k) 477 } 478 print("}\n\n") 479 } 480 481 decl := make(sort.StringSlice, len(list)) 482 ndecl := 0 483 for _, name := range list { 484 if _, ok := category[name]; !ok { 485 logger.Fatal("unknown category", name) 486 } 487 // We generate an UpperCase name to serve as concise documentation and an _UnderScored 488 // name to store the data. This stops godoc dumping all the tables but keeps them 489 // available to clients. 490 // Cases deserving special comments 491 varDecl := "" 492 switch name { 493 case "C": 494 varDecl = "\tOther = _C; // Other/C is the set of Unicode control and special characters, category C.\n" 495 varDecl += "\tC = _C\n" 496 case "L": 497 varDecl = "\tLetter = _L; // Letter/L is the set of Unicode letters, category L.\n" 498 varDecl += "\tL = _L\n" 499 case "M": 500 varDecl = "\tMark = _M; // Mark/M is the set of Unicode mark characters, category M.\n" 501 varDecl += "\tM = _M\n" 502 case "N": 503 varDecl = "\tNumber = _N; // Number/N is the set of Unicode number characters, category N.\n" 504 varDecl += "\tN = _N\n" 505 case "P": 506 varDecl = "\tPunct = _P; // Punct/P is the set of Unicode punctuation characters, category P.\n" 507 varDecl += "\tP = _P\n" 508 case "S": 509 varDecl = "\tSymbol = _S; // Symbol/S is the set of Unicode symbol characters, category S.\n" 510 varDecl += "\tS = _S\n" 511 case "Z": 512 varDecl = "\tSpace = _Z; // Space/Z is the set of Unicode space characters, category Z.\n" 513 varDecl += "\tZ = _Z\n" 514 case "Nd": 515 varDecl = "\tDigit = _Nd; // Digit is the set of Unicode characters with the \"decimal digit\" property.\n" 516 case "Lu": 517 varDecl = "\tUpper = _Lu; // Upper is the set of Unicode upper case letters.\n" 518 case "Ll": 519 varDecl = "\tLower = _Ll; // Lower is the set of Unicode lower case letters.\n" 520 case "Lt": 521 varDecl = "\tTitle = _Lt; // Title is the set of Unicode title case letters.\n" 522 } 523 if len(name) > 1 { 524 varDecl += fmt.Sprintf( 525 "\t%s = _%s; // %s is the set of Unicode characters in category %s.\n", 526 name, name, name, name) 527 } 528 decl[ndecl] = varDecl 529 ndecl++ 530 if len(name) == 1 { // unified categories 531 decl := fmt.Sprintf("var _%s = &RangeTable{\n", name) 532 dumpRange( 533 decl, 534 func(code rune) bool { return categoryOp(code, name[0]) }) 535 continue 536 } 537 dumpRange( 538 fmt.Sprintf("var _%s = &RangeTable{\n", name), 539 func(code rune) bool { return chars[code].category == name }) 540 } 541 decl.Sort() 542 println("// These variables have type *RangeTable.") 543 println("var (") 544 for _, d := range decl { 545 print(d) 546 } 547 print(")\n\n") 548 } 549 550 type Op func(code rune) bool 551 552 const format = "\t\t{0x%04x, 0x%04x, %d},\n" 553 554 func dumpRange(header string, inCategory Op) { 555 print(header) 556 next := rune(0) 557 latinOffset := 0 558 print("\tR16: []Range16{\n") 559 // one Range for each iteration 560 count := &range16Count 561 size := 16 562 for { 563 // look for start of range 564 for next < rune(len(chars)) && !inCategory(next) { 565 next++ 566 } 567 if next >= rune(len(chars)) { 568 // no characters remain 569 break 570 } 571 572 // start of range 573 lo := next 574 hi := next 575 stride := rune(1) 576 // accept lo 577 next++ 578 // look for another character to set the stride 579 for next < rune(len(chars)) && !inCategory(next) { 580 next++ 581 } 582 if next >= rune(len(chars)) { 583 // no more characters 584 printf(format, lo, hi, stride) 585 break 586 } 587 // set stride 588 stride = next - lo 589 // check for length of run. next points to first jump in stride 590 for i := next; i < rune(len(chars)); i++ { 591 if inCategory(i) == (((i - lo) % stride) == 0) { 592 // accept 593 if inCategory(i) { 594 hi = i 595 } 596 } else { 597 // no more characters in this run 598 break 599 } 600 } 601 if uint32(hi) <= unicode.MaxLatin1 { 602 latinOffset++ 603 } 604 size, count = printRange(uint32(lo), uint32(hi), uint32(stride), size, count) 605 // next range: start looking where this range ends 606 next = hi + 1 607 } 608 print("\t},\n") 609 if latinOffset > 0 { 610 printf("\tLatinOffset: %d,\n", latinOffset) 611 } 612 print("}\n\n") 613 } 614 615 func printRange(lo, hi, stride uint32, size int, count *int) (int, *int) { 616 if size == 16 && hi >= 1<<16 { 617 if lo < 1<<16 { 618 if lo+stride != hi { 619 logger.Fatalf("unexpected straddle: %U %U %d", lo, hi, stride) 620 } 621 // No range contains U+FFFF as an instance, so split 622 // the range into two entries. That way we can maintain 623 // the invariant that R32 contains only >= 1<<16. 624 printf(format, lo, lo, 1) 625 lo = hi 626 stride = 1 627 *count++ 628 } 629 print("\t},\n") 630 print("\tR32: []Range32{\n") 631 size = 32 632 count = &range32Count 633 } 634 printf(format, lo, hi, stride) 635 *count++ 636 return size, count 637 } 638 639 func fullCategoryTest(list []string) { 640 for _, name := range list { 641 if _, ok := category[name]; !ok { 642 logger.Fatal("unknown category", name) 643 } 644 r, ok := unicode.Categories[name] 645 if !ok && len(name) > 1 { 646 logger.Fatalf("unknown table %q", name) 647 } 648 if len(name) == 1 { 649 verifyRange(name, func(code rune) bool { return categoryOp(code, name[0]) }, r) 650 } else { 651 verifyRange( 652 name, 653 func(code rune) bool { return chars[code].category == name }, 654 r) 655 } 656 } 657 } 658 659 func verifyRange(name string, inCategory Op, table *unicode.RangeTable) { 660 count := 0 661 for j := range chars { 662 i := rune(j) 663 web := inCategory(i) 664 pkg := unicode.Is(table, i) 665 if web != pkg { 666 fmt.Fprintf(os.Stderr, "%s: %U: web=%t pkg=%t\n", name, i, web, pkg) 667 count++ 668 if count > 10 { 669 break 670 } 671 } 672 } 673 } 674 675 func parseScript(line string, scripts map[string][]Script) { 676 comment := strings.Index(line, "#") 677 if comment >= 0 { 678 line = line[0:comment] 679 } 680 line = strings.TrimSpace(line) 681 if len(line) == 0 { 682 return 683 } 684 field := strings.Split(line, ";") 685 if len(field) != 2 { 686 logger.Fatalf("%s: %d fields (expected 2)\n", line, len(field)) 687 } 688 matches := scriptRe.FindStringSubmatch(line) 689 if len(matches) != 4 { 690 logger.Fatalf("%s: %d matches (expected 3)\n", line, len(matches)) 691 } 692 lo, err := strconv.ParseUint(matches[1], 16, 64) 693 if err != nil { 694 logger.Fatalf("%.5s...: %s", line, err) 695 } 696 hi := lo 697 if len(matches[2]) > 2 { // ignore leading .. 698 hi, err = strconv.ParseUint(matches[2][2:], 16, 64) 699 if err != nil { 700 logger.Fatalf("%.5s...: %s", line, err) 701 } 702 } 703 name := matches[3] 704 scripts[name] = append(scripts[name], Script{uint32(lo), uint32(hi), name}) 705 } 706 707 // The script tables have a lot of adjacent elements. Fold them together. 708 func foldAdjacent(r []Script) []unicode.Range32 { 709 s := make([]unicode.Range32, 0, len(r)) 710 j := 0 711 for i := 0; i < len(r); i++ { 712 if j > 0 && r[i].lo == s[j-1].Hi+1 { 713 s[j-1].Hi = r[i].hi 714 } else { 715 s = s[0 : j+1] 716 s[j] = unicode.Range32{ 717 Lo: uint32(r[i].lo), 718 Hi: uint32(r[i].hi), 719 Stride: 1, 720 } 721 j++ 722 } 723 } 724 return s 725 } 726 727 func fullScriptTest(list []string, installed map[string]*unicode.RangeTable, scripts map[string][]Script) { 728 for _, name := range list { 729 if _, ok := scripts[name]; !ok { 730 logger.Fatal("unknown script", name) 731 } 732 _, ok := installed[name] 733 if !ok { 734 logger.Fatal("unknown table", name) 735 } 736 for _, script := range scripts[name] { 737 for r := script.lo; r <= script.hi; r++ { 738 if !unicode.Is(installed[name], rune(r)) { 739 fmt.Fprintf(os.Stderr, "%U: not in script %s\n", r, name) 740 } 741 } 742 } 743 } 744 } 745 746 // PropList.txt has the same format as Scripts.txt so we can share its parser. 747 func printScriptOrProperty(doProps bool) { 748 flag := "scripts" 749 flaglist := *scriptlist 750 file := "Scripts.txt" 751 table := scripts 752 installed := unicode.Scripts 753 if doProps { 754 flag = "props" 755 flaglist = *proplist 756 file = "PropList.txt" 757 table = props 758 installed = unicode.Properties 759 } 760 if flaglist == "" { 761 return 762 } 763 input := open(*url + file) 764 scanner := bufio.NewScanner(input) 765 for scanner.Scan() { 766 parseScript(scanner.Text(), table) 767 } 768 if scanner.Err() != nil { 769 logger.Fatal(scanner.Err()) 770 } 771 input.close() 772 773 // Find out which scripts to dump 774 list := strings.Split(flaglist, ",") 775 if flaglist == "all" { 776 list = all(table) 777 } 778 if *test { 779 fullScriptTest(list, installed, table) 780 return 781 } 782 783 printf( 784 "// Generated by running\n"+ 785 "// maketables --%s=%s --url=%s\n"+ 786 "// DO NOT EDIT\n\n", 787 flag, 788 flaglist, 789 *url) 790 if flaglist == "all" { 791 if doProps { 792 println("// Properties is the set of Unicode property tables.") 793 println("var Properties = map[string] *RangeTable{") 794 } else { 795 println("// Scripts is the set of Unicode script tables.") 796 println("var Scripts = map[string] *RangeTable{") 797 } 798 for _, k := range all(table) { 799 printf("\t%q: %s,\n", k, k) 800 } 801 print("}\n\n") 802 } 803 804 decl := make(sort.StringSlice, len(list)) 805 ndecl := 0 806 for _, name := range list { 807 if doProps { 808 decl[ndecl] = fmt.Sprintf( 809 "\t%s = _%s;\t// %s is the set of Unicode characters with property %s.\n", 810 name, name, name, name) 811 } else { 812 decl[ndecl] = fmt.Sprintf( 813 "\t%s = _%s;\t// %s is the set of Unicode characters in script %s.\n", 814 name, name, name, name) 815 } 816 ndecl++ 817 printf("var _%s = &RangeTable {\n", name) 818 ranges := foldAdjacent(table[name]) 819 print("\tR16: []Range16{\n") 820 size := 16 821 count := &range16Count 822 for _, s := range ranges { 823 size, count = printRange(s.Lo, s.Hi, s.Stride, size, count) 824 } 825 print("\t},\n") 826 if off := findLatinOffset(ranges); off > 0 { 827 printf("\tLatinOffset: %d,\n", off) 828 } 829 print("}\n\n") 830 } 831 decl.Sort() 832 println("// These variables have type *RangeTable.") 833 println("var (") 834 for _, d := range decl { 835 print(d) 836 } 837 print(")\n\n") 838 } 839 840 func findLatinOffset(ranges []unicode.Range32) int { 841 i := 0 842 for i < len(ranges) && ranges[i].Hi <= unicode.MaxLatin1 { 843 i++ 844 } 845 return i 846 } 847 848 const ( 849 CaseUpper = 1 << iota 850 CaseLower 851 CaseTitle 852 CaseNone = 0 // must be zero 853 CaseMissing = -1 // character not present; not a valid case state 854 ) 855 856 type caseState struct { 857 point rune 858 _case int 859 deltaToUpper rune 860 deltaToLower rune 861 deltaToTitle rune 862 } 863 864 // Is d a continuation of the state of c? 865 func (c *caseState) adjacent(d *caseState) bool { 866 if d.point < c.point { 867 c, d = d, c 868 } 869 switch { 870 case d.point != c.point+1: // code points not adjacent (shouldn't happen) 871 return false 872 case d._case != c._case: // different cases 873 return c.upperLowerAdjacent(d) 874 case c._case == CaseNone: 875 return false 876 case c._case == CaseMissing: 877 return false 878 case d.deltaToUpper != c.deltaToUpper: 879 return false 880 case d.deltaToLower != c.deltaToLower: 881 return false 882 case d.deltaToTitle != c.deltaToTitle: 883 return false 884 } 885 return true 886 } 887 888 // Is d the same as c, but opposite in upper/lower case? this would make it 889 // an element of an UpperLower sequence. 890 func (c *caseState) upperLowerAdjacent(d *caseState) bool { 891 // check they're a matched case pair. we know they have adjacent values 892 switch { 893 case c._case == CaseUpper && d._case != CaseLower: 894 return false 895 case c._case == CaseLower && d._case != CaseUpper: 896 return false 897 } 898 // matched pair (at least in upper/lower). make the order Upper Lower 899 if c._case == CaseLower { 900 c, d = d, c 901 } 902 // for an Upper Lower sequence the deltas have to be in order 903 // c: 0 1 0 904 // d: -1 0 -1 905 switch { 906 case c.deltaToUpper != 0: 907 return false 908 case c.deltaToLower != 1: 909 return false 910 case c.deltaToTitle != 0: 911 return false 912 case d.deltaToUpper != -1: 913 return false 914 case d.deltaToLower != 0: 915 return false 916 case d.deltaToTitle != -1: 917 return false 918 } 919 return true 920 } 921 922 // Does this character start an UpperLower sequence? 923 func (c *caseState) isUpperLower() bool { 924 // for an Upper Lower sequence the deltas have to be in order 925 // c: 0 1 0 926 switch { 927 case c.deltaToUpper != 0: 928 return false 929 case c.deltaToLower != 1: 930 return false 931 case c.deltaToTitle != 0: 932 return false 933 } 934 return true 935 } 936 937 // Does this character start a LowerUpper sequence? 938 func (c *caseState) isLowerUpper() bool { 939 // for an Upper Lower sequence the deltas have to be in order 940 // c: -1 0 -1 941 switch { 942 case c.deltaToUpper != -1: 943 return false 944 case c.deltaToLower != 0: 945 return false 946 case c.deltaToTitle != -1: 947 return false 948 } 949 return true 950 } 951 952 func getCaseState(i rune) (c *caseState) { 953 c = &caseState{point: i, _case: CaseNone} 954 ch := &chars[i] 955 switch ch.codePoint { 956 case 0: 957 c._case = CaseMissing // Will get NUL wrong but that doesn't matter 958 return 959 case ch.upperCase: 960 c._case = CaseUpper 961 case ch.lowerCase: 962 c._case = CaseLower 963 case ch.titleCase: 964 c._case = CaseTitle 965 } 966 // Some things such as roman numeral U+2161 don't describe themselves 967 // as upper case, but have a lower case. Second-guess them. 968 if c._case == CaseNone && ch.lowerCase != 0 { 969 c._case = CaseUpper 970 } 971 // Same in the other direction. 972 if c._case == CaseNone && ch.upperCase != 0 { 973 c._case = CaseLower 974 } 975 976 if ch.upperCase != 0 { 977 c.deltaToUpper = ch.upperCase - i 978 } 979 if ch.lowerCase != 0 { 980 c.deltaToLower = ch.lowerCase - i 981 } 982 if ch.titleCase != 0 { 983 c.deltaToTitle = ch.titleCase - i 984 } 985 return 986 } 987 988 func printCases() { 989 if !*cases { 990 return 991 } 992 if *test { 993 fullCaseTest() 994 return 995 } 996 printf( 997 "// Generated by running\n"+ 998 "// maketables --data=%s --casefolding=%s\n"+ 999 "// DO NOT EDIT\n\n"+ 1000 "// CaseRanges is the table describing case mappings for all letters with\n"+ 1001 "// non-self mappings.\n"+ 1002 "var CaseRanges = _CaseRanges\n"+ 1003 "var _CaseRanges = []CaseRange {\n", 1004 *dataURL, *casefoldingURL) 1005 1006 var startState *caseState // the start of a run; nil for not active 1007 var prevState = &caseState{} // the state of the previous character 1008 for i := range chars { 1009 state := getCaseState(rune(i)) 1010 if state.adjacent(prevState) { 1011 prevState = state 1012 continue 1013 } 1014 // end of run (possibly) 1015 printCaseRange(startState, prevState) 1016 startState = nil 1017 if state._case != CaseMissing && state._case != CaseNone { 1018 startState = state 1019 } 1020 prevState = state 1021 } 1022 print("}\n") 1023 } 1024 1025 func printCaseRange(lo, hi *caseState) { 1026 if lo == nil { 1027 return 1028 } 1029 if lo.deltaToUpper == 0 && lo.deltaToLower == 0 && lo.deltaToTitle == 0 { 1030 // character represents itself in all cases - no need to mention it 1031 return 1032 } 1033 switch { 1034 case hi.point > lo.point && lo.isUpperLower(): 1035 printf("\t{0x%04X, 0x%04X, d{UpperLower, UpperLower, UpperLower}},\n", 1036 lo.point, hi.point) 1037 case hi.point > lo.point && lo.isLowerUpper(): 1038 logger.Fatalf("LowerUpper sequence: should not happen: %U. If it's real, need to fix To()", lo.point) 1039 printf("\t{0x%04X, 0x%04X, d{LowerUpper, LowerUpper, LowerUpper}},\n", 1040 lo.point, hi.point) 1041 default: 1042 printf("\t{0x%04X, 0x%04X, d{%d, %d, %d}},\n", 1043 lo.point, hi.point, 1044 lo.deltaToUpper, lo.deltaToLower, lo.deltaToTitle) 1045 } 1046 } 1047 1048 // If the cased value in the Char is 0, it means use the rune itself. 1049 func caseIt(r, cased rune) rune { 1050 if cased == 0 { 1051 return r 1052 } 1053 return cased 1054 } 1055 1056 func fullCaseTest() { 1057 for j, c := range chars { 1058 i := rune(j) 1059 lower := unicode.ToLower(i) 1060 want := caseIt(i, c.lowerCase) 1061 if lower != want { 1062 fmt.Fprintf(os.Stderr, "lower %U should be %U is %U\n", i, want, lower) 1063 } 1064 upper := unicode.ToUpper(i) 1065 want = caseIt(i, c.upperCase) 1066 if upper != want { 1067 fmt.Fprintf(os.Stderr, "upper %U should be %U is %U\n", i, want, upper) 1068 } 1069 title := unicode.ToTitle(i) 1070 want = caseIt(i, c.titleCase) 1071 if title != want { 1072 fmt.Fprintf(os.Stderr, "title %U should be %U is %U\n", i, want, title) 1073 } 1074 } 1075 } 1076 1077 func printLatinProperties() { 1078 if *test { 1079 return 1080 } 1081 println("var properties = [MaxLatin1+1]uint8{") 1082 for code := 0; code <= unicode.MaxLatin1; code++ { 1083 var property string 1084 switch chars[code].category { 1085 case "Cc", "": // NUL has no category. 1086 property = "pC" 1087 case "Cf": // soft hyphen, unique category, not printable. 1088 property = "0" 1089 case "Ll": 1090 property = "pLl | pp" 1091 case "Lo": 1092 property = "pLo | pp" 1093 case "Lu": 1094 property = "pLu | pp" 1095 case "Nd", "No": 1096 property = "pN | pp" 1097 case "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps": 1098 property = "pP | pp" 1099 case "Sc", "Sk", "Sm", "So": 1100 property = "pS | pp" 1101 case "Zs": 1102 property = "pZ" 1103 default: 1104 logger.Fatalf("%U has unknown category %q", code, chars[code].category) 1105 } 1106 // Special case 1107 if code == ' ' { 1108 property = "pZ | pp" 1109 } 1110 printf("\t0x%02X: %s, // %q\n", code, property, code) 1111 } 1112 printf("}\n\n") 1113 } 1114 1115 type runeSlice []rune 1116 1117 func (p runeSlice) Len() int { return len(p) } 1118 func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] } 1119 func (p runeSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] } 1120 1121 func printCasefold() { 1122 // Build list of case-folding groups attached to each canonical folded char (typically lower case). 1123 var caseOrbit = make([][]rune, MaxChar+1) 1124 for j := range chars { 1125 i := rune(j) 1126 c := &chars[i] 1127 if c.foldCase == 0 { 1128 continue 1129 } 1130 orb := caseOrbit[c.foldCase] 1131 if orb == nil { 1132 orb = append(orb, c.foldCase) 1133 } 1134 caseOrbit[c.foldCase] = append(orb, i) 1135 } 1136 1137 // Insert explicit 1-element groups when assuming [lower, upper] would be wrong. 1138 for j := range chars { 1139 i := rune(j) 1140 c := &chars[i] 1141 f := c.foldCase 1142 if f == 0 { 1143 f = i 1144 } 1145 orb := caseOrbit[f] 1146 if orb == nil && (c.upperCase != 0 && c.upperCase != i || c.lowerCase != 0 && c.lowerCase != i) { 1147 // Default assumption of [upper, lower] is wrong. 1148 caseOrbit[i] = []rune{i} 1149 } 1150 } 1151 1152 // Delete the groups for which assuming [lower, upper] or [upper, lower] is right. 1153 for i, orb := range caseOrbit { 1154 if len(orb) == 2 && chars[orb[0]].upperCase == orb[1] && chars[orb[1]].lowerCase == orb[0] { 1155 caseOrbit[i] = nil 1156 } 1157 if len(orb) == 2 && chars[orb[1]].upperCase == orb[0] && chars[orb[0]].lowerCase == orb[1] { 1158 caseOrbit[i] = nil 1159 } 1160 } 1161 1162 // Record orbit information in chars. 1163 for _, orb := range caseOrbit { 1164 if orb == nil { 1165 continue 1166 } 1167 sort.Sort(runeSlice(orb)) 1168 c := orb[len(orb)-1] 1169 for _, d := range orb { 1170 chars[c].caseOrbit = d 1171 c = d 1172 } 1173 } 1174 1175 printCaseOrbit() 1176 1177 // Tables of category and script folding exceptions: code points 1178 // that must be added when interpreting a particular category/script 1179 // in a case-folding context. 1180 cat := make(map[string]map[rune]bool) 1181 for name := range category { 1182 if x := foldExceptions(inCategory(name)); len(x) > 0 { 1183 cat[name] = x 1184 } 1185 } 1186 1187 scr := make(map[string]map[rune]bool) 1188 for name := range scripts { 1189 if x := foldExceptions(inScript(name)); len(x) > 0 { 1190 cat[name] = x 1191 } 1192 } 1193 1194 printCatFold("FoldCategory", cat) 1195 printCatFold("FoldScript", scr) 1196 } 1197 1198 // inCategory returns a list of all the runes in the category. 1199 func inCategory(name string) []rune { 1200 var x []rune 1201 for j := range chars { 1202 i := rune(j) 1203 c := &chars[i] 1204 if c.category == name || len(name) == 1 && len(c.category) > 1 && c.category[0] == name[0] { 1205 x = append(x, i) 1206 } 1207 } 1208 return x 1209 } 1210 1211 // inScript returns a list of all the runes in the script. 1212 func inScript(name string) []rune { 1213 var x []rune 1214 for _, s := range scripts[name] { 1215 for c := s.lo; c <= s.hi; c++ { 1216 x = append(x, rune(c)) 1217 } 1218 } 1219 return x 1220 } 1221 1222 // foldExceptions returns a list of all the runes fold-equivalent 1223 // to runes in class but not in class themselves. 1224 func foldExceptions(class []rune) map[rune]bool { 1225 // Create map containing class and all fold-equivalent chars. 1226 m := make(map[rune]bool) 1227 for _, r := range class { 1228 c := &chars[r] 1229 if c.caseOrbit == 0 { 1230 // Just upper and lower. 1231 if u := c.upperCase; u != 0 { 1232 m[u] = true 1233 } 1234 if l := c.lowerCase; l != 0 { 1235 m[l] = true 1236 } 1237 m[r] = true 1238 continue 1239 } 1240 // Otherwise walk orbit. 1241 r0 := r 1242 for { 1243 m[r] = true 1244 r = chars[r].caseOrbit 1245 if r == r0 { 1246 break 1247 } 1248 } 1249 } 1250 1251 // Remove class itself. 1252 for _, r := range class { 1253 delete(m, r) 1254 } 1255 1256 // What's left is the exceptions. 1257 return m 1258 } 1259 1260 var comment = map[string]string{ 1261 "FoldCategory": "// FoldCategory maps a category name to a table of\n" + 1262 "// code points outside the category that are equivalent under\n" + 1263 "// simple case folding to code points inside the category.\n" + 1264 "// If there is no entry for a category name, there are no such points.\n", 1265 1266 "FoldScript": "// FoldScript maps a script name to a table of\n" + 1267 "// code points outside the script that are equivalent under\n" + 1268 "// simple case folding to code points inside the script.\n" + 1269 "// If there is no entry for a script name, there are no such points.\n", 1270 } 1271 1272 func printCaseOrbit() { 1273 if *test { 1274 for j := range chars { 1275 i := rune(j) 1276 c := &chars[i] 1277 f := c.caseOrbit 1278 if f == 0 { 1279 if c.lowerCase != i && c.lowerCase != 0 { 1280 f = c.lowerCase 1281 } else if c.upperCase != i && c.upperCase != 0 { 1282 f = c.upperCase 1283 } else { 1284 f = i 1285 } 1286 } 1287 if g := unicode.SimpleFold(i); g != f { 1288 fmt.Fprintf(os.Stderr, "unicode.SimpleFold(%#U) = %#U, want %#U\n", i, g, f) 1289 } 1290 } 1291 return 1292 } 1293 1294 printf("var caseOrbit = []foldPair{\n") 1295 for i := range chars { 1296 c := &chars[i] 1297 if c.caseOrbit != 0 { 1298 printf("\t{0x%04X, 0x%04X},\n", i, c.caseOrbit) 1299 foldPairCount++ 1300 } 1301 } 1302 printf("}\n\n") 1303 } 1304 1305 func printCatFold(name string, m map[string]map[rune]bool) { 1306 if *test { 1307 var pkgMap map[string]*unicode.RangeTable 1308 if name == "FoldCategory" { 1309 pkgMap = unicode.FoldCategory 1310 } else { 1311 pkgMap = unicode.FoldScript 1312 } 1313 if len(pkgMap) != len(m) { 1314 fmt.Fprintf(os.Stderr, "unicode.%s has %d elements, want %d\n", name, len(pkgMap), len(m)) 1315 return 1316 } 1317 for k, v := range m { 1318 t, ok := pkgMap[k] 1319 if !ok { 1320 fmt.Fprintf(os.Stderr, "unicode.%s[%q] missing\n", name, k) 1321 continue 1322 } 1323 n := 0 1324 for _, r := range t.R16 { 1325 for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) { 1326 if !v[c] { 1327 fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c) 1328 } 1329 n++ 1330 } 1331 } 1332 for _, r := range t.R32 { 1333 for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) { 1334 if !v[c] { 1335 fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c) 1336 } 1337 n++ 1338 } 1339 } 1340 if n != len(v) { 1341 fmt.Fprintf(os.Stderr, "unicode.%s[%q] has %d code points, want %d\n", name, k, n, len(v)) 1342 } 1343 } 1344 return 1345 } 1346 1347 print(comment[name]) 1348 printf("var %s = map[string]*RangeTable{\n", name) 1349 for _, name := range allCatFold(m) { 1350 printf("\t%q: fold%s,\n", name, name) 1351 } 1352 printf("}\n\n") 1353 for _, name := range allCatFold(m) { 1354 class := m[name] 1355 dumpRange( 1356 fmt.Sprintf("var fold%s = &RangeTable{\n", name), 1357 func(code rune) bool { return class[code] }) 1358 } 1359 } 1360 1361 var range16Count = 0 // Number of entries in the 16-bit range tables. 1362 var range32Count = 0 // Number of entries in the 32-bit range tables. 1363 var foldPairCount = 0 // Number of fold pairs in the exception tables. 1364 1365 func printSizes() { 1366 if *test { 1367 return 1368 } 1369 println() 1370 printf("// Range entries: %d 16-bit, %d 32-bit, %d total.\n", range16Count, range32Count, range16Count+range32Count) 1371 range16Bytes := range16Count * 3 * 2 1372 range32Bytes := range32Count * 3 * 4 1373 printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", range16Bytes, range32Bytes, range16Bytes+range32Bytes) 1374 println() 1375 printf("// Fold orbit bytes: %d pairs, %d bytes\n", foldPairCount, foldPairCount*2*2) 1376 }