github.com/dannin/go@v0.0.0-20161031215817-d35dfd405eaa/src/unicode/maketables.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // +build ignore 6 7 // Unicode table generator. 8 // Data read from the web. 9 10 package main 11 12 import ( 13 "bufio" 14 "flag" 15 "fmt" 16 "io" 17 "log" 18 "net/http" 19 "os" 20 "os/exec" 21 "path/filepath" 22 "regexp" 23 "sort" 24 "strconv" 25 "strings" 26 "unicode" 27 ) 28 29 func main() { 30 flag.Parse() 31 setupOutput() 32 loadChars() // always needed 33 loadCasefold() 34 printCategories() 35 printScriptOrProperty(false) 36 printScriptOrProperty(true) 37 printCases() 38 printLatinProperties() 39 printCasefold() 40 printSizes() 41 flushOutput() 42 } 43 44 var dataURL = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt") 45 var casefoldingURL = flag.String("casefolding", "", "full URL for CaseFolding.txt; defaults to --url/CaseFolding.txt") 46 var url = flag.String("url", 47 "http://www.unicode.org/Public/9.0.0/ucd/", 48 "URL of Unicode database directory") 49 var tablelist = flag.String("tables", 50 "all", 51 "comma-separated list of which tables to generate; can be letter") 52 var scriptlist = flag.String("scripts", 53 "all", 54 "comma-separated list of which script tables to generate") 55 var proplist = flag.String("props", 56 "all", 57 "comma-separated list of which property tables to generate") 58 var cases = flag.Bool("cases", 59 true, 60 "generate case tables") 61 var test = flag.Bool("test", 62 false, 63 "test existing tables; can be used to compare web data with package data") 64 var localFiles = flag.Bool("local", 65 false, 66 "data files have been copied to current directory; for debugging only") 67 var outputFile = flag.String("output", 68 "", 69 "output file for generated tables; default stdout") 70 71 var scriptRe = 
regexp.MustCompile(`^([0-9A-F]+)(\.\.[0-9A-F]+)? *; ([A-Za-z_]+)$`) 72 var logger = log.New(os.Stderr, "", log.Lshortfile) 73 74 var output *bufio.Writer // points to os.Stdout or to "gofmt > outputFile" 75 76 func setupOutput() { 77 output = bufio.NewWriter(startGofmt()) 78 } 79 80 // startGofmt connects output to a gofmt process if -output is set. 81 func startGofmt() io.Writer { 82 if *outputFile == "" { 83 return os.Stdout 84 } 85 stdout, err := os.Create(*outputFile) 86 if err != nil { 87 logger.Fatal(err) 88 } 89 // Pipe output to gofmt. 90 gofmt := exec.Command("gofmt") 91 fd, err := gofmt.StdinPipe() 92 if err != nil { 93 logger.Fatal(err) 94 } 95 gofmt.Stdout = stdout 96 gofmt.Stderr = os.Stderr 97 err = gofmt.Start() 98 if err != nil { 99 logger.Fatal(err) 100 } 101 return fd 102 } 103 104 func flushOutput() { 105 err := output.Flush() 106 if err != nil { 107 logger.Fatal(err) 108 } 109 } 110 111 func printf(format string, args ...interface{}) { 112 fmt.Fprintf(output, format, args...) 113 } 114 115 func print(args ...interface{}) { 116 fmt.Fprint(output, args...) 117 } 118 119 func println(args ...interface{}) { 120 fmt.Fprintln(output, args...) 121 } 122 123 type reader struct { 124 *bufio.Reader 125 fd *os.File 126 resp *http.Response 127 } 128 129 func open(url string) *reader { 130 file := filepath.Base(url) 131 if *localFiles { 132 fd, err := os.Open(file) 133 if err != nil { 134 logger.Fatal(err) 135 } 136 return &reader{bufio.NewReader(fd), fd, nil} 137 } 138 resp, err := http.Get(url) 139 if err != nil { 140 logger.Fatal(err) 141 } 142 if resp.StatusCode != 200 { 143 logger.Fatalf("bad GET status for %s: %d", file, resp.Status) 144 } 145 return &reader{bufio.NewReader(resp.Body), nil, resp} 146 147 } 148 149 func (r *reader) close() { 150 if r.fd != nil { 151 r.fd.Close() 152 } else { 153 r.resp.Body.Close() 154 } 155 } 156 157 var category = map[string]bool{ 158 // Nd Lu etc. 
159 // We use one-character names to identify merged categories 160 "L": true, // Lu Ll Lt Lm Lo 161 "P": true, // Pc Pd Ps Pe Pu Pf Po 162 "M": true, // Mn Mc Me 163 "N": true, // Nd Nl No 164 "S": true, // Sm Sc Sk So 165 "Z": true, // Zs Zl Zp 166 "C": true, // Cc Cf Cs Co Cn 167 } 168 169 // UnicodeData.txt has form: 170 // 0037;DIGIT SEVEN;Nd;0;EN;;7;7;7;N;;;;; 171 // 007A;LATIN SMALL LETTER Z;Ll;0;L;;;;;N;;;005A;;005A 172 // See http://www.unicode.org/reports/tr44/ for a full explanation 173 // The fields: 174 const ( 175 FCodePoint = iota 176 FName 177 FGeneralCategory 178 FCanonicalCombiningClass 179 FBidiClass 180 FDecompositionTypeAndMapping 181 FNumericType 182 FNumericDigit // If a decimal digit. 183 FNumericValue // Includes non-decimal, e.g. U+2155=1/5 184 FBidiMirrored 185 FUnicode1Name 186 FISOComment 187 FSimpleUppercaseMapping 188 FSimpleLowercaseMapping 189 FSimpleTitlecaseMapping 190 NumField 191 192 MaxChar = 0x10FFFF // anything above this shouldn't exist 193 ) 194 195 var fieldName = []string{ 196 FCodePoint: "CodePoint", 197 FName: "Name", 198 FGeneralCategory: "GeneralCategory", 199 FCanonicalCombiningClass: "CanonicalCombiningClass", 200 FBidiClass: "BidiClass", 201 FDecompositionTypeAndMapping: "DecompositionTypeAndMapping", 202 FNumericType: "NumericType", 203 FNumericDigit: "NumericDigit", 204 FNumericValue: "NumericValue", 205 FBidiMirrored: "BidiMirrored", 206 FUnicode1Name: "Unicode1Name", 207 FISOComment: "ISOComment", 208 FSimpleUppercaseMapping: "SimpleUppercaseMapping", 209 FSimpleLowercaseMapping: "SimpleLowercaseMapping", 210 FSimpleTitlecaseMapping: "SimpleTitlecaseMapping", 211 } 212 213 // This contains only the properties we're interested in. 214 type Char struct { 215 field []string // debugging only; could be deleted if we take out char.dump() 216 codePoint rune // if zero, this index is not a valid code point. 
217 category string 218 upperCase rune 219 lowerCase rune 220 titleCase rune 221 foldCase rune // simple case folding 222 caseOrbit rune // next in simple case folding orbit 223 } 224 225 // Scripts.txt has form: 226 // A673 ; Cyrillic # Po SLAVONIC ASTERISK 227 // A67C..A67D ; Cyrillic # Mn [2] COMBINING CYRILLIC KAVYKA..COMBINING CYRILLIC PAYEROK 228 // See http://www.unicode.org/Public/5.1.0/ucd/UCD.html for full explanation 229 230 type Script struct { 231 lo, hi uint32 // range of code points 232 script string 233 } 234 235 var chars = make([]Char, MaxChar+1) 236 var scripts = make(map[string][]Script) 237 var props = make(map[string][]Script) // a property looks like a script; can share the format 238 239 var lastChar rune = 0 240 241 // In UnicodeData.txt, some ranges are marked like this: 242 // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; 243 // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;; 244 // parseCategory returns a state variable indicating the weirdness. 
245 type State int 246 247 const ( 248 SNormal State = iota // known to be zero for the type 249 SFirst 250 SLast 251 SMissing 252 ) 253 254 func parseCategory(line string) (state State) { 255 field := strings.Split(line, ";") 256 if len(field) != NumField { 257 logger.Fatalf("%5s: %d fields (expected %d)\n", line, len(field), NumField) 258 } 259 point, err := strconv.ParseUint(field[FCodePoint], 16, 64) 260 if err != nil { 261 logger.Fatalf("%.5s...: %s", line, err) 262 } 263 lastChar = rune(point) 264 if point > MaxChar { 265 return 266 } 267 char := &chars[point] 268 char.field = field 269 if char.codePoint != 0 { 270 logger.Fatalf("point %U reused", point) 271 } 272 char.codePoint = lastChar 273 char.category = field[FGeneralCategory] 274 category[char.category] = true 275 switch char.category { 276 case "Nd": 277 // Decimal digit 278 _, err := strconv.Atoi(field[FNumericValue]) 279 if err != nil { 280 logger.Fatalf("%U: bad numeric field: %s", point, err) 281 } 282 case "Lu": 283 char.letter(field[FCodePoint], field[FSimpleLowercaseMapping], field[FSimpleTitlecaseMapping]) 284 case "Ll": 285 char.letter(field[FSimpleUppercaseMapping], field[FCodePoint], field[FSimpleTitlecaseMapping]) 286 case "Lt": 287 char.letter(field[FSimpleUppercaseMapping], field[FSimpleLowercaseMapping], field[FCodePoint]) 288 default: 289 char.letter(field[FSimpleUppercaseMapping], field[FSimpleLowercaseMapping], field[FSimpleTitlecaseMapping]) 290 } 291 switch { 292 case strings.Index(field[FName], ", First>") > 0: 293 state = SFirst 294 case strings.Index(field[FName], ", Last>") > 0: 295 state = SLast 296 } 297 return 298 } 299 300 func (char *Char) dump(s string) { 301 print(s, " ") 302 for i := 0; i < len(char.field); i++ { 303 printf("%s:%q ", fieldName[i], char.field[i]) 304 } 305 print("\n") 306 } 307 308 func (char *Char) letter(u, l, t string) { 309 char.upperCase = char.letterValue(u, "U") 310 char.lowerCase = char.letterValue(l, "L") 311 char.titleCase = char.letterValue(t, 
"T") 312 } 313 314 func (char *Char) letterValue(s string, cas string) rune { 315 if s == "" { 316 return 0 317 } 318 v, err := strconv.ParseUint(s, 16, 64) 319 if err != nil { 320 char.dump(cas) 321 logger.Fatalf("%U: bad letter(%s): %s", char.codePoint, s, err) 322 } 323 return rune(v) 324 } 325 326 func allCategories() []string { 327 a := make([]string, 0, len(category)) 328 for k := range category { 329 a = append(a, k) 330 } 331 sort.Strings(a) 332 return a 333 } 334 335 func all(scripts map[string][]Script) []string { 336 a := make([]string, 0, len(scripts)) 337 for k := range scripts { 338 a = append(a, k) 339 } 340 sort.Strings(a) 341 return a 342 } 343 344 func allCatFold(m map[string]map[rune]bool) []string { 345 a := make([]string, 0, len(m)) 346 for k := range m { 347 a = append(a, k) 348 } 349 sort.Strings(a) 350 return a 351 } 352 353 // Extract the version number from the URL 354 func version() string { 355 // Break on slashes and look for the first numeric field 356 fields := strings.Split(*url, "/") 357 for _, f := range fields { 358 if len(f) > 0 && '0' <= f[0] && f[0] <= '9' { 359 return f 360 } 361 } 362 logger.Fatal("unknown version") 363 return "Unknown" 364 } 365 366 func categoryOp(code rune, class uint8) bool { 367 category := chars[code].category 368 return len(category) > 0 && category[0] == class 369 } 370 371 func loadChars() { 372 if *dataURL == "" { 373 flag.Set("data", *url+"UnicodeData.txt") 374 } 375 input := open(*dataURL) 376 defer input.close() 377 scanner := bufio.NewScanner(input) 378 var first rune = 0 379 for scanner.Scan() { 380 switch parseCategory(scanner.Text()) { 381 case SNormal: 382 if first != 0 { 383 logger.Fatalf("bad state normal at %U", lastChar) 384 } 385 case SFirst: 386 if first != 0 { 387 logger.Fatalf("bad state first at %U", lastChar) 388 } 389 first = lastChar 390 case SLast: 391 if first == 0 { 392 logger.Fatalf("bad state last at %U", lastChar) 393 } 394 for i := first + 1; i <= lastChar; i++ { 395 
chars[i] = chars[first] 396 chars[i].codePoint = i 397 } 398 first = 0 399 } 400 } 401 if scanner.Err() != nil { 402 logger.Fatal(scanner.Err()) 403 } 404 } 405 406 func loadCasefold() { 407 if *casefoldingURL == "" { 408 flag.Set("casefolding", *url+"CaseFolding.txt") 409 } 410 input := open(*casefoldingURL) 411 defer input.close() 412 scanner := bufio.NewScanner(input) 413 for scanner.Scan() { 414 line := scanner.Text() 415 if len(line) == 0 || line[0] == '#' || len(strings.TrimSpace(line)) == 0 { 416 continue 417 } 418 field := strings.Split(line, "; ") 419 if len(field) != 4 { 420 logger.Fatalf("CaseFolding.txt %.5s...: %d fields (expected %d)\n", line, len(field), 4) 421 } 422 kind := field[1] 423 if kind != "C" && kind != "S" { 424 // Only care about 'common' and 'simple' foldings. 425 continue 426 } 427 p1, err := strconv.ParseUint(field[0], 16, 64) 428 if err != nil { 429 logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err) 430 } 431 p2, err := strconv.ParseUint(field[2], 16, 64) 432 if err != nil { 433 logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err) 434 } 435 chars[p1].foldCase = rune(p2) 436 } 437 if scanner.Err() != nil { 438 logger.Fatal(scanner.Err()) 439 } 440 } 441 442 const progHeader = `// Copyright 2013 The Go Authors. All rights reserved. 443 // Use of this source code is governed by a BSD-style 444 // license that can be found in the LICENSE file. 
445 446 // Generated by running 447 // maketables --tables=%s --data=%s --casefolding=%s 448 // DO NOT EDIT 449 450 package unicode 451 452 ` 453 454 func printCategories() { 455 if *tablelist == "" { 456 return 457 } 458 // Find out which categories to dump 459 list := strings.Split(*tablelist, ",") 460 if *tablelist == "all" { 461 list = allCategories() 462 } 463 if *test { 464 fullCategoryTest(list) 465 return 466 } 467 printf(progHeader, *tablelist, *dataURL, *casefoldingURL) 468 469 println("// Version is the Unicode edition from which the tables are derived.") 470 printf("const Version = %q\n\n", version()) 471 472 if *tablelist == "all" { 473 println("// Categories is the set of Unicode category tables.") 474 println("var Categories = map[string] *RangeTable {") 475 for _, k := range allCategories() { 476 printf("\t%q: %s,\n", k, k) 477 } 478 print("}\n\n") 479 } 480 481 decl := make(sort.StringSlice, len(list)) 482 ndecl := 0 483 for _, name := range list { 484 if _, ok := category[name]; !ok { 485 logger.Fatal("unknown category", name) 486 } 487 // We generate an UpperCase name to serve as concise documentation and an _UnderScored 488 // name to store the data. This stops godoc dumping all the tables but keeps them 489 // available to clients. 
490 // Cases deserving special comments 491 varDecl := "" 492 switch name { 493 case "C": 494 varDecl = "\tOther = _C; // Other/C is the set of Unicode control and special characters, category C.\n" 495 varDecl += "\tC = _C\n" 496 case "L": 497 varDecl = "\tLetter = _L; // Letter/L is the set of Unicode letters, category L.\n" 498 varDecl += "\tL = _L\n" 499 case "M": 500 varDecl = "\tMark = _M; // Mark/M is the set of Unicode mark characters, category M.\n" 501 varDecl += "\tM = _M\n" 502 case "N": 503 varDecl = "\tNumber = _N; // Number/N is the set of Unicode number characters, category N.\n" 504 varDecl += "\tN = _N\n" 505 case "P": 506 varDecl = "\tPunct = _P; // Punct/P is the set of Unicode punctuation characters, category P.\n" 507 varDecl += "\tP = _P\n" 508 case "S": 509 varDecl = "\tSymbol = _S; // Symbol/S is the set of Unicode symbol characters, category S.\n" 510 varDecl += "\tS = _S\n" 511 case "Z": 512 varDecl = "\tSpace = _Z; // Space/Z is the set of Unicode space characters, category Z.\n" 513 varDecl += "\tZ = _Z\n" 514 case "Nd": 515 varDecl = "\tDigit = _Nd; // Digit is the set of Unicode characters with the \"decimal digit\" property.\n" 516 case "Lu": 517 varDecl = "\tUpper = _Lu; // Upper is the set of Unicode upper case letters.\n" 518 case "Ll": 519 varDecl = "\tLower = _Ll; // Lower is the set of Unicode lower case letters.\n" 520 case "Lt": 521 varDecl = "\tTitle = _Lt; // Title is the set of Unicode title case letters.\n" 522 } 523 if len(name) > 1 { 524 varDecl += fmt.Sprintf( 525 "\t%s = _%s; // %s is the set of Unicode characters in category %s.\n", 526 name, name, name, name) 527 } 528 decl[ndecl] = varDecl 529 ndecl++ 530 if len(name) == 1 { // unified categories 531 decl := fmt.Sprintf("var _%s = &RangeTable{\n", name) 532 dumpRange( 533 decl, 534 func(code rune) bool { return categoryOp(code, name[0]) }) 535 continue 536 } 537 dumpRange( 538 fmt.Sprintf("var _%s = &RangeTable{\n", name), 539 func(code rune) bool { return 
chars[code].category == name }) 540 } 541 decl.Sort() 542 println("// These variables have type *RangeTable.") 543 println("var (") 544 for _, d := range decl { 545 print(d) 546 } 547 print(")\n\n") 548 } 549 550 type Op func(code rune) bool 551 552 const format = "\t\t{0x%04x, 0x%04x, %d},\n" 553 554 func dumpRange(header string, inCategory Op) { 555 print(header) 556 next := rune(0) 557 latinOffset := 0 558 print("\tR16: []Range16{\n") 559 // one Range for each iteration 560 count := &range16Count 561 size := 16 562 for { 563 // look for start of range 564 for next < rune(len(chars)) && !inCategory(next) { 565 next++ 566 } 567 if next >= rune(len(chars)) { 568 // no characters remain 569 break 570 } 571 572 // start of range 573 lo := next 574 hi := next 575 stride := rune(1) 576 // accept lo 577 next++ 578 // look for another character to set the stride 579 for next < rune(len(chars)) && !inCategory(next) { 580 next++ 581 } 582 if next >= rune(len(chars)) { 583 // no more characters 584 printf(format, lo, hi, stride) 585 break 586 } 587 // set stride 588 stride = next - lo 589 // check for length of run. 
next points to first jump in stride 590 for i := next; i < rune(len(chars)); i++ { 591 if inCategory(i) == (((i - lo) % stride) == 0) { 592 // accept 593 if inCategory(i) { 594 hi = i 595 } 596 } else { 597 // no more characters in this run 598 break 599 } 600 } 601 if uint32(hi) <= unicode.MaxLatin1 { 602 latinOffset++ 603 } 604 size, count = printRange(uint32(lo), uint32(hi), uint32(stride), size, count) 605 // next range: start looking where this range ends 606 next = hi + 1 607 } 608 print("\t},\n") 609 if latinOffset > 0 { 610 printf("\tLatinOffset: %d,\n", latinOffset) 611 } 612 print("}\n\n") 613 } 614 615 func printRange(lo, hi, stride uint32, size int, count *int) (int, *int) { 616 if size == 16 && hi >= 1<<16 { 617 if lo < 1<<16 { 618 if lo+stride != hi { 619 logger.Fatalf("unexpected straddle: %U %U %d", lo, hi, stride) 620 } 621 // No range contains U+FFFF as an instance, so split 622 // the range into two entries. That way we can maintain 623 // the invariant that R32 contains only >= 1<<16. 
624 printf(format, lo, lo, 1) 625 lo = hi 626 stride = 1 627 *count++ 628 } 629 print("\t},\n") 630 print("\tR32: []Range32{\n") 631 size = 32 632 count = &range32Count 633 } 634 printf(format, lo, hi, stride) 635 *count++ 636 return size, count 637 } 638 639 func fullCategoryTest(list []string) { 640 for _, name := range list { 641 if _, ok := category[name]; !ok { 642 logger.Fatal("unknown category", name) 643 } 644 r, ok := unicode.Categories[name] 645 if !ok && len(name) > 1 { 646 logger.Fatalf("unknown table %q", name) 647 } 648 if len(name) == 1 { 649 verifyRange(name, func(code rune) bool { return categoryOp(code, name[0]) }, r) 650 } else { 651 verifyRange( 652 name, 653 func(code rune) bool { return chars[code].category == name }, 654 r) 655 } 656 } 657 } 658 659 func verifyRange(name string, inCategory Op, table *unicode.RangeTable) { 660 count := 0 661 for j := range chars { 662 i := rune(j) 663 web := inCategory(i) 664 pkg := unicode.Is(table, i) 665 if web != pkg { 666 fmt.Fprintf(os.Stderr, "%s: %U: web=%t pkg=%t\n", name, i, web, pkg) 667 count++ 668 if count > 10 { 669 break 670 } 671 } 672 } 673 } 674 675 func parseScript(line string, scripts map[string][]Script) { 676 comment := strings.Index(line, "#") 677 if comment >= 0 { 678 line = line[0:comment] 679 } 680 line = strings.TrimSpace(line) 681 if len(line) == 0 { 682 return 683 } 684 field := strings.Split(line, ";") 685 if len(field) != 2 { 686 logger.Fatalf("%s: %d fields (expected 2)\n", line, len(field)) 687 } 688 matches := scriptRe.FindStringSubmatch(line) 689 if len(matches) != 4 { 690 logger.Fatalf("%s: %d matches (expected 3)\n", line, len(matches)) 691 } 692 lo, err := strconv.ParseUint(matches[1], 16, 64) 693 if err != nil { 694 logger.Fatalf("%.5s...: %s", line, err) 695 } 696 hi := lo 697 if len(matches[2]) > 2 { // ignore leading .. 
698 hi, err = strconv.ParseUint(matches[2][2:], 16, 64) 699 if err != nil { 700 logger.Fatalf("%.5s...: %s", line, err) 701 } 702 } 703 name := matches[3] 704 scripts[name] = append(scripts[name], Script{uint32(lo), uint32(hi), name}) 705 } 706 707 // The script tables have a lot of adjacent elements. Fold them together. 708 func foldAdjacent(r []Script) []unicode.Range32 { 709 s := make([]unicode.Range32, 0, len(r)) 710 j := 0 711 for i := 0; i < len(r); i++ { 712 if j > 0 && r[i].lo == s[j-1].Hi+1 { 713 s[j-1].Hi = r[i].hi 714 } else { 715 s = s[0 : j+1] 716 s[j] = unicode.Range32{ 717 Lo: uint32(r[i].lo), 718 Hi: uint32(r[i].hi), 719 Stride: 1, 720 } 721 j++ 722 } 723 } 724 return s 725 } 726 727 func fullScriptTest(list []string, installed map[string]*unicode.RangeTable, scripts map[string][]Script) { 728 for _, name := range list { 729 if _, ok := scripts[name]; !ok { 730 logger.Fatal("unknown script", name) 731 } 732 _, ok := installed[name] 733 if !ok { 734 logger.Fatal("unknown table", name) 735 } 736 for _, script := range scripts[name] { 737 for r := script.lo; r <= script.hi; r++ { 738 if !unicode.Is(installed[name], rune(r)) { 739 fmt.Fprintf(os.Stderr, "%U: not in script %s\n", r, name) 740 } 741 } 742 } 743 } 744 } 745 746 var deprecatedAliases = map[string]string{ 747 "Sentence_Terminal": "STerm", 748 } 749 750 // PropList.txt has the same format as Scripts.txt so we can share its parser. 
751 func printScriptOrProperty(doProps bool) { 752 flag := "scripts" 753 flaglist := *scriptlist 754 file := "Scripts.txt" 755 table := scripts 756 installed := unicode.Scripts 757 if doProps { 758 flag = "props" 759 flaglist = *proplist 760 file = "PropList.txt" 761 table = props 762 installed = unicode.Properties 763 } 764 if flaglist == "" { 765 return 766 } 767 input := open(*url + file) 768 scanner := bufio.NewScanner(input) 769 for scanner.Scan() { 770 parseScript(scanner.Text(), table) 771 } 772 if scanner.Err() != nil { 773 logger.Fatal(scanner.Err()) 774 } 775 input.close() 776 777 // Find out which scripts to dump 778 list := strings.Split(flaglist, ",") 779 if flaglist == "all" { 780 list = all(table) 781 } 782 if *test { 783 fullScriptTest(list, installed, table) 784 return 785 } 786 787 printf( 788 "// Generated by running\n"+ 789 "// maketables --%s=%s --url=%s\n"+ 790 "// DO NOT EDIT\n\n", 791 flag, 792 flaglist, 793 *url) 794 if flaglist == "all" { 795 if doProps { 796 println("// Properties is the set of Unicode property tables.") 797 println("var Properties = map[string] *RangeTable{") 798 } else { 799 println("// Scripts is the set of Unicode script tables.") 800 println("var Scripts = map[string] *RangeTable{") 801 } 802 for _, k := range all(table) { 803 printf("\t%q: %s,\n", k, k) 804 if alias, ok := deprecatedAliases[k]; ok { 805 printf("\t%q: %s,\n", alias, k) 806 } 807 } 808 print("}\n\n") 809 } 810 811 decl := make(sort.StringSlice, len(list)+len(deprecatedAliases)) 812 ndecl := 0 813 for _, name := range list { 814 if doProps { 815 decl[ndecl] = fmt.Sprintf( 816 "\t%s = _%s;\t// %s is the set of Unicode characters with property %s.\n", 817 name, name, name, name) 818 } else { 819 decl[ndecl] = fmt.Sprintf( 820 "\t%s = _%s;\t// %s is the set of Unicode characters in script %s.\n", 821 name, name, name, name) 822 } 823 ndecl++ 824 if alias, ok := deprecatedAliases[name]; ok { 825 decl[ndecl] = fmt.Sprintf( 826 "\t%[1]s = _%[2]s;\t// %[1]s 
is an alias for %[2]s.\n", 827 alias, name) 828 ndecl++ 829 } 830 printf("var _%s = &RangeTable {\n", name) 831 ranges := foldAdjacent(table[name]) 832 print("\tR16: []Range16{\n") 833 size := 16 834 count := &range16Count 835 for _, s := range ranges { 836 size, count = printRange(s.Lo, s.Hi, s.Stride, size, count) 837 } 838 print("\t},\n") 839 if off := findLatinOffset(ranges); off > 0 { 840 printf("\tLatinOffset: %d,\n", off) 841 } 842 print("}\n\n") 843 } 844 decl.Sort() 845 println("// These variables have type *RangeTable.") 846 println("var (") 847 for _, d := range decl { 848 print(d) 849 } 850 print(")\n\n") 851 } 852 853 func findLatinOffset(ranges []unicode.Range32) int { 854 i := 0 855 for i < len(ranges) && ranges[i].Hi <= unicode.MaxLatin1 { 856 i++ 857 } 858 return i 859 } 860 861 const ( 862 CaseUpper = 1 << iota 863 CaseLower 864 CaseTitle 865 CaseNone = 0 // must be zero 866 CaseMissing = -1 // character not present; not a valid case state 867 ) 868 869 type caseState struct { 870 point rune 871 _case int 872 deltaToUpper rune 873 deltaToLower rune 874 deltaToTitle rune 875 } 876 877 // Is d a continuation of the state of c? 878 func (c *caseState) adjacent(d *caseState) bool { 879 if d.point < c.point { 880 c, d = d, c 881 } 882 switch { 883 case d.point != c.point+1: // code points not adjacent (shouldn't happen) 884 return false 885 case d._case != c._case: // different cases 886 return c.upperLowerAdjacent(d) 887 case c._case == CaseNone: 888 return false 889 case c._case == CaseMissing: 890 return false 891 case d.deltaToUpper != c.deltaToUpper: 892 return false 893 case d.deltaToLower != c.deltaToLower: 894 return false 895 case d.deltaToTitle != c.deltaToTitle: 896 return false 897 } 898 return true 899 } 900 901 // Is d the same as c, but opposite in upper/lower case? this would make it 902 // an element of an UpperLower sequence. 903 func (c *caseState) upperLowerAdjacent(d *caseState) bool { 904 // check they're a matched case pair. 
we know they have adjacent values 905 switch { 906 case c._case == CaseUpper && d._case != CaseLower: 907 return false 908 case c._case == CaseLower && d._case != CaseUpper: 909 return false 910 } 911 // matched pair (at least in upper/lower). make the order Upper Lower 912 if c._case == CaseLower { 913 c, d = d, c 914 } 915 // for an Upper Lower sequence the deltas have to be in order 916 // c: 0 1 0 917 // d: -1 0 -1 918 switch { 919 case c.deltaToUpper != 0: 920 return false 921 case c.deltaToLower != 1: 922 return false 923 case c.deltaToTitle != 0: 924 return false 925 case d.deltaToUpper != -1: 926 return false 927 case d.deltaToLower != 0: 928 return false 929 case d.deltaToTitle != -1: 930 return false 931 } 932 return true 933 } 934 935 // Does this character start an UpperLower sequence? 936 func (c *caseState) isUpperLower() bool { 937 // for an Upper Lower sequence the deltas have to be in order 938 // c: 0 1 0 939 switch { 940 case c.deltaToUpper != 0: 941 return false 942 case c.deltaToLower != 1: 943 return false 944 case c.deltaToTitle != 0: 945 return false 946 } 947 return true 948 } 949 950 // Does this character start a LowerUpper sequence? 951 func (c *caseState) isLowerUpper() bool { 952 // for an Upper Lower sequence the deltas have to be in order 953 // c: -1 0 -1 954 switch { 955 case c.deltaToUpper != -1: 956 return false 957 case c.deltaToLower != 0: 958 return false 959 case c.deltaToTitle != -1: 960 return false 961 } 962 return true 963 } 964 965 func getCaseState(i rune) (c *caseState) { 966 c = &caseState{point: i, _case: CaseNone} 967 ch := &chars[i] 968 switch ch.codePoint { 969 case 0: 970 c._case = CaseMissing // Will get NUL wrong but that doesn't matter 971 return 972 case ch.upperCase: 973 c._case = CaseUpper 974 case ch.lowerCase: 975 c._case = CaseLower 976 case ch.titleCase: 977 c._case = CaseTitle 978 } 979 // Some things such as roman numeral U+2161 don't describe themselves 980 // as upper case, but have a lower case. 
Second-guess them. 981 if c._case == CaseNone && ch.lowerCase != 0 { 982 c._case = CaseUpper 983 } 984 // Same in the other direction. 985 if c._case == CaseNone && ch.upperCase != 0 { 986 c._case = CaseLower 987 } 988 989 if ch.upperCase != 0 { 990 c.deltaToUpper = ch.upperCase - i 991 } 992 if ch.lowerCase != 0 { 993 c.deltaToLower = ch.lowerCase - i 994 } 995 if ch.titleCase != 0 { 996 c.deltaToTitle = ch.titleCase - i 997 } 998 return 999 } 1000 1001 func printCases() { 1002 if !*cases { 1003 return 1004 } 1005 if *test { 1006 fullCaseTest() 1007 return 1008 } 1009 printf( 1010 "// Generated by running\n"+ 1011 "// maketables --data=%s --casefolding=%s\n"+ 1012 "// DO NOT EDIT\n\n"+ 1013 "// CaseRanges is the table describing case mappings for all letters with\n"+ 1014 "// non-self mappings.\n"+ 1015 "var CaseRanges = _CaseRanges\n"+ 1016 "var _CaseRanges = []CaseRange {\n", 1017 *dataURL, *casefoldingURL) 1018 1019 var startState *caseState // the start of a run; nil for not active 1020 var prevState = &caseState{} // the state of the previous character 1021 for i := range chars { 1022 state := getCaseState(rune(i)) 1023 if state.adjacent(prevState) { 1024 prevState = state 1025 continue 1026 } 1027 // end of run (possibly) 1028 printCaseRange(startState, prevState) 1029 startState = nil 1030 if state._case != CaseMissing && state._case != CaseNone { 1031 startState = state 1032 } 1033 prevState = state 1034 } 1035 print("}\n") 1036 } 1037 1038 func printCaseRange(lo, hi *caseState) { 1039 if lo == nil { 1040 return 1041 } 1042 if lo.deltaToUpper == 0 && lo.deltaToLower == 0 && lo.deltaToTitle == 0 { 1043 // character represents itself in all cases - no need to mention it 1044 return 1045 } 1046 switch { 1047 case hi.point > lo.point && lo.isUpperLower(): 1048 printf("\t{0x%04X, 0x%04X, d{UpperLower, UpperLower, UpperLower}},\n", 1049 lo.point, hi.point) 1050 case hi.point > lo.point && lo.isLowerUpper(): 1051 logger.Fatalf("LowerUpper sequence: should not 
happen: %U. If it's real, need to fix To()", lo.point)
		printf("\t{0x%04X, 0x%04X, d{LowerUpper, LowerUpper, LowerUpper}},\n",
			lo.point, hi.point)
	default:
		printf("\t{0x%04X, 0x%04X, d{%d, %d, %d}},\n",
			lo.point, hi.point,
			lo.deltaToUpper, lo.deltaToLower, lo.deltaToTitle)
	}
}

// caseIt returns the case mapping to expect for r.
// If the cased value in the Char is 0, it means use the rune itself.
func caseIt(r, cased rune) rune {
	if cased == 0 {
		return r
	}
	return cased
}

// fullCaseTest checks unicode.ToLower, unicode.ToUpper and unicode.ToTitle
// for every entry of chars against the mappings recorded there, and reports
// each disagreement on standard error.
func fullCaseTest() {
	for j, c := range chars {
		i := rune(j)
		lower := unicode.ToLower(i)
		want := caseIt(i, c.lowerCase)
		if lower != want {
			fmt.Fprintf(os.Stderr, "lower %U should be %U is %U\n", i, want, lower)
		}
		upper := unicode.ToUpper(i)
		want = caseIt(i, c.upperCase)
		if upper != want {
			fmt.Fprintf(os.Stderr, "upper %U should be %U is %U\n", i, want, upper)
		}
		title := unicode.ToTitle(i)
		want = caseIt(i, c.titleCase)
		if title != want {
			fmt.Fprintf(os.Stderr, "title %U should be %U is %U\n", i, want, title)
		}
	}
}

// printLatinProperties emits the "properties" array, one entry per code
// point in [0, unicode.MaxLatin1], holding the property-flag expression
// chosen from that code point's general category. It emits nothing in
// -test mode. An unrecognized category is fatal.
func printLatinProperties() {
	if *test {
		return
	}
	println("var properties = [MaxLatin1+1]uint8{")
	for code := 0; code <= unicode.MaxLatin1; code++ {
		var property string
		switch chars[code].category {
		case "Cc", "": // NUL has no category.
			property = "pC"
		case "Cf": // soft hyphen, unique category, not printable.
			property = "0"
		case "Ll":
			property = "pLl | pp"
		case "Lo":
			property = "pLo | pp"
		case "Lu":
			property = "pLu | pp"
		case "Nd", "No":
			property = "pN | pp"
		case "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps":
			property = "pP | pp"
		case "Sc", "Sk", "Sm", "So":
			property = "pS | pp"
		case "Zs":
			property = "pZ"
		default:
			logger.Fatalf("%U has unknown category %q", code, chars[code].category)
		}
		// Special case: the space character overrides the plain "pZ"
		// that its Zs category would otherwise select.
		if code == ' ' {
			property = "pZ | pp"
		}
		printf("\t0x%02X: %s, // %q\n", code, property, code)
	}
	printf("}\n\n")
}

// runeSlice implements sort.Interface for a slice of runes, ordering them
// by increasing code point.
type runeSlice []rune

func (p runeSlice) Len() int           { return len(p) }
func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] }
func (p runeSlice) Swap(i, j int)      { p[i], p[j] = p[j], p[i] }

// printCasefold computes the case-folding orbits, records them in the
// caseOrbit field of chars, and then prints the asciiFold and caseOrbit
// tables followed by the FoldCategory and FoldScript exception tables.
func printCasefold() {
	// Build list of case-folding groups attached to each canonical folded char (typically lower case).
	caseOrbit := make([][]rune, MaxChar+1)
	for j := range chars {
		i := rune(j)
		c := &chars[i]
		if c.foldCase == 0 {
			continue
		}
		orb := caseOrbit[c.foldCase]
		if orb == nil {
			// First member seen: the group starts with the folded char itself.
			orb = append(orb, c.foldCase)
		}
		caseOrbit[c.foldCase] = append(orb, i)
	}

	// Insert explicit 1-element groups when assuming [lower, upper] would be wrong.
	for j := range chars {
		i := rune(j)
		c := &chars[i]
		f := c.foldCase
		if f == 0 {
			f = i
		}
		orb := caseOrbit[f]
		if orb == nil && (c.upperCase != 0 && c.upperCase != i || c.lowerCase != 0 && c.lowerCase != i) {
			// Default assumption of [upper, lower] is wrong.
			caseOrbit[i] = []rune{i}
		}
	}

	// Delete the groups for which assuming [lower, upper] or [upper, lower] is right.
	for i, orb := range caseOrbit {
		if len(orb) == 2 && chars[orb[0]].upperCase == orb[1] && chars[orb[1]].lowerCase == orb[0] {
			caseOrbit[i] = nil
		}
		if len(orb) == 2 && chars[orb[1]].upperCase == orb[0] && chars[orb[0]].lowerCase == orb[1] {
			caseOrbit[i] = nil
		}
	}

	// Record orbit information in chars. After sorting, every member points
	// at the next larger member and the largest wraps around to the smallest,
	// so a 1-element group points at itself.
	for _, orb := range caseOrbit {
		if orb == nil {
			continue
		}
		sort.Sort(runeSlice(orb))
		c := orb[len(orb)-1]
		for _, d := range orb {
			chars[c].caseOrbit = d
			c = d
		}
	}

	printAsciiFold()
	printCaseOrbit()

	// Tables of category and script folding exceptions: code points
	// that must be added when interpreting a particular category/script
	// in a case-folding context.
	cat := make(map[string]map[rune]bool)
	for name := range category {
		if x := foldExceptions(inCategory(name)); len(x) > 0 {
			cat[name] = x
		}
	}

	scr := make(map[string]map[rune]bool)
	for name := range scripts {
		if x := foldExceptions(inScript(name)); len(x) > 0 {
			// Script exceptions go in scr, which is printed as FoldScript.
			// Storing them in cat left FoldScript empty and put the script
			// names into FoldCategory.
			scr[name] = x
		}
	}

	printCatFold("FoldCategory", cat)
	printCatFold("FoldScript", scr)
}

// inCategory returns a list of all the runes in the category.
// A one-letter name also matches every longer category name that
// begins with that letter.
func inCategory(name string) []rune {
	var x []rune
	for j := range chars {
		i := rune(j)
		c := &chars[i]
		if c.category == name || len(name) == 1 && len(c.category) > 1 && c.category[0] == name[0] {
			x = append(x, i)
		}
	}
	return x
}

// inScript returns a list of all the runes in the script,
// expanding each of the script's ranges from lo through hi inclusive.
func inScript(name string) []rune {
	var x []rune
	for _, s := range scripts[name] {
		for c := s.lo; c <= s.hi; c++ {
			x = append(x, rune(c))
		}
	}
	return x
}

// foldExceptions returns a list of all the runes fold-equivalent
// to runes in class but not in class themselves.
1238 func foldExceptions(class []rune) map[rune]bool { 1239 // Create map containing class and all fold-equivalent chars. 1240 m := make(map[rune]bool) 1241 for _, r := range class { 1242 c := &chars[r] 1243 if c.caseOrbit == 0 { 1244 // Just upper and lower. 1245 if u := c.upperCase; u != 0 { 1246 m[u] = true 1247 } 1248 if l := c.lowerCase; l != 0 { 1249 m[l] = true 1250 } 1251 m[r] = true 1252 continue 1253 } 1254 // Otherwise walk orbit. 1255 r0 := r 1256 for { 1257 m[r] = true 1258 r = chars[r].caseOrbit 1259 if r == r0 { 1260 break 1261 } 1262 } 1263 } 1264 1265 // Remove class itself. 1266 for _, r := range class { 1267 delete(m, r) 1268 } 1269 1270 // What's left is the exceptions. 1271 return m 1272 } 1273 1274 var comment = map[string]string{ 1275 "FoldCategory": "// FoldCategory maps a category name to a table of\n" + 1276 "// code points outside the category that are equivalent under\n" + 1277 "// simple case folding to code points inside the category.\n" + 1278 "// If there is no entry for a category name, there are no such points.\n", 1279 1280 "FoldScript": "// FoldScript maps a script name to a table of\n" + 1281 "// code points outside the script that are equivalent under\n" + 1282 "// simple case folding to code points inside the script.\n" + 1283 "// If there is no entry for a script name, there are no such points.\n", 1284 } 1285 1286 func printAsciiFold() { 1287 printf("var asciiFold = [MaxASCII + 1]uint16{\n") 1288 for i := rune(0); i <= unicode.MaxASCII; i++ { 1289 c := chars[i] 1290 f := c.caseOrbit 1291 if f == 0 { 1292 if c.lowerCase != i && c.lowerCase != 0 { 1293 f = c.lowerCase 1294 } else if c.upperCase != i && c.upperCase != 0 { 1295 f = c.upperCase 1296 } else { 1297 f = i 1298 } 1299 } 1300 printf("\t0x%04X,\n", f) 1301 } 1302 printf("}\n\n") 1303 } 1304 1305 func printCaseOrbit() { 1306 if *test { 1307 for j := range chars { 1308 i := rune(j) 1309 c := &chars[i] 1310 f := c.caseOrbit 1311 if f == 0 { 1312 if c.lowerCase != i && 
c.lowerCase != 0 { 1313 f = c.lowerCase 1314 } else if c.upperCase != i && c.upperCase != 0 { 1315 f = c.upperCase 1316 } else { 1317 f = i 1318 } 1319 } 1320 if g := unicode.SimpleFold(i); g != f { 1321 fmt.Fprintf(os.Stderr, "unicode.SimpleFold(%#U) = %#U, want %#U\n", i, g, f) 1322 } 1323 } 1324 return 1325 } 1326 1327 printf("var caseOrbit = []foldPair{\n") 1328 for i := range chars { 1329 c := &chars[i] 1330 if c.caseOrbit != 0 { 1331 printf("\t{0x%04X, 0x%04X},\n", i, c.caseOrbit) 1332 foldPairCount++ 1333 } 1334 } 1335 printf("}\n\n") 1336 } 1337 1338 func printCatFold(name string, m map[string]map[rune]bool) { 1339 if *test { 1340 var pkgMap map[string]*unicode.RangeTable 1341 if name == "FoldCategory" { 1342 pkgMap = unicode.FoldCategory 1343 } else { 1344 pkgMap = unicode.FoldScript 1345 } 1346 if len(pkgMap) != len(m) { 1347 fmt.Fprintf(os.Stderr, "unicode.%s has %d elements, want %d\n", name, len(pkgMap), len(m)) 1348 return 1349 } 1350 for k, v := range m { 1351 t, ok := pkgMap[k] 1352 if !ok { 1353 fmt.Fprintf(os.Stderr, "unicode.%s[%q] missing\n", name, k) 1354 continue 1355 } 1356 n := 0 1357 for _, r := range t.R16 { 1358 for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) { 1359 if !v[c] { 1360 fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c) 1361 } 1362 n++ 1363 } 1364 } 1365 for _, r := range t.R32 { 1366 for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) { 1367 if !v[c] { 1368 fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c) 1369 } 1370 n++ 1371 } 1372 } 1373 if n != len(v) { 1374 fmt.Fprintf(os.Stderr, "unicode.%s[%q] has %d code points, want %d\n", name, k, n, len(v)) 1375 } 1376 } 1377 return 1378 } 1379 1380 print(comment[name]) 1381 printf("var %s = map[string]*RangeTable{\n", name) 1382 for _, name := range allCatFold(m) { 1383 printf("\t%q: fold%s,\n", name, name) 1384 } 1385 printf("}\n\n") 1386 for _, name := range allCatFold(m) { 1387 class := m[name] 1388 
dumpRange( 1389 fmt.Sprintf("var fold%s = &RangeTable{\n", name), 1390 func(code rune) bool { return class[code] }) 1391 } 1392 } 1393 1394 var range16Count = 0 // Number of entries in the 16-bit range tables. 1395 var range32Count = 0 // Number of entries in the 32-bit range tables. 1396 var foldPairCount = 0 // Number of fold pairs in the exception tables. 1397 1398 func printSizes() { 1399 if *test { 1400 return 1401 } 1402 println() 1403 printf("// Range entries: %d 16-bit, %d 32-bit, %d total.\n", range16Count, range32Count, range16Count+range32Count) 1404 range16Bytes := range16Count * 3 * 2 1405 range32Bytes := range32Count * 3 * 4 1406 printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", range16Bytes, range32Bytes, range16Bytes+range32Bytes) 1407 println() 1408 printf("// Fold orbit bytes: %d pairs, %d bytes\n", foldPairCount, foldPairCount*2*2) 1409 }