github.com/mattn/go@v0.0.0-20171011075504-07f7db3ea99f/src/unicode/maketables.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // +build ignore 6 7 // Unicode table generator. 8 // Data read from the web. 9 10 package main 11 12 import ( 13 "bufio" 14 "flag" 15 "fmt" 16 "io" 17 "log" 18 "net/http" 19 "os" 20 "os/exec" 21 "path/filepath" 22 "regexp" 23 "sort" 24 "strconv" 25 "strings" 26 "unicode" 27 ) 28 29 func main() { 30 flag.Parse() 31 setupOutput() 32 loadChars() // always needed 33 loadCasefold() 34 printCategories() 35 printScriptOrProperty(false) 36 printScriptOrProperty(true) 37 printCases() 38 printLatinProperties() 39 printCasefold() 40 printSizes() 41 flushOutput() 42 } 43 44 func defaultVersion() string { 45 if v := os.Getenv("UNICODE_VERSION"); v != "" { 46 return v 47 } 48 return unicode.Version 49 } 50 51 var dataURL = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt") 52 var casefoldingURL = flag.String("casefolding", "", "full URL for CaseFolding.txt; defaults to --url/CaseFolding.txt") 53 var url = flag.String("url", 54 "http://www.unicode.org/Public/"+defaultVersion()+"/ucd/", 55 "URL of Unicode database directory") 56 var tablelist = flag.String("tables", 57 "all", 58 "comma-separated list of which tables to generate; can be letter") 59 var scriptlist = flag.String("scripts", 60 "all", 61 "comma-separated list of which script tables to generate") 62 var proplist = flag.String("props", 63 "all", 64 "comma-separated list of which property tables to generate") 65 var cases = flag.Bool("cases", 66 true, 67 "generate case tables") 68 var test = flag.Bool("test", 69 false, 70 "test existing tables; can be used to compare web data with package data") 71 var localFiles = flag.Bool("local", 72 false, 73 "data files have been copied to current directory; for debugging only") 74 var 
outputFile = flag.String("output", 75 "", 76 "output file for generated tables; default stdout") 77 78 var scriptRe = regexp.MustCompile(`^([0-9A-F]+)(\.\.[0-9A-F]+)? *; ([A-Za-z_]+)$`) 79 var logger = log.New(os.Stderr, "", log.Lshortfile) 80 81 var output *bufio.Writer // points to os.Stdout or to "gofmt > outputFile" 82 83 func setupOutput() { 84 output = bufio.NewWriter(startGofmt()) 85 } 86 87 // startGofmt connects output to a gofmt process if -output is set. 88 func startGofmt() io.Writer { 89 if *outputFile == "" { 90 return os.Stdout 91 } 92 stdout, err := os.Create(*outputFile) 93 if err != nil { 94 logger.Fatal(err) 95 } 96 // Pipe output to gofmt. 97 gofmt := exec.Command("gofmt") 98 fd, err := gofmt.StdinPipe() 99 if err != nil { 100 logger.Fatal(err) 101 } 102 gofmt.Stdout = stdout 103 gofmt.Stderr = os.Stderr 104 err = gofmt.Start() 105 if err != nil { 106 logger.Fatal(err) 107 } 108 return fd 109 } 110 111 func flushOutput() { 112 err := output.Flush() 113 if err != nil { 114 logger.Fatal(err) 115 } 116 } 117 118 func printf(format string, args ...interface{}) { 119 fmt.Fprintf(output, format, args...) 120 } 121 122 func print(args ...interface{}) { 123 fmt.Fprint(output, args...) 124 } 125 126 func println(args ...interface{}) { 127 fmt.Fprintln(output, args...) 
128 } 129 130 type reader struct { 131 *bufio.Reader 132 fd *os.File 133 resp *http.Response 134 } 135 136 func open(url string) *reader { 137 file := filepath.Base(url) 138 if *localFiles { 139 fd, err := os.Open(file) 140 if err != nil { 141 logger.Fatal(err) 142 } 143 return &reader{bufio.NewReader(fd), fd, nil} 144 } 145 resp, err := http.Get(url) 146 if err != nil { 147 logger.Fatal(err) 148 } 149 if resp.StatusCode != 200 { 150 logger.Fatalf("bad GET status for %s: %d", file, resp.Status) 151 } 152 return &reader{bufio.NewReader(resp.Body), nil, resp} 153 154 } 155 156 func (r *reader) close() { 157 if r.fd != nil { 158 r.fd.Close() 159 } else { 160 r.resp.Body.Close() 161 } 162 } 163 164 var category = map[string]bool{ 165 // Nd Lu etc. 166 // We use one-character names to identify merged categories 167 "L": true, // Lu Ll Lt Lm Lo 168 "P": true, // Pc Pd Ps Pe Pu Pf Po 169 "M": true, // Mn Mc Me 170 "N": true, // Nd Nl No 171 "S": true, // Sm Sc Sk So 172 "Z": true, // Zs Zl Zp 173 "C": true, // Cc Cf Cs Co Cn 174 } 175 176 // UnicodeData.txt has form: 177 // 0037;DIGIT SEVEN;Nd;0;EN;;7;7;7;N;;;;; 178 // 007A;LATIN SMALL LETTER Z;Ll;0;L;;;;;N;;;005A;;005A 179 // See http://www.unicode.org/reports/tr44/ for a full explanation 180 // The fields: 181 const ( 182 FCodePoint = iota 183 FName 184 FGeneralCategory 185 FCanonicalCombiningClass 186 FBidiClass 187 FDecompositionTypeAndMapping 188 FNumericType 189 FNumericDigit // If a decimal digit. 190 FNumericValue // Includes non-decimal, e.g. 
U+2155=1/5 191 FBidiMirrored 192 FUnicode1Name 193 FISOComment 194 FSimpleUppercaseMapping 195 FSimpleLowercaseMapping 196 FSimpleTitlecaseMapping 197 NumField 198 199 MaxChar = 0x10FFFF // anything above this shouldn't exist 200 ) 201 202 var fieldName = []string{ 203 FCodePoint: "CodePoint", 204 FName: "Name", 205 FGeneralCategory: "GeneralCategory", 206 FCanonicalCombiningClass: "CanonicalCombiningClass", 207 FBidiClass: "BidiClass", 208 FDecompositionTypeAndMapping: "DecompositionTypeAndMapping", 209 FNumericType: "NumericType", 210 FNumericDigit: "NumericDigit", 211 FNumericValue: "NumericValue", 212 FBidiMirrored: "BidiMirrored", 213 FUnicode1Name: "Unicode1Name", 214 FISOComment: "ISOComment", 215 FSimpleUppercaseMapping: "SimpleUppercaseMapping", 216 FSimpleLowercaseMapping: "SimpleLowercaseMapping", 217 FSimpleTitlecaseMapping: "SimpleTitlecaseMapping", 218 } 219 220 // This contains only the properties we're interested in. 221 type Char struct { 222 field []string // debugging only; could be deleted if we take out char.dump() 223 codePoint rune // if zero, this index is not a valid code point. 
224 category string 225 upperCase rune 226 lowerCase rune 227 titleCase rune 228 foldCase rune // simple case folding 229 caseOrbit rune // next in simple case folding orbit 230 } 231 232 // Scripts.txt has form: 233 // A673 ; Cyrillic # Po SLAVONIC ASTERISK 234 // A67C..A67D ; Cyrillic # Mn [2] COMBINING CYRILLIC KAVYKA..COMBINING CYRILLIC PAYEROK 235 // See http://www.unicode.org/Public/5.1.0/ucd/UCD.html for full explanation 236 237 type Script struct { 238 lo, hi uint32 // range of code points 239 script string 240 } 241 242 var chars = make([]Char, MaxChar+1) 243 var scripts = make(map[string][]Script) 244 var props = make(map[string][]Script) // a property looks like a script; can share the format 245 246 var lastChar rune = 0 247 248 // In UnicodeData.txt, some ranges are marked like this: 249 // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; 250 // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;; 251 // parseCategory returns a state variable indicating the weirdness. 
252 type State int 253 254 const ( 255 SNormal State = iota // known to be zero for the type 256 SFirst 257 SLast 258 SMissing 259 ) 260 261 func parseCategory(line string) (state State) { 262 field := strings.Split(line, ";") 263 if len(field) != NumField { 264 logger.Fatalf("%5s: %d fields (expected %d)\n", line, len(field), NumField) 265 } 266 point, err := strconv.ParseUint(field[FCodePoint], 16, 64) 267 if err != nil { 268 logger.Fatalf("%.5s...: %s", line, err) 269 } 270 lastChar = rune(point) 271 if point > MaxChar { 272 return 273 } 274 char := &chars[point] 275 char.field = field 276 if char.codePoint != 0 { 277 logger.Fatalf("point %U reused", point) 278 } 279 char.codePoint = lastChar 280 char.category = field[FGeneralCategory] 281 category[char.category] = true 282 switch char.category { 283 case "Nd": 284 // Decimal digit 285 _, err := strconv.Atoi(field[FNumericValue]) 286 if err != nil { 287 logger.Fatalf("%U: bad numeric field: %s", point, err) 288 } 289 case "Lu": 290 char.letter(field[FCodePoint], field[FSimpleLowercaseMapping], field[FSimpleTitlecaseMapping]) 291 case "Ll": 292 char.letter(field[FSimpleUppercaseMapping], field[FCodePoint], field[FSimpleTitlecaseMapping]) 293 case "Lt": 294 char.letter(field[FSimpleUppercaseMapping], field[FSimpleLowercaseMapping], field[FCodePoint]) 295 default: 296 char.letter(field[FSimpleUppercaseMapping], field[FSimpleLowercaseMapping], field[FSimpleTitlecaseMapping]) 297 } 298 switch { 299 case strings.Index(field[FName], ", First>") > 0: 300 state = SFirst 301 case strings.Index(field[FName], ", Last>") > 0: 302 state = SLast 303 } 304 return 305 } 306 307 func (char *Char) dump(s string) { 308 print(s, " ") 309 for i := 0; i < len(char.field); i++ { 310 printf("%s:%q ", fieldName[i], char.field[i]) 311 } 312 print("\n") 313 } 314 315 func (char *Char) letter(u, l, t string) { 316 char.upperCase = char.letterValue(u, "U") 317 char.lowerCase = char.letterValue(l, "L") 318 char.titleCase = char.letterValue(t, 
"T") 319 } 320 321 func (char *Char) letterValue(s string, cas string) rune { 322 if s == "" { 323 return 0 324 } 325 v, err := strconv.ParseUint(s, 16, 64) 326 if err != nil { 327 char.dump(cas) 328 logger.Fatalf("%U: bad letter(%s): %s", char.codePoint, s, err) 329 } 330 return rune(v) 331 } 332 333 func allCategories() []string { 334 a := make([]string, 0, len(category)) 335 for k := range category { 336 a = append(a, k) 337 } 338 sort.Strings(a) 339 return a 340 } 341 342 func all(scripts map[string][]Script) []string { 343 a := make([]string, 0, len(scripts)) 344 for k := range scripts { 345 a = append(a, k) 346 } 347 sort.Strings(a) 348 return a 349 } 350 351 func allCatFold(m map[string]map[rune]bool) []string { 352 a := make([]string, 0, len(m)) 353 for k := range m { 354 a = append(a, k) 355 } 356 sort.Strings(a) 357 return a 358 } 359 360 // Extract the version number from the URL 361 func version() string { 362 // Break on slashes and look for the first numeric field 363 fields := strings.Split(*url, "/") 364 for _, f := range fields { 365 if len(f) > 0 && '0' <= f[0] && f[0] <= '9' { 366 return f 367 } 368 } 369 logger.Fatal("unknown version") 370 return "Unknown" 371 } 372 373 func categoryOp(code rune, class uint8) bool { 374 category := chars[code].category 375 return len(category) > 0 && category[0] == class 376 } 377 378 func loadChars() { 379 if *dataURL == "" { 380 flag.Set("data", *url+"UnicodeData.txt") 381 } 382 input := open(*dataURL) 383 defer input.close() 384 scanner := bufio.NewScanner(input) 385 var first rune = 0 386 for scanner.Scan() { 387 switch parseCategory(scanner.Text()) { 388 case SNormal: 389 if first != 0 { 390 logger.Fatalf("bad state normal at %U", lastChar) 391 } 392 case SFirst: 393 if first != 0 { 394 logger.Fatalf("bad state first at %U", lastChar) 395 } 396 first = lastChar 397 case SLast: 398 if first == 0 { 399 logger.Fatalf("bad state last at %U", lastChar) 400 } 401 for i := first + 1; i <= lastChar; i++ { 402 
chars[i] = chars[first] 403 chars[i].codePoint = i 404 } 405 first = 0 406 } 407 } 408 if scanner.Err() != nil { 409 logger.Fatal(scanner.Err()) 410 } 411 } 412 413 func loadCasefold() { 414 if *casefoldingURL == "" { 415 flag.Set("casefolding", *url+"CaseFolding.txt") 416 } 417 input := open(*casefoldingURL) 418 defer input.close() 419 scanner := bufio.NewScanner(input) 420 for scanner.Scan() { 421 line := scanner.Text() 422 if len(line) == 0 || line[0] == '#' || len(strings.TrimSpace(line)) == 0 { 423 continue 424 } 425 field := strings.Split(line, "; ") 426 if len(field) != 4 { 427 logger.Fatalf("CaseFolding.txt %.5s...: %d fields (expected %d)\n", line, len(field), 4) 428 } 429 kind := field[1] 430 if kind != "C" && kind != "S" { 431 // Only care about 'common' and 'simple' foldings. 432 continue 433 } 434 p1, err := strconv.ParseUint(field[0], 16, 64) 435 if err != nil { 436 logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err) 437 } 438 p2, err := strconv.ParseUint(field[2], 16, 64) 439 if err != nil { 440 logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err) 441 } 442 chars[p1].foldCase = rune(p2) 443 } 444 if scanner.Err() != nil { 445 logger.Fatal(scanner.Err()) 446 } 447 } 448 449 const progHeader = `// Copyright 2013 The Go Authors. All rights reserved. 450 // Use of this source code is governed by a BSD-style 451 // license that can be found in the LICENSE file. 452 453 // Code generated by maketables; DO NOT EDIT. 
454 // To regenerate, run: 455 // maketables --tables=%s --data=%s --casefolding=%s 456 457 package unicode 458 459 ` 460 461 func printCategories() { 462 if *tablelist == "" { 463 return 464 } 465 // Find out which categories to dump 466 list := strings.Split(*tablelist, ",") 467 if *tablelist == "all" { 468 list = allCategories() 469 } 470 if *test { 471 fullCategoryTest(list) 472 return 473 } 474 printf(progHeader, *tablelist, *dataURL, *casefoldingURL) 475 476 println("// Version is the Unicode edition from which the tables are derived.") 477 printf("const Version = %q\n\n", version()) 478 479 if *tablelist == "all" { 480 println("// Categories is the set of Unicode category tables.") 481 println("var Categories = map[string] *RangeTable {") 482 for _, k := range allCategories() { 483 printf("\t%q: %s,\n", k, k) 484 } 485 print("}\n\n") 486 } 487 488 decl := make(sort.StringSlice, len(list)) 489 ndecl := 0 490 for _, name := range list { 491 if _, ok := category[name]; !ok { 492 logger.Fatal("unknown category", name) 493 } 494 // We generate an UpperCase name to serve as concise documentation and an _UnderScored 495 // name to store the data. This stops godoc dumping all the tables but keeps them 496 // available to clients. 
497 // Cases deserving special comments 498 varDecl := "" 499 switch name { 500 case "C": 501 varDecl = "\tOther = _C; // Other/C is the set of Unicode control and special characters, category C.\n" 502 varDecl += "\tC = _C\n" 503 case "L": 504 varDecl = "\tLetter = _L; // Letter/L is the set of Unicode letters, category L.\n" 505 varDecl += "\tL = _L\n" 506 case "M": 507 varDecl = "\tMark = _M; // Mark/M is the set of Unicode mark characters, category M.\n" 508 varDecl += "\tM = _M\n" 509 case "N": 510 varDecl = "\tNumber = _N; // Number/N is the set of Unicode number characters, category N.\n" 511 varDecl += "\tN = _N\n" 512 case "P": 513 varDecl = "\tPunct = _P; // Punct/P is the set of Unicode punctuation characters, category P.\n" 514 varDecl += "\tP = _P\n" 515 case "S": 516 varDecl = "\tSymbol = _S; // Symbol/S is the set of Unicode symbol characters, category S.\n" 517 varDecl += "\tS = _S\n" 518 case "Z": 519 varDecl = "\tSpace = _Z; // Space/Z is the set of Unicode space characters, category Z.\n" 520 varDecl += "\tZ = _Z\n" 521 case "Nd": 522 varDecl = "\tDigit = _Nd; // Digit is the set of Unicode characters with the \"decimal digit\" property.\n" 523 case "Lu": 524 varDecl = "\tUpper = _Lu; // Upper is the set of Unicode upper case letters.\n" 525 case "Ll": 526 varDecl = "\tLower = _Ll; // Lower is the set of Unicode lower case letters.\n" 527 case "Lt": 528 varDecl = "\tTitle = _Lt; // Title is the set of Unicode title case letters.\n" 529 } 530 if len(name) > 1 { 531 varDecl += fmt.Sprintf( 532 "\t%s = _%s; // %s is the set of Unicode characters in category %s.\n", 533 name, name, name, name) 534 } 535 decl[ndecl] = varDecl 536 ndecl++ 537 if len(name) == 1 { // unified categories 538 decl := fmt.Sprintf("var _%s = &RangeTable{\n", name) 539 dumpRange( 540 decl, 541 func(code rune) bool { return categoryOp(code, name[0]) }) 542 continue 543 } 544 dumpRange( 545 fmt.Sprintf("var _%s = &RangeTable{\n", name), 546 func(code rune) bool { return 
chars[code].category == name }) 547 } 548 decl.Sort() 549 println("// These variables have type *RangeTable.") 550 println("var (") 551 for _, d := range decl { 552 print(d) 553 } 554 print(")\n\n") 555 } 556 557 type Op func(code rune) bool 558 559 const format = "\t\t{0x%04x, 0x%04x, %d},\n" 560 561 func dumpRange(header string, inCategory Op) { 562 print(header) 563 next := rune(0) 564 latinOffset := 0 565 print("\tR16: []Range16{\n") 566 // one Range for each iteration 567 count := &range16Count 568 size := 16 569 for { 570 // look for start of range 571 for next < rune(len(chars)) && !inCategory(next) { 572 next++ 573 } 574 if next >= rune(len(chars)) { 575 // no characters remain 576 break 577 } 578 579 // start of range 580 lo := next 581 hi := next 582 stride := rune(1) 583 // accept lo 584 next++ 585 // look for another character to set the stride 586 for next < rune(len(chars)) && !inCategory(next) { 587 next++ 588 } 589 if next >= rune(len(chars)) { 590 // no more characters 591 printf(format, lo, hi, stride) 592 break 593 } 594 // set stride 595 stride = next - lo 596 // check for length of run. 
next points to first jump in stride 597 for i := next; i < rune(len(chars)); i++ { 598 if inCategory(i) == (((i - lo) % stride) == 0) { 599 // accept 600 if inCategory(i) { 601 hi = i 602 } 603 } else { 604 // no more characters in this run 605 break 606 } 607 } 608 if uint32(hi) <= unicode.MaxLatin1 { 609 latinOffset++ 610 } 611 size, count = printRange(uint32(lo), uint32(hi), uint32(stride), size, count) 612 // next range: start looking where this range ends 613 next = hi + 1 614 } 615 print("\t},\n") 616 if latinOffset > 0 { 617 printf("\tLatinOffset: %d,\n", latinOffset) 618 } 619 print("}\n\n") 620 } 621 622 func printRange(lo, hi, stride uint32, size int, count *int) (int, *int) { 623 if size == 16 && hi >= 1<<16 { 624 if lo < 1<<16 { 625 if lo+stride != hi { 626 logger.Fatalf("unexpected straddle: %U %U %d", lo, hi, stride) 627 } 628 // No range contains U+FFFF as an instance, so split 629 // the range into two entries. That way we can maintain 630 // the invariant that R32 contains only >= 1<<16. 
631 printf(format, lo, lo, 1) 632 lo = hi 633 stride = 1 634 *count++ 635 } 636 print("\t},\n") 637 print("\tR32: []Range32{\n") 638 size = 32 639 count = &range32Count 640 } 641 printf(format, lo, hi, stride) 642 *count++ 643 return size, count 644 } 645 646 func fullCategoryTest(list []string) { 647 for _, name := range list { 648 if _, ok := category[name]; !ok { 649 logger.Fatal("unknown category", name) 650 } 651 r, ok := unicode.Categories[name] 652 if !ok && len(name) > 1 { 653 logger.Fatalf("unknown table %q", name) 654 } 655 if len(name) == 1 { 656 verifyRange(name, func(code rune) bool { return categoryOp(code, name[0]) }, r) 657 } else { 658 verifyRange( 659 name, 660 func(code rune) bool { return chars[code].category == name }, 661 r) 662 } 663 } 664 } 665 666 func verifyRange(name string, inCategory Op, table *unicode.RangeTable) { 667 count := 0 668 for j := range chars { 669 i := rune(j) 670 web := inCategory(i) 671 pkg := unicode.Is(table, i) 672 if web != pkg { 673 fmt.Fprintf(os.Stderr, "%s: %U: web=%t pkg=%t\n", name, i, web, pkg) 674 count++ 675 if count > 10 { 676 break 677 } 678 } 679 } 680 } 681 682 func parseScript(line string, scripts map[string][]Script) { 683 comment := strings.Index(line, "#") 684 if comment >= 0 { 685 line = line[0:comment] 686 } 687 line = strings.TrimSpace(line) 688 if len(line) == 0 { 689 return 690 } 691 field := strings.Split(line, ";") 692 if len(field) != 2 { 693 logger.Fatalf("%s: %d fields (expected 2)\n", line, len(field)) 694 } 695 matches := scriptRe.FindStringSubmatch(line) 696 if len(matches) != 4 { 697 logger.Fatalf("%s: %d matches (expected 3)\n", line, len(matches)) 698 } 699 lo, err := strconv.ParseUint(matches[1], 16, 64) 700 if err != nil { 701 logger.Fatalf("%.5s...: %s", line, err) 702 } 703 hi := lo 704 if len(matches[2]) > 2 { // ignore leading .. 
705 hi, err = strconv.ParseUint(matches[2][2:], 16, 64) 706 if err != nil { 707 logger.Fatalf("%.5s...: %s", line, err) 708 } 709 } 710 name := matches[3] 711 scripts[name] = append(scripts[name], Script{uint32(lo), uint32(hi), name}) 712 } 713 714 // The script tables have a lot of adjacent elements. Fold them together. 715 func foldAdjacent(r []Script) []unicode.Range32 { 716 s := make([]unicode.Range32, 0, len(r)) 717 j := 0 718 for i := 0; i < len(r); i++ { 719 if j > 0 && r[i].lo == s[j-1].Hi+1 { 720 s[j-1].Hi = r[i].hi 721 } else { 722 s = s[0 : j+1] 723 s[j] = unicode.Range32{ 724 Lo: uint32(r[i].lo), 725 Hi: uint32(r[i].hi), 726 Stride: 1, 727 } 728 j++ 729 } 730 } 731 return s 732 } 733 734 func fullScriptTest(list []string, installed map[string]*unicode.RangeTable, scripts map[string][]Script) { 735 for _, name := range list { 736 if _, ok := scripts[name]; !ok { 737 logger.Fatal("unknown script", name) 738 } 739 _, ok := installed[name] 740 if !ok { 741 logger.Fatal("unknown table", name) 742 } 743 for _, script := range scripts[name] { 744 for r := script.lo; r <= script.hi; r++ { 745 if !unicode.Is(installed[name], rune(r)) { 746 fmt.Fprintf(os.Stderr, "%U: not in script %s\n", r, name) 747 } 748 } 749 } 750 } 751 } 752 753 var deprecatedAliases = map[string]string{ 754 "Sentence_Terminal": "STerm", 755 } 756 757 // PropList.txt has the same format as Scripts.txt so we can share its parser. 
758 func printScriptOrProperty(doProps bool) { 759 flag := "scripts" 760 flaglist := *scriptlist 761 file := "Scripts.txt" 762 table := scripts 763 installed := unicode.Scripts 764 if doProps { 765 flag = "props" 766 flaglist = *proplist 767 file = "PropList.txt" 768 table = props 769 installed = unicode.Properties 770 } 771 if flaglist == "" { 772 return 773 } 774 input := open(*url + file) 775 scanner := bufio.NewScanner(input) 776 for scanner.Scan() { 777 parseScript(scanner.Text(), table) 778 } 779 if scanner.Err() != nil { 780 logger.Fatal(scanner.Err()) 781 } 782 input.close() 783 784 // Find out which scripts to dump 785 list := strings.Split(flaglist, ",") 786 if flaglist == "all" { 787 list = all(table) 788 } 789 if *test { 790 fullScriptTest(list, installed, table) 791 return 792 } 793 794 printf( 795 "// Generated by running\n"+ 796 "// maketables --%s=%s --url=%s\n"+ 797 "// DO NOT EDIT\n\n", 798 flag, 799 flaglist, 800 *url) 801 if flaglist == "all" { 802 if doProps { 803 println("// Properties is the set of Unicode property tables.") 804 println("var Properties = map[string] *RangeTable{") 805 } else { 806 println("// Scripts is the set of Unicode script tables.") 807 println("var Scripts = map[string] *RangeTable{") 808 } 809 for _, k := range all(table) { 810 printf("\t%q: %s,\n", k, k) 811 if alias, ok := deprecatedAliases[k]; ok { 812 printf("\t%q: %s,\n", alias, k) 813 } 814 } 815 print("}\n\n") 816 } 817 818 decl := make(sort.StringSlice, len(list)+len(deprecatedAliases)) 819 ndecl := 0 820 for _, name := range list { 821 if doProps { 822 decl[ndecl] = fmt.Sprintf( 823 "\t%s = _%s;\t// %s is the set of Unicode characters with property %s.\n", 824 name, name, name, name) 825 } else { 826 decl[ndecl] = fmt.Sprintf( 827 "\t%s = _%s;\t// %s is the set of Unicode characters in script %s.\n", 828 name, name, name, name) 829 } 830 ndecl++ 831 if alias, ok := deprecatedAliases[name]; ok { 832 decl[ndecl] = fmt.Sprintf( 833 "\t%[1]s = _%[2]s;\t// %[1]s 
is an alias for %[2]s.\n", 834 alias, name) 835 ndecl++ 836 } 837 printf("var _%s = &RangeTable {\n", name) 838 ranges := foldAdjacent(table[name]) 839 print("\tR16: []Range16{\n") 840 size := 16 841 count := &range16Count 842 for _, s := range ranges { 843 size, count = printRange(s.Lo, s.Hi, s.Stride, size, count) 844 } 845 print("\t},\n") 846 if off := findLatinOffset(ranges); off > 0 { 847 printf("\tLatinOffset: %d,\n", off) 848 } 849 print("}\n\n") 850 } 851 decl.Sort() 852 println("// These variables have type *RangeTable.") 853 println("var (") 854 for _, d := range decl { 855 print(d) 856 } 857 print(")\n\n") 858 } 859 860 func findLatinOffset(ranges []unicode.Range32) int { 861 i := 0 862 for i < len(ranges) && ranges[i].Hi <= unicode.MaxLatin1 { 863 i++ 864 } 865 return i 866 } 867 868 const ( 869 CaseUpper = 1 << iota 870 CaseLower 871 CaseTitle 872 CaseNone = 0 // must be zero 873 CaseMissing = -1 // character not present; not a valid case state 874 ) 875 876 type caseState struct { 877 point rune 878 _case int 879 deltaToUpper rune 880 deltaToLower rune 881 deltaToTitle rune 882 } 883 884 // Is d a continuation of the state of c? 885 func (c *caseState) adjacent(d *caseState) bool { 886 if d.point < c.point { 887 c, d = d, c 888 } 889 switch { 890 case d.point != c.point+1: // code points not adjacent (shouldn't happen) 891 return false 892 case d._case != c._case: // different cases 893 return c.upperLowerAdjacent(d) 894 case c._case == CaseNone: 895 return false 896 case c._case == CaseMissing: 897 return false 898 case d.deltaToUpper != c.deltaToUpper: 899 return false 900 case d.deltaToLower != c.deltaToLower: 901 return false 902 case d.deltaToTitle != c.deltaToTitle: 903 return false 904 } 905 return true 906 } 907 908 // Is d the same as c, but opposite in upper/lower case? this would make it 909 // an element of an UpperLower sequence. 910 func (c *caseState) upperLowerAdjacent(d *caseState) bool { 911 // check they're a matched case pair. 
we know they have adjacent values 912 switch { 913 case c._case == CaseUpper && d._case != CaseLower: 914 return false 915 case c._case == CaseLower && d._case != CaseUpper: 916 return false 917 } 918 // matched pair (at least in upper/lower). make the order Upper Lower 919 if c._case == CaseLower { 920 c, d = d, c 921 } 922 // for an Upper Lower sequence the deltas have to be in order 923 // c: 0 1 0 924 // d: -1 0 -1 925 switch { 926 case c.deltaToUpper != 0: 927 return false 928 case c.deltaToLower != 1: 929 return false 930 case c.deltaToTitle != 0: 931 return false 932 case d.deltaToUpper != -1: 933 return false 934 case d.deltaToLower != 0: 935 return false 936 case d.deltaToTitle != -1: 937 return false 938 } 939 return true 940 } 941 942 // Does this character start an UpperLower sequence? 943 func (c *caseState) isUpperLower() bool { 944 // for an Upper Lower sequence the deltas have to be in order 945 // c: 0 1 0 946 switch { 947 case c.deltaToUpper != 0: 948 return false 949 case c.deltaToLower != 1: 950 return false 951 case c.deltaToTitle != 0: 952 return false 953 } 954 return true 955 } 956 957 // Does this character start a LowerUpper sequence? 958 func (c *caseState) isLowerUpper() bool { 959 // for an Upper Lower sequence the deltas have to be in order 960 // c: -1 0 -1 961 switch { 962 case c.deltaToUpper != -1: 963 return false 964 case c.deltaToLower != 0: 965 return false 966 case c.deltaToTitle != -1: 967 return false 968 } 969 return true 970 } 971 972 func getCaseState(i rune) (c *caseState) { 973 c = &caseState{point: i, _case: CaseNone} 974 ch := &chars[i] 975 switch ch.codePoint { 976 case 0: 977 c._case = CaseMissing // Will get NUL wrong but that doesn't matter 978 return 979 case ch.upperCase: 980 c._case = CaseUpper 981 case ch.lowerCase: 982 c._case = CaseLower 983 case ch.titleCase: 984 c._case = CaseTitle 985 } 986 // Some things such as roman numeral U+2161 don't describe themselves 987 // as upper case, but have a lower case. 
Second-guess them. 988 if c._case == CaseNone && ch.lowerCase != 0 { 989 c._case = CaseUpper 990 } 991 // Same in the other direction. 992 if c._case == CaseNone && ch.upperCase != 0 { 993 c._case = CaseLower 994 } 995 996 if ch.upperCase != 0 { 997 c.deltaToUpper = ch.upperCase - i 998 } 999 if ch.lowerCase != 0 { 1000 c.deltaToLower = ch.lowerCase - i 1001 } 1002 if ch.titleCase != 0 { 1003 c.deltaToTitle = ch.titleCase - i 1004 } 1005 return 1006 } 1007 1008 func printCases() { 1009 if !*cases { 1010 return 1011 } 1012 if *test { 1013 fullCaseTest() 1014 return 1015 } 1016 printf( 1017 "// Generated by running\n"+ 1018 "// maketables --data=%s --casefolding=%s\n"+ 1019 "// DO NOT EDIT\n\n"+ 1020 "// CaseRanges is the table describing case mappings for all letters with\n"+ 1021 "// non-self mappings.\n"+ 1022 "var CaseRanges = _CaseRanges\n"+ 1023 "var _CaseRanges = []CaseRange {\n", 1024 *dataURL, *casefoldingURL) 1025 1026 var startState *caseState // the start of a run; nil for not active 1027 var prevState = &caseState{} // the state of the previous character 1028 for i := range chars { 1029 state := getCaseState(rune(i)) 1030 if state.adjacent(prevState) { 1031 prevState = state 1032 continue 1033 } 1034 // end of run (possibly) 1035 printCaseRange(startState, prevState) 1036 startState = nil 1037 if state._case != CaseMissing && state._case != CaseNone { 1038 startState = state 1039 } 1040 prevState = state 1041 } 1042 print("}\n") 1043 } 1044 1045 func printCaseRange(lo, hi *caseState) { 1046 if lo == nil { 1047 return 1048 } 1049 if lo.deltaToUpper == 0 && lo.deltaToLower == 0 && lo.deltaToTitle == 0 { 1050 // character represents itself in all cases - no need to mention it 1051 return 1052 } 1053 switch { 1054 case hi.point > lo.point && lo.isUpperLower(): 1055 printf("\t{0x%04X, 0x%04X, d{UpperLower, UpperLower, UpperLower}},\n", 1056 lo.point, hi.point) 1057 case hi.point > lo.point && lo.isLowerUpper(): 1058 logger.Fatalf("LowerUpper sequence: should 
not happen: %U. If it's real, need to fix To()", lo.point) 1059 printf("\t{0x%04X, 0x%04X, d{LowerUpper, LowerUpper, LowerUpper}},\n", 1060 lo.point, hi.point) 1061 default: 1062 printf("\t{0x%04X, 0x%04X, d{%d, %d, %d}},\n", 1063 lo.point, hi.point, 1064 lo.deltaToUpper, lo.deltaToLower, lo.deltaToTitle) 1065 } 1066 }

// caseIt returns the effective case mapping for r.
// If the cased value in the Char is 0, it means use the rune itself.
func caseIt(r, cased rune) rune {
	if cased == 0 {
		return r
	}
	return cased
}

// fullCaseTest compares unicode.ToLower, unicode.ToUpper and unicode.ToTitle
// against the mappings recorded in chars for every index of chars, and
// reports each mismatch on standard error.
func fullCaseTest() {
	for j, c := range chars {
		i := rune(j)
		lower := unicode.ToLower(i)
		want := caseIt(i, c.lowerCase)
		if lower != want {
			fmt.Fprintf(os.Stderr, "lower %U should be %U is %U\n", i, want, lower)
		}
		upper := unicode.ToUpper(i)
		want = caseIt(i, c.upperCase)
		if upper != want {
			fmt.Fprintf(os.Stderr, "upper %U should be %U is %U\n", i, want, upper)
		}
		title := unicode.ToTitle(i)
		want = caseIt(i, c.titleCase)
		if title != want {
			fmt.Fprintf(os.Stderr, "title %U should be %U is %U\n", i, want, title)
		}
	}
}

// printLatinProperties emits the "properties" array, one entry per code
// point from 0 through unicode.MaxLatin1. Each entry is a flag expression
// selected from the code point's category. An unrecognized category aborts
// the program. Nothing is printed in test mode.
func printLatinProperties() {
	if *test {
		return
	}
	println("var properties = [MaxLatin1+1]uint8{")
	for code := 0; code <= unicode.MaxLatin1; code++ {
		var property string
		switch chars[code].category {
		case "Cc", "": // NUL has no category.
			property = "pC"
		case "Cf": // soft hyphen, unique category, not printable.
			property = "0"
		case "Ll":
			property = "pLl | pp"
		case "Lo":
			property = "pLo | pp"
		case "Lu":
			property = "pLu | pp"
		case "Nd", "No":
			property = "pN | pp"
		case "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps":
			property = "pP | pp"
		case "Sc", "Sk", "Sm", "So":
			property = "pS | pp"
		case "Zs":
			property = "pZ"
		default:
			logger.Fatalf("%U has unknown category %q", code, chars[code].category)
		}
		// Special case: the space character additionally gets the pp flag,
		// unlike the other "Zs" code points.
		if code == ' ' {
			property = "pZ | pp"
		}
		printf("\t0x%02X: %s, // %q\n", code, property, code)
	}
	printf("}\n\n")
}

// printCasefold computes the case-folding orbits, records them in
// chars[...].caseOrbit, and then emits (or, in test mode, checks) the
// asciiFold, caseOrbit, FoldCategory and FoldScript tables.
func printCasefold() {
	// Build list of case-folding groups attached to each canonical folded char (typically lower case).
	var caseOrbit = make([][]rune, MaxChar+1)
	for j := range chars {
		i := rune(j)
		c := &chars[i]
		if c.foldCase == 0 {
			continue
		}
		orb := caseOrbit[c.foldCase]
		if orb == nil {
			// First member seen: the group starts with the folded char itself.
			orb = append(orb, c.foldCase)
		}
		caseOrbit[c.foldCase] = append(orb, i)
	}

	// Insert explicit 1-element groups when assuming [lower, upper] would be wrong.
	for j := range chars {
		i := rune(j)
		c := &chars[i]
		f := c.foldCase
		if f == 0 {
			f = i
		}
		orb := caseOrbit[f]
		if orb == nil && (c.upperCase != 0 && c.upperCase != i || c.lowerCase != 0 && c.lowerCase != i) {
			// Default assumption of [upper, lower] is wrong.
			caseOrbit[i] = []rune{i}
		}
	}

	// Delete the groups for which assuming [lower, upper] or [upper, lower] is right.
	for i, orb := range caseOrbit {
		if len(orb) == 2 && chars[orb[0]].upperCase == orb[1] && chars[orb[1]].lowerCase == orb[0] {
			caseOrbit[i] = nil
		}
		if len(orb) == 2 && chars[orb[1]].upperCase == orb[0] && chars[orb[0]].lowerCase == orb[1] {
			caseOrbit[i] = nil
		}
	}

	// Record orbit information in chars.
	for _, orb := range caseOrbit {
		if orb == nil {
			continue
		}
		sort.Slice(orb, func(i, j int) bool {
			return orb[i] < orb[j]
		})
		// Link the sorted members into a cycle: the last member points at the
		// first, and every other member points at its successor.
		c := orb[len(orb)-1]
		for _, d := range orb {
			chars[c].caseOrbit = d
			c = d
		}
	}

	printAsciiFold()
	printCaseOrbit()

	// Tables of category and script folding exceptions: code points
	// that must be added when interpreting a particular category/script
	// in a case-folding context.
	cat := make(map[string]map[rune]bool)
	for name := range category {
		if x := foldExceptions(inCategory(name)); len(x) > 0 {
			cat[name] = x
		}
	}

	scr := make(map[string]map[rune]bool)
	for name := range scripts {
		if x := foldExceptions(inScript(name)); len(x) > 0 {
			scr[name] = x
		}
	}

	printCatFold("FoldCategory", cat)
	printCatFold("FoldScript", scr)
}

// inCategory returns a list of all the runes in the category.
// A one-letter name also matches every longer category whose first byte is
// that letter.
func inCategory(name string) []rune {
	var x []rune
	for j := range chars {
		i := rune(j)
		c := &chars[i]
		if c.category == name || len(name) == 1 && len(c.category) > 1 && c.category[0] == name[0] {
			x = append(x, i)
		}
	}
	return x
}

// inScript returns a list of all the runes in the script, expanding each
// [lo, hi] range recorded in scripts[name].
func inScript(name string) []rune {
	var x []rune
	for _, s := range scripts[name] {
		for c := s.lo; c <= s.hi; c++ {
			x = append(x, rune(c))
		}
	}
	return x
}

// foldExceptions returns a list of all the runes fold-equivalent
// to runes in class but not in class themselves.
func foldExceptions(class []rune) map[rune]bool {
	// Create map containing class and all fold-equivalent chars.
	m := make(map[rune]bool)
	for _, r := range class {
		c := &chars[r]
		if c.caseOrbit == 0 {
			// Just upper and lower.
			if u := c.upperCase; u != 0 {
				m[u] = true
			}
			if l := c.lowerCase; l != 0 {
				m[l] = true
			}
			m[r] = true
			continue
		}
		// Otherwise walk orbit until it cycles back to the starting rune.
		r0 := r
		for {
			m[r] = true
			r = chars[r].caseOrbit
			if r == r0 {
				break
			}
		}
	}

	// Remove class itself.
	for _, r := range class {
		delete(m, r)
	}

	// What's left is the exceptions.
	return m
}

// comment holds the doc comment emitted above each generated fold map,
// keyed by the map's variable name.
var comment = map[string]string{
	"FoldCategory": "// FoldCategory maps a category name to a table of\n" +
		"// code points outside the category that are equivalent under\n" +
		"// simple case folding to code points inside the category.\n" +
		"// If there is no entry for a category name, there are no such points.\n",

	"FoldScript": "// FoldScript maps a script name to a table of\n" +
		"// code points outside the script that are equivalent under\n" +
		"// simple case folding to code points inside the script.\n" +
		"// If there is no entry for a script name, there are no such points.\n",
}

// simpleFoldTarget returns the rune that r is expected to map to under
// simple case folding, given r's recorded orbit successor and its lower- and
// upper-case mappings (0 meaning "none recorded"). An explicit orbit entry
// wins; otherwise r maps to its differing lower case, then its differing
// upper case, and finally to itself.
func simpleFoldTarget(r, orbit, lower, upper rune) rune {
	switch {
	case orbit != 0:
		return orbit
	case lower != r && lower != 0:
		return lower
	case upper != r && upper != 0:
		return upper
	}
	return r
}

// printAsciiFold emits the asciiFold array, giving the simple-fold target of
// every code point from 0 through unicode.MaxASCII.
//
// Nothing is printed in test mode, matching the other table printers. The
// generated table is unexported, so it cannot be compared directly; the
// unicode.SimpleFold comparison that printCaseOrbit performs in test mode
// runs over every index of chars, which includes the ASCII range.
func printAsciiFold() {
	if *test {
		return
	}
	printf("var asciiFold = [MaxASCII + 1]uint16{\n")
	for i := rune(0); i <= unicode.MaxASCII; i++ {
		c := &chars[i]
		printf("\t0x%04X,\n", simpleFoldTarget(i, c.caseOrbit, c.lowerCase, c.upperCase))
	}
	printf("}\n\n")
}

// printCaseOrbit emits the caseOrbit table: one {from, to} pair for every
// entry of chars with a non-zero caseOrbit, counting the pairs in
// foldPairCount. In test mode it prints no table and instead reports on
// standard error every rune for which unicode.SimpleFold disagrees with the
// computed fold target.
func printCaseOrbit() {
	if *test {
		for j := range chars {
			i := rune(j)
			c := &chars[i]
			f := simpleFoldTarget(i, c.caseOrbit, c.lowerCase, c.upperCase)
			if g := unicode.SimpleFold(i); g != f {
				fmt.Fprintf(os.Stderr, "unicode.SimpleFold(%#U) = %#U, want %#U\n", i, g, f)
			}
		}
		return
	}

	printf("var caseOrbit = []foldPair{\n")
	for i := range chars {
		c := &chars[i]
		if c.caseOrbit != 0 {
			printf("\t{0x%04X, 0x%04X},\n", i, c.caseOrbit)
			foldPairCount++
		}
	}
	printf("}\n\n")
}

// printCatFold emits the map variable called name ("FoldCategory" or
// "FoldScript") followed by one RangeTable per key of m, where m maps a
// category or script name to its set of fold-exception code points.
//
// In test mode nothing is emitted. Instead the corresponding map exported by
// package unicode is checked against m and discrepancies are reported on
// standard error.
func printCatFold(name string, m map[string]map[rune]bool) {
	if *test {
		var pkgMap map[string]*unicode.RangeTable
		if name == "FoldCategory" {
			pkgMap = unicode.FoldCategory
		} else {
			pkgMap = unicode.FoldScript
		}
		if len(pkgMap) != len(m) {
			fmt.Fprintf(os.Stderr, "unicode.%s has %d elements, want %d\n", name, len(pkgMap), len(m))
			return
		}
		for k, v := range m {
			t, ok := pkgMap[k]
			if !ok {
				fmt.Fprintf(os.Stderr, "unicode.%s[%q] missing\n", name, k)
				continue
			}
			// n counts the code points in the package table so that points
			// missing from it show up as a size mismatch below.
			n := 0
			for _, r := range t.R16 {
				for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) {
					if !v[c] {
						fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c)
					}
					n++
				}
			}
			for _, r := range t.R32 {
				for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) {
					if !v[c] {
						fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c)
					}
					n++
				}
			}
			if n != len(v) {
				fmt.Fprintf(os.Stderr, "unicode.%s[%q] has %d code points, want %d\n", name, k, n, len(v))
			}
		}
		return
	}

	// Compute the key list once; it drives both the map literal and the
	// per-key tables.
	keys := allCatFold(m)

	print(comment[name])
	printf("var %s = map[string]*RangeTable{\n", name)
	for _, key := range keys {
		printf("\t%q: fold%s,\n", key, key)
	}
	printf("}\n\n")
	for _, key := range keys {
		class := m[key]
		dumpRange(
			fmt.Sprintf("var fold%s = &RangeTable{\n", key),
			func(code rune) bool { return class[code] })
	}
}

var range16Count = 0 // Number of entries in the 16-bit range tables.
1398 var range32Count = 0 // Number of entries in the 32-bit range tables. 1399 var foldPairCount = 0 // Number of fold pairs in the exception tables. 1400 1401 func printSizes() { 1402 if *test { 1403 return 1404 } 1405 println() 1406 printf("// Range entries: %d 16-bit, %d 32-bit, %d total.\n", range16Count, range32Count, range16Count+range32Count) 1407 range16Bytes := range16Count * 3 * 2 1408 range32Bytes := range32Count * 3 * 4 1409 printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", range16Bytes, range32Bytes, range16Bytes+range32Bytes) 1410 println() 1411 printf("// Fold orbit bytes: %d pairs, %d bytes\n", foldPairCount, foldPairCount*2*2) 1412 }