github.com/dara-project/godist@v0.0.0-20200823115410-e0c80c8f0c78/src/unicode/maketables.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // +build ignore 6 7 // Unicode table generator. 8 // Data read from the web. 9 10 package main 11 12 import ( 13 "bufio" 14 "flag" 15 "fmt" 16 "io" 17 "log" 18 "net/http" 19 "os" 20 "os/exec" 21 "path/filepath" 22 "regexp" 23 "sort" 24 "strconv" 25 "strings" 26 "unicode" 27 "dara" 28 ) 29 30 func main() { 31 flag.Parse() 32 setupOutput() 33 loadChars() // always needed 34 loadCasefold() 35 printCategories() 36 printScriptOrProperty(false) 37 printScriptOrProperty(true) 38 printCases() 39 printLatinProperties() 40 printCasefold() 41 printSizes() 42 flushOutput() 43 } 44 45 func defaultVersion() string { 46 if v := os.Getenv("UNICODE_VERSION"); v != "" { 47 return v 48 } 49 return unicode.Version 50 } 51 52 var dataURL = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt") 53 var casefoldingURL = flag.String("casefolding", "", "full URL for CaseFolding.txt; defaults to --url/CaseFolding.txt") 54 var url = flag.String("url", 55 "http://www.unicode.org/Public/"+defaultVersion()+"/ucd/", 56 "URL of Unicode database directory") 57 var tablelist = flag.String("tables", 58 "all", 59 "comma-separated list of which tables to generate; can be letter") 60 var scriptlist = flag.String("scripts", 61 "all", 62 "comma-separated list of which script tables to generate") 63 var proplist = flag.String("props", 64 "all", 65 "comma-separated list of which property tables to generate") 66 var cases = flag.Bool("cases", 67 true, 68 "generate case tables") 69 var test = flag.Bool("test", 70 false, 71 "test existing tables; can be used to compare web data with package data") 72 var localFiles = flag.Bool("local", 73 false, 74 "data files have been copied to current directory; for 
debugging only") 75 var outputFile = flag.String("output", 76 "", 77 "output file for generated tables; default stdout") 78 79 var scriptRe = regexp.MustCompile(`^([0-9A-F]+)(\.\.[0-9A-F]+)? *; ([A-Za-z_]+)$`) 80 var logger = log.New(os.Stderr, "", log.Lshortfile) 81 82 var output *bufio.Writer // points to os.Stdout or to "gofmt > outputFile" 83 84 func setupOutput() { 85 output = bufio.NewWriter(startGofmt()) 86 } 87 88 // startGofmt connects output to a gofmt process if -output is set. 89 func startGofmt() io.Writer { 90 if *outputFile == "" { 91 return os.Stdout 92 } 93 stdout, err := os.Create(*outputFile) 94 if err != nil { 95 logger.Fatal(err) 96 } 97 // Pipe output to gofmt. 98 gofmt := exec.Command("gofmt") 99 fd, err := gofmt.StdinPipe() 100 if err != nil { 101 logger.Fatal(err) 102 } 103 gofmt.Stdout = stdout 104 gofmt.Stderr = os.Stderr 105 err = gofmt.Start() 106 if err != nil { 107 logger.Fatal(err) 108 } 109 return fd 110 } 111 112 func flushOutput() { 113 err := output.Flush() 114 if err != nil { 115 logger.Fatal(err) 116 } 117 } 118 119 func printf(format string, args ...interface{}) { 120 fmt.Fprintf(output, format, args...) 121 } 122 123 func print(args ...interface{}) { 124 fmt.Fprint(output, args...) 125 } 126 127 func println(args ...interface{}) { 128 fmt.Fprintln(output, args...) 
129 } 130 131 type reader struct { 132 *bufio.Reader 133 fd *os.File 134 resp *http.Response 135 } 136 137 func open(url string) *reader { 138 file := filepath.Base(url) 139 if *localFiles { 140 fd, err := os.Open(file) 141 if err != nil { 142 logger.Fatal(err) 143 } 144 return &reader{bufio.NewReader(fd), fd, nil} 145 } 146 resp, err := http.Get(url) 147 if err != nil { 148 logger.Fatal(err) 149 } 150 if resp.StatusCode != 200 { 151 logger.Fatalf("bad GET status for %s: %d", file, resp.Status) 152 } 153 return &reader{bufio.NewReader(resp.Body), nil, resp} 154 155 } 156 157 func (r *reader) close() { 158 if r.fd != nil { 159 r.fd.Close() 160 } else { 161 r.resp.Body.Close() 162 } 163 } 164 165 var category = map[string]bool{ 166 // Nd Lu etc. 167 // We use one-character names to identify merged categories 168 "L": true, // Lu Ll Lt Lm Lo 169 "P": true, // Pc Pd Ps Pe Pu Pf Po 170 "M": true, // Mn Mc Me 171 "N": true, // Nd Nl No 172 "S": true, // Sm Sc Sk So 173 "Z": true, // Zs Zl Zp 174 "C": true, // Cc Cf Cs Co Cn 175 } 176 177 // UnicodeData.txt has form: 178 // 0037;DIGIT SEVEN;Nd;0;EN;;7;7;7;N;;;;; 179 // 007A;LATIN SMALL LETTER Z;Ll;0;L;;;;;N;;;005A;;005A 180 // See http://www.unicode.org/reports/tr44/ for a full explanation 181 // The fields: 182 const ( 183 FCodePoint = iota 184 FName 185 FGeneralCategory 186 FCanonicalCombiningClass 187 FBidiClass 188 FDecompositionTypeAndMapping 189 FNumericType 190 FNumericDigit // If a decimal digit. 191 FNumericValue // Includes non-decimal, e.g. 
U+2155=1/5 192 FBidiMirrored 193 FUnicode1Name 194 FISOComment 195 FSimpleUppercaseMapping 196 FSimpleLowercaseMapping 197 FSimpleTitlecaseMapping 198 NumField 199 200 MaxChar = 0x10FFFF // anything above this shouldn't exist 201 ) 202 203 var fieldName = []string{ 204 FCodePoint: "CodePoint", 205 FName: "Name", 206 FGeneralCategory: "GeneralCategory", 207 FCanonicalCombiningClass: "CanonicalCombiningClass", 208 FBidiClass: "BidiClass", 209 FDecompositionTypeAndMapping: "DecompositionTypeAndMapping", 210 FNumericType: "NumericType", 211 FNumericDigit: "NumericDigit", 212 FNumericValue: "NumericValue", 213 FBidiMirrored: "BidiMirrored", 214 FUnicode1Name: "Unicode1Name", 215 FISOComment: "ISOComment", 216 FSimpleUppercaseMapping: "SimpleUppercaseMapping", 217 FSimpleLowercaseMapping: "SimpleLowercaseMapping", 218 FSimpleTitlecaseMapping: "SimpleTitlecaseMapping", 219 } 220 221 // This contains only the properties we're interested in. 222 type Char struct { 223 field []string // debugging only; could be deleted if we take out char.dump() 224 codePoint rune // if zero, this index is not a valid code point. 
225 category string 226 upperCase rune 227 lowerCase rune 228 titleCase rune 229 foldCase rune // simple case folding 230 caseOrbit rune // next in simple case folding orbit 231 } 232 233 // Scripts.txt has form: 234 // A673 ; Cyrillic # Po SLAVONIC ASTERISK 235 // A67C..A67D ; Cyrillic # Mn [2] COMBINING CYRILLIC KAVYKA..COMBINING CYRILLIC PAYEROK 236 // See http://www.unicode.org/Public/5.1.0/ucd/UCD.html for full explanation 237 238 type Script struct { 239 lo, hi uint32 // range of code points 240 script string 241 } 242 243 var chars = make([]Char, MaxChar+1) 244 var scripts = make(map[string][]Script) 245 var props = make(map[string][]Script) // a property looks like a script; can share the format 246 247 var lastChar rune = 0 248 249 // In UnicodeData.txt, some ranges are marked like this: 250 // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; 251 // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;; 252 // parseCategory returns a state variable indicating the weirdness. 
253 type State int 254 255 const ( 256 SNormal State = iota // known to be zero for the type 257 SFirst 258 SLast 259 SMissing 260 ) 261 262 func parseCategory(line string) (state State) { 263 field := strings.Split(line, ";") 264 if len(field) != NumField { 265 logger.Fatalf("%5s: %d fields (expected %d)\n", line, len(field), NumField) 266 } 267 point, err := strconv.ParseUint(field[FCodePoint], 16, 64) 268 if err != nil { 269 logger.Fatalf("%.5s...: %s", line, err) 270 } 271 lastChar = rune(point) 272 if point > MaxChar { 273 return 274 } 275 char := &chars[point] 276 char.field = field 277 if char.codePoint != 0 { 278 logger.Fatalf("point %U reused", point) 279 } 280 char.codePoint = lastChar 281 char.category = field[FGeneralCategory] 282 category[char.category] = true 283 switch char.category { 284 case "Nd": 285 // Decimal digit 286 _, err := strconv.Atoi(field[FNumericValue]) 287 if err != nil { 288 logger.Fatalf("%U: bad numeric field: %s", point, err) 289 } 290 case "Lu": 291 char.letter(field[FCodePoint], field[FSimpleLowercaseMapping], field[FSimpleTitlecaseMapping]) 292 case "Ll": 293 char.letter(field[FSimpleUppercaseMapping], field[FCodePoint], field[FSimpleTitlecaseMapping]) 294 case "Lt": 295 char.letter(field[FSimpleUppercaseMapping], field[FSimpleLowercaseMapping], field[FCodePoint]) 296 default: 297 char.letter(field[FSimpleUppercaseMapping], field[FSimpleLowercaseMapping], field[FSimpleTitlecaseMapping]) 298 } 299 switch { 300 case strings.Index(field[FName], ", First>") > 0: 301 state = SFirst 302 case strings.Index(field[FName], ", Last>") > 0: 303 state = SLast 304 } 305 return 306 } 307 308 func (char *Char) dump(s string) { 309 print(s, " ") 310 for i := 0; i < len(char.field); i++ { 311 printf("%s:%q ", fieldName[i], char.field[i]) 312 } 313 print("\n") 314 } 315 316 func (char *Char) letter(u, l, t string) { 317 char.upperCase = char.letterValue(u, "U") 318 char.lowerCase = char.letterValue(l, "L") 319 char.titleCase = char.letterValue(t, 
"T") 320 } 321 322 func (char *Char) letterValue(s string, cas string) rune { 323 if s == "" { 324 return 0 325 } 326 v, err := strconv.ParseUint(s, 16, 64) 327 if err != nil { 328 char.dump(cas) 329 logger.Fatalf("%U: bad letter(%s): %s", char.codePoint, s, err) 330 } 331 return rune(v) 332 } 333 334 func allCategories() []string { 335 a := make([]string, 0, len(category)) 336 for k := range category { 337 a = append(a, k) 338 } 339 sort.Strings(a) 340 return a 341 } 342 343 func all(scripts map[string][]Script) []string { 344 a := make([]string, 0, len(scripts)) 345 for k := range scripts { 346 a = append(a, k) 347 } 348 sort.Strings(a) 349 return a 350 } 351 352 func allCatFold(m map[string]map[rune]bool) []string { 353 a := make([]string, 0, len(m)) 354 for k := range m { 355 a = append(a, k) 356 } 357 sort.Strings(a) 358 return a 359 } 360 361 // Extract the version number from the URL 362 func version() string { 363 // Break on slashes and look for the first numeric field 364 fields := strings.Split(*url, "/") 365 for _, f := range fields { 366 if len(f) > 0 && '0' <= f[0] && f[0] <= '9' { 367 return f 368 } 369 } 370 logger.Fatal("unknown version") 371 return "Unknown" 372 } 373 374 func categoryOp(code rune, class uint8) bool { 375 category := chars[code].category 376 return len(category) > 0 && category[0] == class 377 } 378 379 func loadChars() { 380 if *dataURL == "" { 381 flag.Set("data", *url+"UnicodeData.txt") 382 } 383 input := open(*dataURL) 384 defer input.close() 385 scanner := bufio.NewScanner(input) 386 var first rune = 0 387 for scanner.Scan() { 388 switch parseCategory(scanner.Text()) { 389 case SNormal: 390 if first != 0 { 391 logger.Fatalf("bad state normal at %U", lastChar) 392 } 393 case SFirst: 394 if first != 0 { 395 logger.Fatalf("bad state first at %U", lastChar) 396 } 397 first = lastChar 398 case SLast: 399 if first == 0 { 400 logger.Fatalf("bad state last at %U", lastChar) 401 } 402 for i := first + 1; i <= lastChar; i++ { 403 
chars[i] = chars[first] 404 chars[i].codePoint = i 405 } 406 first = 0 407 } 408 } 409 if scanner.Err() != nil { 410 logger.Fatal(scanner.Err()) 411 } 412 } 413 414 func loadCasefold() { 415 if *casefoldingURL == "" { 416 flag.Set("casefolding", *url+"CaseFolding.txt") 417 } 418 input := open(*casefoldingURL) 419 defer input.close() 420 scanner := bufio.NewScanner(input) 421 for scanner.Scan() { 422 line := scanner.Text() 423 if len(line) == 0 || line[0] == '#' || len(strings.TrimSpace(line)) == 0 { 424 continue 425 } 426 field := strings.Split(line, "; ") 427 if len(field) != 4 { 428 logger.Fatalf("CaseFolding.txt %.5s...: %d fields (expected %d)\n", line, len(field), 4) 429 } 430 kind := field[1] 431 if kind != "C" && kind != "S" { 432 // Only care about 'common' and 'simple' foldings. 433 continue 434 } 435 p1, err := strconv.ParseUint(field[0], 16, 64) 436 if err != nil { 437 logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err) 438 } 439 p2, err := strconv.ParseUint(field[2], 16, 64) 440 if err != nil { 441 logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err) 442 } 443 chars[p1].foldCase = rune(p2) 444 } 445 if scanner.Err() != nil { 446 logger.Fatal(scanner.Err()) 447 } 448 } 449 450 const progHeader = `// Copyright 2013 The Go Authors. All rights reserved. 451 // Use of this source code is governed by a BSD-style 452 // license that can be found in the LICENSE file. 453 454 // Code generated by maketables; DO NOT EDIT. 
455 // To regenerate, run: 456 // maketables --tables=%s --data=%s --casefolding=%s 457 458 package unicode 459 460 ` 461 462 func printCategories() { 463 if *tablelist == "" { 464 return 465 } 466 // Find out which categories to dump 467 list := strings.Split(*tablelist, ",") 468 if *tablelist == "all" { 469 list = allCategories() 470 } 471 if *test { 472 fullCategoryTest(list) 473 return 474 } 475 printf(progHeader, *tablelist, *dataURL, *casefoldingURL) 476 477 println("// Version is the Unicode edition from which the tables are derived.") 478 printf("const Version = %q\n\n", version()) 479 480 if *tablelist == "all" { 481 println("// Categories is the set of Unicode category tables.") 482 println("var Categories = map[string] *RangeTable {") 483 for _, k := range allCategories() { 484 printf("\t%q: %s,\n", k, k) 485 } 486 print("}\n\n") 487 } 488 489 decl := make(sort.StringSlice, len(list)) 490 ndecl := 0 491 for _, name := range list { 492 if _, ok := category[name]; !ok { 493 logger.Fatal("unknown category", name) 494 } 495 // We generate an UpperCase name to serve as concise documentation and an _UnderScored 496 // name to store the data. This stops godoc dumping all the tables but keeps them 497 // available to clients. 
498 // Cases deserving special comments 499 varDecl := "" 500 switch name { 501 case "C": 502 varDecl = "\tOther = _C; // Other/C is the set of Unicode control and special characters, category C.\n" 503 varDecl += "\tC = _C\n" 504 case "L": 505 varDecl = "\tLetter = _L; // Letter/L is the set of Unicode letters, category L.\n" 506 varDecl += "\tL = _L\n" 507 case "M": 508 varDecl = "\tMark = _M; // Mark/M is the set of Unicode mark characters, category M.\n" 509 varDecl += "\tM = _M\n" 510 case "N": 511 varDecl = "\tNumber = _N; // Number/N is the set of Unicode number characters, category N.\n" 512 varDecl += "\tN = _N\n" 513 case "P": 514 varDecl = "\tPunct = _P; // Punct/P is the set of Unicode punctuation characters, category P.\n" 515 varDecl += "\tP = _P\n" 516 case "S": 517 varDecl = "\tSymbol = _S; // Symbol/S is the set of Unicode symbol characters, category S.\n" 518 varDecl += "\tS = _S\n" 519 case "Z": 520 varDecl = "\tSpace = _Z; // Space/Z is the set of Unicode space characters, category Z.\n" 521 varDecl += "\tZ = _Z\n" 522 case "Nd": 523 varDecl = "\tDigit = _Nd; // Digit is the set of Unicode characters with the \"decimal digit\" property.\n" 524 case "Lu": 525 varDecl = "\tUpper = _Lu; // Upper is the set of Unicode upper case letters.\n" 526 case "Ll": 527 varDecl = "\tLower = _Ll; // Lower is the set of Unicode lower case letters.\n" 528 case "Lt": 529 varDecl = "\tTitle = _Lt; // Title is the set of Unicode title case letters.\n" 530 } 531 if len(name) > 1 { 532 varDecl += fmt.Sprintf( 533 "\t%s = _%s; // %s is the set of Unicode characters in category %s.\n", 534 name, name, name, name) 535 } 536 decl[ndecl] = varDecl 537 ndecl++ 538 if len(name) == 1 { // unified categories 539 decl := fmt.Sprintf("var _%s = &RangeTable{\n", name) 540 dumpRange( 541 decl, 542 func(code rune) bool { return categoryOp(code, name[0]) }) 543 continue 544 } 545 dumpRange( 546 fmt.Sprintf("var _%s = &RangeTable{\n", name), 547 func(code rune) bool { return 
chars[code].category == name }) 548 } 549 decl.Sort() 550 println("// These variables have type *RangeTable.") 551 println("var (") 552 for _, d := range decl { 553 print(d) 554 } 555 print(")\n\n") 556 } 557 558 type Op func(code rune) bool 559 560 const format = "\t\t{0x%04x, 0x%04x, %d},\n" 561 562 func dumpRange(header string, inCategory Op) { 563 print(header) 564 next := rune(0) 565 latinOffset := 0 566 print("\tR16: []Range16{\n") 567 // one Range for each iteration 568 count := &range16Count 569 size := 16 570 for { 571 // look for start of range 572 for next < rune(len(chars)) && !inCategory(next) { 573 next++ 574 } 575 if next >= rune(len(chars)) { 576 // no characters remain 577 break 578 } 579 580 // start of range 581 lo := next 582 hi := next 583 stride := rune(1) 584 // accept lo 585 next++ 586 // look for another character to set the stride 587 for next < rune(len(chars)) && !inCategory(next) { 588 next++ 589 } 590 if next >= rune(len(chars)) { 591 // no more characters 592 printf(format, lo, hi, stride) 593 break 594 } 595 // set stride 596 stride = next - lo 597 // check for length of run. 
next points to first jump in stride 598 for i := next; i < rune(len(chars)); i++ { 599 if inCategory(i) == (((i - lo) % stride) == 0) { 600 // accept 601 if inCategory(i) { 602 hi = i 603 } 604 } else { 605 // no more characters in this run 606 break 607 } 608 } 609 if uint32(hi) <= unicode.MaxLatin1 { 610 latinOffset++ 611 } 612 size, count = printRange(uint32(lo), uint32(hi), uint32(stride), size, count) 613 // next range: start looking where this range ends 614 next = hi + 1 615 } 616 print("\t},\n") 617 if latinOffset > 0 { 618 printf("\tLatinOffset: %d,\n", latinOffset) 619 } 620 print("}\n\n") 621 } 622 623 func printRange(lo, hi, stride uint32, size int, count *int) (int, *int) { 624 if size == 16 && hi >= 1<<16 { 625 if lo < 1<<16 { 626 if lo+stride != hi { 627 logger.Fatalf("unexpected straddle: %U %U %d", lo, hi, stride) 628 } 629 // No range contains U+FFFF as an instance, so split 630 // the range into two entries. That way we can maintain 631 // the invariant that R32 contains only >= 1<<16. 
632 printf(format, lo, lo, 1) 633 lo = hi 634 stride = 1 635 *count++ 636 } 637 print("\t},\n") 638 print("\tR32: []Range32{\n") 639 size = 32 640 count = &range32Count 641 } 642 printf(format, lo, hi, stride) 643 *count++ 644 return size, count 645 } 646 647 func fullCategoryTest(list []string) { 648 for _, name := range list { 649 if _, ok := category[name]; !ok { 650 logger.Fatal("unknown category", name) 651 } 652 r, ok := unicode.Categories[name] 653 if !ok && len(name) > 1 { 654 logger.Fatalf("unknown table %q", name) 655 } 656 if len(name) == 1 { 657 verifyRange(name, func(code rune) bool { return categoryOp(code, name[0]) }, r) 658 } else { 659 verifyRange( 660 name, 661 func(code rune) bool { return chars[code].category == name }, 662 r) 663 } 664 } 665 } 666 667 func verifyRange(name string, inCategory Op, table *unicode.RangeTable) { 668 count := 0 669 for j := range chars { 670 i := rune(j) 671 web := inCategory(i) 672 pkg := unicode.Is(table, i) 673 if web != pkg { 674 fmt.Fprintf(os.Stderr, "%s: %U: web=%t pkg=%t\n", name, i, web, pkg) 675 count++ 676 if count > 10 { 677 break 678 } 679 } 680 } 681 } 682 683 func parseScript(line string, scripts map[string][]Script) { 684 comment := strings.Index(line, "#") 685 if comment >= 0 { 686 line = line[0:comment] 687 } 688 line = strings.TrimSpace(line) 689 if len(line) == 0 { 690 return 691 } 692 field := strings.Split(line, ";") 693 if len(field) != 2 { 694 logger.Fatalf("%s: %d fields (expected 2)\n", line, len(field)) 695 } 696 matches := scriptRe.FindStringSubmatch(line) 697 if len(matches) != 4 { 698 logger.Fatalf("%s: %d matches (expected 3)\n", line, len(matches)) 699 } 700 lo, err := strconv.ParseUint(matches[1], 16, 64) 701 if err != nil { 702 logger.Fatalf("%.5s...: %s", line, err) 703 } 704 hi := lo 705 if len(matches[2]) > 2 { // ignore leading .. 
706 hi, err = strconv.ParseUint(matches[2][2:], 16, 64) 707 if err != nil { 708 logger.Fatalf("%.5s...: %s", line, err) 709 } 710 } 711 name := matches[3] 712 scripts[name] = append(scripts[name], Script{uint32(lo), uint32(hi), name}) 713 } 714 715 // The script tables have a lot of adjacent elements. Fold them together. 716 func foldAdjacent(r []Script) []unicode.Range32 { 717 s := make([]unicode.Range32, 0, len(r)) 718 j := 0 719 for i := 0; i < len(r); i++ { 720 if j > 0 && r[i].lo == s[j-1].Hi+1 { 721 s[j-1].Hi = r[i].hi 722 } else { 723 s = s[0 : j+1] 724 s[j] = unicode.Range32{ 725 Lo: uint32(r[i].lo), 726 Hi: uint32(r[i].hi), 727 Stride: 1, 728 } 729 j++ 730 } 731 } 732 return s 733 } 734 735 func fullScriptTest(list []string, installed map[string]*unicode.RangeTable, scripts map[string][]Script) { 736 for _, name := range list { 737 if _, ok := scripts[name]; !ok { 738 logger.Fatal("unknown script", name) 739 } 740 _, ok := installed[name] 741 if !ok { 742 logger.Fatal("unknown table", name) 743 } 744 for _, script := range scripts[name] { 745 for r := script.lo; r <= script.hi; r++ { 746 if !unicode.Is(installed[name], rune(r)) { 747 fmt.Fprintf(os.Stderr, "%U: not in script %s\n", r, name) 748 } 749 } 750 } 751 } 752 } 753 754 var deprecatedAliases = map[string]string{ 755 "Sentence_Terminal": "STerm", 756 } 757 758 // PropList.txt has the same format as Scripts.txt so we can share its parser. 
func printScriptOrProperty(doProps bool) {
	// doProps selects PropList.txt / unicode.Properties; otherwise
	// Scripts.txt / unicode.Scripts is used. With -test the installed
	// tables are verified instead of generated.
	flagName := "scripts"
	flaglist := *scriptlist
	file := "Scripts.txt"
	table := scripts
	installed := unicode.Scripts
	if doProps {
		flagName = "props"
		flaglist = *proplist
		file = "PropList.txt"
		table = props
		installed = unicode.Properties
	}
	if flaglist == "" {
		return
	}
	input := open(*url + file)
	scanner := bufio.NewScanner(input)
	for scanner.Scan() {
		parseScript(scanner.Text(), table)
	}
	if scanner.Err() != nil {
		logger.Fatal(scanner.Err())
	}
	input.close()

	// Find out which scripts to dump
	list := strings.Split(flaglist, ",")
	if flaglist == "all" {
		list = all(table)
	}
	if *test {
		fullScriptTest(list, installed, table)
		return
	}

	printf(
		"// Generated by running\n"+
			"// maketables --%s=%s --url=%s\n"+
			"// DO NOT EDIT\n\n",
		flagName,
		flaglist,
		*url)
	if flaglist == "all" {
		if doProps {
			println("// Properties is the set of Unicode property tables.")
			println("var Properties = map[string] *RangeTable{")
		} else {
			println("// Scripts is the set of Unicode script tables.")
			println("var Scripts = map[string] *RangeTable{")
		}
		for _, k := range all(table) {
			printf("\t%q: %s,\n", k, k)
			if alias, ok := deprecatedAliases[k]; ok {
				printf("\t%q: %s,\n", alias, k)
			}
		}
		print("}\n\n")
	}

	decl := make(sort.StringSlice, 0, len(list)+len(deprecatedAliases))
	for _, name := range list {
		if doProps {
			decl = append(decl, fmt.Sprintf(
				"\t%s = _%s;\t// %s is the set of Unicode characters with property %s.\n",
				name, name, name, name))
		} else {
			decl = append(decl, fmt.Sprintf(
				"\t%s = _%s;\t// %s is the set of Unicode characters in script %s.\n",
				name, name, name, name))
		}
		if alias, ok := deprecatedAliases[name]; ok {
			decl = append(decl, fmt.Sprintf(
				"\t%[1]s = _%[2]s;\t// %[1]s is an alias for %[2]s.\n",
				alias, name))
		}
		printf("var _%s = &RangeTable {\n", name)
		ranges := foldAdjacent(table[name])
		print("\tR16: []Range16{\n")
		size := 16
		count := &range16Count
		for _, s := range ranges {
			size, count = printRange(s.Lo, s.Hi, s.Stride, size, count)
		}
		print("\t},\n")
		if off := findLatinOffset(ranges); off > 0 {
			printf("\tLatinOffset: %d,\n", off)
		}
		print("}\n\n")
	}
	decl.Sort()
	println("// These variables have type *RangeTable.")
	println("var (")
	for _, d := range decl {
		print(d)
	}
	print(")\n\n")
}

// findLatinOffset returns the number of leading ranges whose Hi is at
// most unicode.MaxLatin1.
func findLatinOffset(ranges []unicode.Range32) int {
	i := 0
	for i < len(ranges) && ranges[i].Hi <= unicode.MaxLatin1 {
		i++
	}
	return i
}

const (
	CaseUpper = 1 << iota
	CaseLower
	CaseTitle
	CaseNone    = 0  // must be zero
	CaseMissing = -1 // character not present; not a valid case state
)

// caseState is the case classification of one code point together with
// the offsets to its upper, lower and title case forms.
type caseState struct {
	point        rune
	_case        int
	deltaToUpper rune
	deltaToLower rune
	deltaToTitle rune
}

// Is d a continuation of the state of c?
func (c *caseState) adjacent(d *caseState) bool {
	if d.point < c.point {
		c, d = d, c
	}
	switch {
	case d.point != c.point+1: // code points not adjacent (shouldn't happen)
		return false
	case d._case != c._case: // different cases
		return c.upperLowerAdjacent(d)
	case c._case == CaseNone:
		return false
	case c._case == CaseMissing:
		return false
	case d.deltaToUpper != c.deltaToUpper:
		return false
	case d.deltaToLower != c.deltaToLower:
		return false
	case d.deltaToTitle != c.deltaToTitle:
		return false
	}
	return true
}

// Is d the same as c, but opposite in upper/lower case? this would make it
// an element of an UpperLower sequence.
func (c *caseState) upperLowerAdjacent(d *caseState) bool {
	// check they're a matched case pair. we know they have adjacent values
	switch {
	case c._case == CaseUpper && d._case != CaseLower:
		return false
	case c._case == CaseLower && d._case != CaseUpper:
		return false
	}
	// matched pair (at least in upper/lower). make the order Upper Lower
	if c._case == CaseLower {
		c, d = d, c
	}
	// for an Upper Lower sequence the deltas have to be in order
	//	c: 0 1 0
	//	d: -1 0 -1
	switch {
	case c.deltaToUpper != 0:
		return false
	case c.deltaToLower != 1:
		return false
	case c.deltaToTitle != 0:
		return false
	case d.deltaToUpper != -1:
		return false
	case d.deltaToLower != 0:
		return false
	case d.deltaToTitle != -1:
		return false
	}
	return true
}

// Does this character start an UpperLower sequence?
func (c *caseState) isUpperLower() bool {
	// for an Upper Lower sequence the deltas have to be in order
	//	c: 0 1 0
	switch {
	case c.deltaToUpper != 0:
		return false
	case c.deltaToLower != 1:
		return false
	case c.deltaToTitle != 0:
		return false
	}
	return true
}

// Does this character start a LowerUpper sequence?
func (c *caseState) isLowerUpper() bool {
	// for a Lower Upper sequence the deltas have to be in order
	//	c: -1 0 -1
	switch {
	case c.deltaToUpper != -1:
		return false
	case c.deltaToLower != 0:
		return false
	case c.deltaToTitle != -1:
		return false
	}
	return true
}

// getCaseState computes the case classification and case deltas of
// code point i from chars.
func getCaseState(i rune) (c *caseState) {
	c = &caseState{point: i, _case: CaseNone}
	ch := &chars[i]
	switch ch.codePoint {
	case 0:
		c._case = CaseMissing // Will get NUL wrong but that doesn't matter
		return c
	case ch.upperCase:
		c._case = CaseUpper
	case ch.lowerCase:
		c._case = CaseLower
	case ch.titleCase:
		c._case = CaseTitle
	}
	// Some things such as roman numeral U+2161 don't describe themselves
	// as upper case, but have a lower case. Second-guess them.
	if c._case == CaseNone && ch.lowerCase != 0 {
		c._case = CaseUpper
	}
	// Same in the other direction.
	if c._case == CaseNone && ch.upperCase != 0 {
		c._case = CaseLower
	}

	if ch.upperCase != 0 {
		c.deltaToUpper = ch.upperCase - i
	}
	if ch.lowerCase != 0 {
		c.deltaToLower = ch.lowerCase - i
	}
	if ch.titleCase != 0 {
		c.deltaToTitle = ch.titleCase - i
	}
	return c
}

// printCases emits the CaseRanges table, merging runs of adjacent code
// points that share the same case behavior. With -test it verifies the
// installed case mappings instead; with -cases=false it does nothing.
func printCases() {
	if !*cases {
		return
	}
	if *test {
		fullCaseTest()
		return
	}
	printf(
		"// Generated by running\n"+
			"// maketables --data=%s --casefolding=%s\n"+
			"// DO NOT EDIT\n\n"+
			"// CaseRanges is the table describing case mappings for all letters with\n"+
			"// non-self mappings.\n"+
			"var CaseRanges = _CaseRanges\n"+
			"var _CaseRanges = []CaseRange {\n",
		*dataURL, *casefoldingURL)

	var startState *caseState    // the start of a run; nil for not active
	var prevState = &caseState{} // the state of the previous character
	for i := range chars {
		state := getCaseState(rune(i))
		if state.adjacent(prevState) {
			prevState = state
			continue
		}
		// end of run (possibly)
		printCaseRange(startState, prevState)
		startState = nil
		if state._case != CaseMissing && state._case != CaseNone {
			startState = state
		}
		prevState = state
	}
	// Emit a run that extends through the final code point, if any
	// (printCaseRange ignores a nil start).
	printCaseRange(startState, prevState)
	print("}\n")
}

// printCaseRange prints the CaseRange entry for the run lo..hi. It prints
// nothing when lo is nil or when the run maps every character to itself.
func printCaseRange(lo, hi *caseState) {
	if lo == nil {
		return
	}
	if lo.deltaToUpper == 0 && lo.deltaToLower == 0 && lo.deltaToTitle == 0 {
		// character represents itself in all cases - no need to mention it
		return
	}
	switch {
	case hi.point > lo.point && lo.isUpperLower():
		printf("\t{0x%04X, 0x%04X, d{UpperLower, UpperLower, UpperLower}},\n",
			lo.point, hi.point)
	case hi.point > lo.point && lo.isLowerUpper():
		// A LowerUpper entry would need support in To(); refuse to emit one.
		logger.Fatalf("LowerUpper sequence: should not happen: %U. If it's real, need to fix To()", lo.point)
	default:
		printf("\t{0x%04X, 0x%04X, d{%d, %d, %d}},\n",
			lo.point, hi.point,
			lo.deltaToUpper, lo.deltaToLower, lo.deltaToTitle)
	}
}

// If the cased value in the Char is 0, it means use the rune itself.
func caseIt(r, cased rune) rune {
	if cased == 0 {
		return r
	}
	return cased
}

// fullCaseTest reports on stderr every code point for which
// unicode.ToLower, ToUpper or ToTitle disagrees with the loaded data.
func fullCaseTest() {
	for j, c := range chars {
		i := rune(j)
		lower := unicode.ToLower(i)
		want := caseIt(i, c.lowerCase)
		if lower != want {
			fmt.Fprintf(os.Stderr, "lower %U should be %U is %U\n", i, want, lower)
		}
		upper := unicode.ToUpper(i)
		want = caseIt(i, c.upperCase)
		if upper != want {
			fmt.Fprintf(os.Stderr, "upper %U should be %U is %U\n", i, want, upper)
		}
		title := unicode.ToTitle(i)
		want = caseIt(i, c.titleCase)
		if title != want {
			fmt.Fprintf(os.Stderr, "title %U should be %U is %U\n", i, want, title)
		}
	}
}

// printLatinProperties emits the properties array, which gives the
// property bits of each Latin-1 code point. It does nothing with -test.
func printLatinProperties() {
	if *test {
		return
	}
	println("var properties = [MaxLatin1+1]uint8{")
	for code := 0; code <= unicode.MaxLatin1; code++ {
		var property string
		switch chars[code].category {
		case "Cc", "": // NUL has no category.
			property = "pC"
		case "Cf": // soft hyphen, unique category, not printable.
			property = "0"
		case "Ll":
			property = "pLl | pp"
		case "Lo":
			property = "pLo | pp"
		case "Lu":
			property = "pLu | pp"
		case "Nd", "No":
			property = "pN | pp"
		case "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps":
			property = "pP | pp"
		case "Sc", "Sk", "Sm", "So":
			property = "pS | pp"
		case "Zs":
			property = "pZ"
		default:
			logger.Fatalf("%U has unknown category %q", code, chars[code].category)
		}
		// Special case
		if code == ' ' {
			property = "pZ | pp"
		}
		printf("\t0x%02X: %s, // %q\n", code, property, code)
	}
	printf("}\n\n")
}

// printCasefold computes the simple case-folding orbits, stores them in
// chars[...].caseOrbit, and emits the ASCII fold table, the orbit table
// and the FoldCategory / FoldScript exception tables.
func printCasefold() {
	// Build list of case-folding groups attached to each canonical folded char (typically lower case).
	var caseOrbit = make([][]rune, MaxChar+1)
	for j := range chars {
		i := rune(j)
		c := &chars[i]
		if c.foldCase == 0 {
			continue
		}
		orb := caseOrbit[c.foldCase]
		if orb == nil {
			orb = append(orb, c.foldCase)
		}
		caseOrbit[c.foldCase] = append(orb, i)
	}

	// Insert explicit 1-element groups when assuming [lower, upper] would be wrong.
	for j := range chars {
		i := rune(j)
		c := &chars[i]
		f := c.foldCase
		if f == 0 {
			f = i
		}
		orb := caseOrbit[f]
		if orb == nil && (c.upperCase != 0 && c.upperCase != i || c.lowerCase != 0 && c.lowerCase != i) {
			// Default assumption of [upper, lower] is wrong.
			caseOrbit[i] = []rune{i}
		}
	}

	// Delete the groups for which assuming [lower, upper] or [upper, lower] is right.
	for i, orb := range caseOrbit {
		if len(orb) == 2 && chars[orb[0]].upperCase == orb[1] && chars[orb[1]].lowerCase == orb[0] {
			caseOrbit[i] = nil
		}
		if len(orb) == 2 && chars[orb[1]].upperCase == orb[0] && chars[orb[0]].lowerCase == orb[1] {
			caseOrbit[i] = nil
		}
	}

	// Record orbit information in chars.
	for _, orb := range caseOrbit {
		if orb == nil {
			continue
		}
		sort.Slice(orb, func(i, j int) bool {
			return orb[i] < orb[j]
		})
		// Link each member to the next in sorted order; the largest wraps
		// around to the smallest.
		c := orb[len(orb)-1]
		for _, d := range orb {
			chars[c].caseOrbit = d
			c = d
		}
	}

	printAsciiFold()
	printCaseOrbit()

	// Tables of category and script folding exceptions: code points
	// that must be added when interpreting a particular category/script
	// in a case-folding context.
	cat := make(map[string]map[rune]bool)
	for name := range category {
		if x := foldExceptions(inCategory(name)); len(x) > 0 {
			cat[name] = x
		}
	}

	scr := make(map[string]map[rune]bool)
	for name := range scripts {
		if x := foldExceptions(inScript(name)); len(x) > 0 {
			scr[name] = x
		}
	}

	printCatFold("FoldCategory", cat)
	printCatFold("FoldScript", scr)
}

// inCategory returns a list of all the runes in the category.
func inCategory(name string) []rune {
	var x []rune
	for j := range chars {
		i := rune(j)
		c := &chars[i]
		if c.category == name || len(name) == 1 && len(c.category) > 1 && c.category[0] == name[0] {
			x = append(x, i)
		}
	}
	return x
}

// inScript returns a list of all the runes in the script.
func inScript(name string) []rune {
	var x []rune
	for _, s := range scripts[name] {
		for c := s.lo; c <= s.hi; c++ {
			x = append(x, rune(c))
		}
	}
	return x
}

// foldExceptions returns a list of all the runes fold-equivalent
// to runes in class but not in class themselves.
func foldExceptions(class []rune) map[rune]bool {
	// Create map containing class and all fold-equivalent chars.
	m := make(map[rune]bool)
	for _, r := range class {
		c := &chars[r]
		if c.caseOrbit == 0 {
			// Just upper and lower.
			if u := c.upperCase; u != 0 {
				m[u] = true
			}
			if l := c.lowerCase; l != 0 {
				m[l] = true
			}
			m[r] = true
			continue
		}
		// Otherwise walk orbit.
		r0 := r
		for {
			m[r] = true
			r = chars[r].caseOrbit
			if r == r0 {
				break
			}
		}
	}

	// Remove class itself.
	for _, r := range class {
		delete(m, r)
	}

	// What's left is the exceptions.
	return m
}

// comment holds the doc comment text for each generated fold table,
// keyed by table name.
var comment = map[string]string{
	"FoldCategory": "// FoldCategory maps a category name to a table of\n" +
		"// code points outside the category that are equivalent under\n" +
		"// simple case folding to code points inside the category.\n" +
		"// If there is no entry for a category name, there are no such points.\n",

	"FoldScript": "// FoldScript maps a script name to a table of\n" +
		"// code points outside the script that are equivalent under\n" +
		"// simple case folding to code points inside the script.\n" +
		"// If there is no entry for a script name, there are no such points.\n",
}

// printAsciiFold emits the asciiFold array: for each ASCII code point,
// the next code point in its simple case-folding orbit (itself if none).
func printAsciiFold() {
	printf("var asciiFold = [MaxASCII + 1]uint16{\n")
	for i := rune(0); i <= unicode.MaxASCII; i++ {
		c := chars[i]
		f := c.caseOrbit
		if f == 0 {
			if c.lowerCase != i && c.lowerCase != 0 {
				f = c.lowerCase
			} else if c.upperCase != i && c.upperCase != 0 {
				f = c.upperCase
			} else {
				f = i
			}
		}
		printf("\t0x%04X,\n", f)
	}
	printf("}\n\n")
}

1309 func printCaseOrbit() { 1310 if *test { 1311 for j := range chars { 1312 i := rune(j) 1313 c := &chars[i] 1314 f := c.caseOrbit 1315 if f == 0 { 1316 if c.lowerCase != i && c.lowerCase != 0 { 1317 f = c.lowerCase 1318 } else if c.upperCase != i && c.upperCase != 0 { 1319 f = c.upperCase 1320 } else { 1321 f = i 1322 } 1323 } 1324 if g := unicode.SimpleFold(i); g != f { 1325 fmt.Fprintf(os.Stderr, "unicode.SimpleFold(%#U) = %#U, want 
%#U\n", i, g, f) 1326 } 1327 } 1328 return 1329 } 1330 1331 printf("var caseOrbit = []foldPair{\n") 1332 for i := range chars { 1333 c := &chars[i] 1334 if c.caseOrbit != 0 { 1335 printf("\t{0x%04X, 0x%04X},\n", i, c.caseOrbit) 1336 foldPairCount++ 1337 } 1338 } 1339 printf("}\n\n") 1340 } 1341 1342 func printCatFold(name string, m map[string]map[rune]bool) { 1343 if *test { 1344 var pkgMap map[string]*unicode.RangeTable 1345 if name == "FoldCategory" { 1346 pkgMap = unicode.FoldCategory 1347 } else { 1348 pkgMap = unicode.FoldScript 1349 } 1350 if len(pkgMap) != len(m) { 1351 fmt.Fprintf(os.Stderr, "unicode.%s has %d elements, want %d\n", name, len(pkgMap), len(m)) 1352 return 1353 } 1354 for k, v := range m { 1355 t, ok := pkgMap[k] 1356 if !ok { 1357 fmt.Fprintf(os.Stderr, "unicode.%s[%q] missing\n", name, k) 1358 continue 1359 } 1360 n := 0 1361 for _, r := range t.R16 { 1362 for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) { 1363 if !v[c] { 1364 fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c) 1365 } 1366 n++ 1367 } 1368 } 1369 for _, r := range t.R32 { 1370 for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) { 1371 if !v[c] { 1372 fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c) 1373 } 1374 n++ 1375 } 1376 } 1377 if n != len(v) { 1378 fmt.Fprintf(os.Stderr, "unicode.%s[%q] has %d code points, want %d\n", name, k, n, len(v)) 1379 } 1380 } 1381 return 1382 } 1383 1384 print(comment[name]) 1385 printf("var %s = map[string]*RangeTable{\n", name) 1386 for _, name := range allCatFold(m) { 1387 printf("\t%q: fold%s,\n", name, name) 1388 } 1389 printf("}\n\n") 1390 for _, name := range allCatFold(m) { 1391 class := m[name] 1392 dumpRange( 1393 fmt.Sprintf("var fold%s = &RangeTable{\n", name), 1394 func(code rune) bool { return class[code] }) 1395 } 1396 } 1397 1398 var range16Count = 0 // Number of entries in the 16-bit range tables. 
1399 var range32Count = 0 // Number of entries in the 32-bit range tables. 1400 var foldPairCount = 0 // Number of fold pairs in the exception tables. 1401 1402 func printSizes() { 1403 if *test { 1404 return 1405 } 1406 println() 1407 printf("// Range entries: %d 16-bit, %d 32-bit, %d total.\n", range16Count, range32Count, range16Count+range32Count) 1408 range16Bytes := range16Count * 3 * 2 1409 range32Bytes := range32Count * 3 * 4 1410 printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", range16Bytes, range32Bytes, range16Bytes+range32Bytes) 1411 println() 1412 printf("// Fold orbit bytes: %d pairs, %d bytes\n", foldPairCount, foldPairCount*2*2) 1413 } 1414 1415 1416 //DARA HACK TO GET A PIPE 1417 // Pipe returns a connected pair of Files; reads from r return bytes written to w. 1418 // It returns the files and an error, if any. 1419 /* 1420 func GPipe(name string) (r *File, w *File, err error) { 1421 var p [2]int 1422 1423 e := syscall.Pipe2(p[0:], syscall.O_CLOEXEC) 1424 // pipe2 was added in 2.6.27 and our minimum requirement is 2.6.23, so it 1425 // might not be implemented. 1426 if e == syscall.ENOSYS { 1427 // See ../syscall/exec.go for description of lock. 1428 syscall.ForkLock.RLock() 1429 e = syscall.Pipe(p[0:]) 1430 if e != nil { 1431 syscall.ForkLock.RUnlock() 1432 return nil, nil, NewSyscallError("pipe", e) 1433 } 1434 syscall.CloseOnExec(p[0]) 1435 syscall.CloseOnExec(p[1]) 1436 syscall.ForkLock.RUnlock() 1437 } else if e != nil { 1438 return nil, nil, NewSyscallError("pipe2", e) 1439 } 1440 1441 return newFile(uintptr(p[0]), name+"0", kindPipe), newFile(uintptr(p[1]), name+"|1", kindPipe), nil 1442 }*/