github.com/razvanm/vanadium-go-1.3@v0.0.0-20160721203343-4a65068e5915/src/unicode/maketables.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // +build ignore 6 7 // Unicode table generator. 8 // Data read from the web. 9 10 package main 11 12 import ( 13 "bufio" 14 "flag" 15 "fmt" 16 "io" 17 "log" 18 "net/http" 19 "os" 20 "os/exec" 21 "path/filepath" 22 "regexp" 23 "sort" 24 "strconv" 25 "strings" 26 "unicode" 27 ) 28 29 func main() { 30 flag.Parse() 31 setupOutput() 32 loadChars() // always needed 33 loadCasefold() 34 printCategories() 35 printScriptOrProperty(false) 36 printScriptOrProperty(true) 37 printCases() 38 printLatinProperties() 39 printCasefold() 40 printSizes() 41 flushOutput() 42 } 43 44 var dataURL = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt") 45 var casefoldingURL = flag.String("casefolding", "", "full URL for CaseFolding.txt; defaults to --url/CaseFolding.txt") 46 var url = flag.String("url", 47 "http://www.unicode.org/Public/7.0.0/ucd/", 48 "URL of Unicode database directory") 49 var tablelist = flag.String("tables", 50 "all", 51 "comma-separated list of which tables to generate; can be letter") 52 var scriptlist = flag.String("scripts", 53 "all", 54 "comma-separated list of which script tables to generate") 55 var proplist = flag.String("props", 56 "all", 57 "comma-separated list of which property tables to generate") 58 var cases = flag.Bool("cases", 59 true, 60 "generate case tables") 61 var test = flag.Bool("test", 62 false, 63 "test existing tables; can be used to compare web data with package data") 64 var localFiles = flag.Bool("local", 65 false, 66 "data files have been copied to current directory; for debugging only") 67 var outputFile = flag.String("output", 68 "", 69 "output file for generated tables; default stdout") 70 71 var scriptRe = regexp.MustCompile(`^([0-9A-F]+)(\.\.[0-9A-F]+)? *; ([A-Za-z_]+)$`) 72 var logger = log.New(os.Stderr, "", log.Lshortfile) 73 74 var output *bufio.Writer // points to os.Stdout or to "gofmt > outputFile" 75 76 func setupOutput() { 77 output = bufio.NewWriter(startGofmt()) 78 } 79 80 // startGofmt connects output to a gofmt process if -output is set. 81 func startGofmt() io.Writer { 82 if *outputFile == "" { 83 return os.Stdout 84 } 85 stdout, err := os.Create(*outputFile) 86 if err != nil { 87 logger.Fatal(err) 88 } 89 // Pipe output to gofmt. 90 gofmt := exec.Command("gofmt") 91 fd, err := gofmt.StdinPipe() 92 if err != nil { 93 logger.Fatal(err) 94 } 95 gofmt.Stdout = stdout 96 gofmt.Stderr = os.Stderr 97 err = gofmt.Start() 98 if err != nil { 99 logger.Fatal(err) 100 } 101 return fd 102 } 103 104 func flushOutput() { 105 err := output.Flush() 106 if err != nil { 107 logger.Fatal(err) 108 } 109 } 110 111 func printf(format string, args ...interface{}) { 112 fmt.Fprintf(output, format, args...) 113 } 114 115 func print(args ...interface{}) { 116 fmt.Fprint(output, args...) 117 } 118 119 func println(args ...interface{}) { 120 fmt.Fprintln(output, args...) 121 } 122 123 type reader struct { 124 *bufio.Reader 125 fd *os.File 126 resp *http.Response 127 } 128 129 func open(url string) *reader { 130 file := filepath.Base(url) 131 if *localFiles { 132 fd, err := os.Open(file) 133 if err != nil { 134 logger.Fatal(err) 135 } 136 return &reader{bufio.NewReader(fd), fd, nil} 137 } 138 resp, err := http.Get(url) 139 if err != nil { 140 logger.Fatal(err) 141 } 142 if resp.StatusCode != 200 { 143 logger.Fatalf("bad GET status for %s: %d", file, resp.Status) 144 } 145 return &reader{bufio.NewReader(resp.Body), nil, resp} 146 147 } 148 149 func (r *reader) close() { 150 if r.fd != nil { 151 r.fd.Close() 152 } else { 153 r.resp.Body.Close() 154 } 155 } 156 157 var category = map[string]bool{ 158 // Nd Lu etc. 159 // We use one-character names to identify merged categories 160 "L": true, // Lu Ll Lt Lm Lo 161 "P": true, // Pc Pd Ps Pe Pu Pf Po 162 "M": true, // Mn Mc Me 163 "N": true, // Nd Nl No 164 "S": true, // Sm Sc Sk So 165 "Z": true, // Zs Zl Zp 166 "C": true, // Cc Cf Cs Co Cn 167 } 168 169 // UnicodeData.txt has form: 170 // 0037;DIGIT SEVEN;Nd;0;EN;;7;7;7;N;;;;; 171 // 007A;LATIN SMALL LETTER Z;Ll;0;L;;;;;N;;;005A;;005A 172 // See http://www.unicode.org/reports/tr44/ for a full explanation 173 // The fields: 174 const ( 175 FCodePoint = iota 176 FName 177 FGeneralCategory 178 FCanonicalCombiningClass 179 FBidiClass 180 FDecompositionTypeAndMapping 181 FNumericType 182 FNumericDigit // If a decimal digit. 183 FNumericValue // Includes non-decimal, e.g. U+2155=1/5 184 FBidiMirrored 185 FUnicode1Name 186 FISOComment 187 FSimpleUppercaseMapping 188 FSimpleLowercaseMapping 189 FSimpleTitlecaseMapping 190 NumField 191 192 MaxChar = 0x10FFFF // anything above this shouldn't exist 193 ) 194 195 var fieldName = []string{ 196 FCodePoint: "CodePoint", 197 FName: "Name", 198 FGeneralCategory: "GeneralCategory", 199 FCanonicalCombiningClass: "CanonicalCombiningClass", 200 FBidiClass: "BidiClass", 201 FDecompositionTypeAndMapping: "DecompositionTypeAndMapping", 202 FNumericType: "NumericType", 203 FNumericDigit: "NumericDigit", 204 FNumericValue: "NumericValue", 205 FBidiMirrored: "BidiMirrored", 206 FUnicode1Name: "Unicode1Name", 207 FISOComment: "ISOComment", 208 FSimpleUppercaseMapping: "SimpleUppercaseMapping", 209 FSimpleLowercaseMapping: "SimpleLowercaseMapping", 210 FSimpleTitlecaseMapping: "SimpleTitlecaseMapping", 211 } 212 213 // This contains only the properties we're interested in. 214 type Char struct { 215 field []string // debugging only; could be deleted if we take out char.dump() 216 codePoint rune // if zero, this index is not a valid code point. 217 category string 218 upperCase rune 219 lowerCase rune 220 titleCase rune 221 foldCase rune // simple case folding 222 caseOrbit rune // next in simple case folding orbit 223 } 224 225 // Scripts.txt has form: 226 // A673 ; Cyrillic # Po SLAVONIC ASTERISK 227 // A67C..A67D ; Cyrillic # Mn [2] COMBINING CYRILLIC KAVYKA..COMBINING CYRILLIC PAYEROK 228 // See http://www.unicode.org/Public/5.1.0/ucd/UCD.html for full explanation 229 230 type Script struct { 231 lo, hi uint32 // range of code points 232 script string 233 } 234 235 var chars = make([]Char, MaxChar+1) 236 var scripts = make(map[string][]Script) 237 var props = make(map[string][]Script) // a property looks like a script; can share the format 238 239 var lastChar rune = 0 240 241 // In UnicodeData.txt, some ranges are marked like this: 242 // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; 243 // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;; 244 // parseCategory returns a state variable indicating the weirdness. 245 type State int 246 247 const ( 248 SNormal State = iota // known to be zero for the type 249 SFirst 250 SLast 251 SMissing 252 ) 253 254 func parseCategory(line string) (state State) { 255 field := strings.Split(line, ";") 256 if len(field) != NumField { 257 logger.Fatalf("%5s: %d fields (expected %d)\n", line, len(field), NumField) 258 } 259 point, err := strconv.ParseUint(field[FCodePoint], 16, 64) 260 if err != nil { 261 logger.Fatalf("%.5s...: %s", line, err) 262 } 263 lastChar = rune(point) 264 if point == 0 { 265 return // not interesting and we use 0 as unset 266 } 267 if point > MaxChar { 268 return 269 } 270 char := &chars[point] 271 char.field = field 272 if char.codePoint != 0 { 273 logger.Fatalf("point %U reused", point) 274 } 275 char.codePoint = lastChar 276 char.category = field[FGeneralCategory] 277 category[char.category] = true 278 switch char.category { 279 case "Nd": 280 // Decimal digit 281 _, err := strconv.Atoi(field[FNumericValue]) 282 if err != nil { 283 logger.Fatalf("%U: bad numeric field: %s", point, err) 284 } 285 case "Lu": 286 char.letter(field[FCodePoint], field[FSimpleLowercaseMapping], field[FSimpleTitlecaseMapping]) 287 case "Ll": 288 char.letter(field[FSimpleUppercaseMapping], field[FCodePoint], field[FSimpleTitlecaseMapping]) 289 case "Lt": 290 char.letter(field[FSimpleUppercaseMapping], field[FSimpleLowercaseMapping], field[FCodePoint]) 291 default: 292 char.letter(field[FSimpleUppercaseMapping], field[FSimpleLowercaseMapping], field[FSimpleTitlecaseMapping]) 293 } 294 switch { 295 case strings.Index(field[FName], ", First>") > 0: 296 state = SFirst 297 case strings.Index(field[FName], ", Last>") > 0: 298 state = SLast 299 } 300 return 301 } 302 303 func (char *Char) dump(s string) { 304 print(s, " ") 305 for i := 0; i < len(char.field); i++ { 306 printf("%s:%q ", fieldName[i], char.field[i]) 307 } 308 print("\n") 309 } 310 311 func (char *Char) letter(u, l, t string) { 312 char.upperCase = char.letterValue(u, "U") 313 char.lowerCase = char.letterValue(l, "L") 314 char.titleCase = char.letterValue(t, "T") 315 } 316 317 func (char *Char) letterValue(s string, cas string) rune { 318 if s == "" { 319 return 0 320 } 321 v, err := strconv.ParseUint(s, 16, 64) 322 if err != nil { 323 char.dump(cas) 324 logger.Fatalf("%U: bad letter(%s): %s", char.codePoint, s, err) 325 } 326 return rune(v) 327 } 328 329 func allCategories() []string { 330 a := make([]string, 0, len(category)) 331 for k := range category { 332 a = append(a, k) 333 } 334 sort.Strings(a) 335 return a 336 } 337 338 func all(scripts map[string][]Script) []string { 339 a := make([]string, 0, len(scripts)) 340 for k := range scripts { 341 a = append(a, k) 342 } 343 sort.Strings(a) 344 return a 345 } 346 347 func allCatFold(m map[string]map[rune]bool) []string { 348 a := make([]string, 0, len(m)) 349 for k := range m { 350 a = append(a, k) 351 } 352 sort.Strings(a) 353 return a 354 } 355 356 // Extract the version number from the URL 357 func version() string { 358 // Break on slashes and look for the first numeric field 359 fields := strings.Split(*url, "/") 360 for _, f := range fields { 361 if len(f) > 0 && '0' <= f[0] && f[0] <= '9' { 362 return f 363 } 364 } 365 logger.Fatal("unknown version") 366 return "Unknown" 367 } 368 369 func categoryOp(code rune, class uint8) bool { 370 category := chars[code].category 371 return len(category) > 0 && category[0] == class 372 } 373 374 func loadChars() { 375 if *dataURL == "" { 376 flag.Set("data", *url+"UnicodeData.txt") 377 } 378 input := open(*dataURL) 379 defer input.close() 380 scanner := bufio.NewScanner(input) 381 var first rune = 0 382 for scanner.Scan() { 383 switch parseCategory(scanner.Text()) { 384 case SNormal: 385 if first != 0 { 386 logger.Fatalf("bad state normal at %U", lastChar) 387 } 388 case SFirst: 389 if first != 0 { 390 logger.Fatalf("bad state first at %U", lastChar) 391 } 392 first = lastChar 393 case SLast: 394 if first == 0 { 395 logger.Fatalf("bad state last at %U", lastChar) 396 } 397 for i := first + 1; i <= lastChar; i++ { 398 chars[i] = chars[first] 399 chars[i].codePoint = i 400 } 401 first = 0 402 } 403 } 404 if scanner.Err() != nil { 405 logger.Fatal(scanner.Err()) 406 } 407 } 408 409 func loadCasefold() { 410 if *casefoldingURL == "" { 411 flag.Set("casefolding", *url+"CaseFolding.txt") 412 } 413 input := open(*casefoldingURL) 414 defer input.close() 415 scanner := bufio.NewScanner(input) 416 for scanner.Scan() { 417 line := scanner.Text() 418 if len(line) == 0 || line[0] == '#' || len(strings.TrimSpace(line)) == 0 { 419 continue 420 } 421 field := strings.Split(line, "; ") 422 if len(field) != 4 { 423 logger.Fatalf("CaseFolding.txt %.5s...: %d fields (expected %d)\n", line, len(field), 4) 424 } 425 kind := field[1] 426 if kind != "C" && kind != "S" { 427 // Only care about 'common' and 'simple' foldings. 428 continue 429 } 430 p1, err := strconv.ParseUint(field[0], 16, 64) 431 if err != nil { 432 logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err) 433 } 434 p2, err := strconv.ParseUint(field[2], 16, 64) 435 if err != nil { 436 logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err) 437 } 438 chars[p1].foldCase = rune(p2) 439 } 440 if scanner.Err() != nil { 441 logger.Fatal(scanner.Err()) 442 } 443 } 444 445 const progHeader = `// Copyright 2013 The Go Authors. All rights reserved. 446 // Use of this source code is governed by a BSD-style 447 // license that can be found in the LICENSE file. 448 449 // Generated by running 450 // maketables --tables=%s --data=%s --casefolding=%s 451 // DO NOT EDIT 452 453 package unicode 454 455 ` 456 457 func printCategories() { 458 if *tablelist == "" { 459 return 460 } 461 // Find out which categories to dump 462 list := strings.Split(*tablelist, ",") 463 if *tablelist == "all" { 464 list = allCategories() 465 } 466 if *test { 467 fullCategoryTest(list) 468 return 469 } 470 printf(progHeader, *tablelist, *dataURL, *casefoldingURL) 471 472 println("// Version is the Unicode edition from which the tables are derived.") 473 printf("const Version = %q\n\n", version()) 474 475 if *tablelist == "all" { 476 println("// Categories is the set of Unicode category tables.") 477 println("var Categories = map[string] *RangeTable {") 478 for _, k := range allCategories() { 479 printf("\t%q: %s,\n", k, k) 480 } 481 print("}\n\n") 482 } 483 484 decl := make(sort.StringSlice, len(list)) 485 ndecl := 0 486 for _, name := range list { 487 if _, ok := category[name]; !ok { 488 logger.Fatal("unknown category", name) 489 } 490 // We generate an UpperCase name to serve as concise documentation and an _UnderScored 491 // name to store the data. This stops godoc dumping all the tables but keeps them 492 // available to clients. 493 // Cases deserving special comments 494 varDecl := "" 495 switch name { 496 case "C": 497 varDecl = "\tOther = _C; // Other/C is the set of Unicode control and special characters, category C.\n" 498 varDecl += "\tC = _C\n" 499 case "L": 500 varDecl = "\tLetter = _L; // Letter/L is the set of Unicode letters, category L.\n" 501 varDecl += "\tL = _L\n" 502 case "M": 503 varDecl = "\tMark = _M; // Mark/M is the set of Unicode mark characters, category M.\n" 504 varDecl += "\tM = _M\n" 505 case "N": 506 varDecl = "\tNumber = _N; // Number/N is the set of Unicode number characters, category N.\n" 507 varDecl += "\tN = _N\n" 508 case "P": 509 varDecl = "\tPunct = _P; // Punct/P is the set of Unicode punctuation characters, category P.\n" 510 varDecl += "\tP = _P\n" 511 case "S": 512 varDecl = "\tSymbol = _S; // Symbol/S is the set of Unicode symbol characters, category S.\n" 513 varDecl += "\tS = _S\n" 514 case "Z": 515 varDecl = "\tSpace = _Z; // Space/Z is the set of Unicode space characters, category Z.\n" 516 varDecl += "\tZ = _Z\n" 517 case "Nd": 518 varDecl = "\tDigit = _Nd; // Digit is the set of Unicode characters with the \"decimal digit\" property.\n" 519 case "Lu": 520 varDecl = "\tUpper = _Lu; // Upper is the set of Unicode upper case letters.\n" 521 case "Ll": 522 varDecl = "\tLower = _Ll; // Lower is the set of Unicode lower case letters.\n" 523 case "Lt": 524 varDecl = "\tTitle = _Lt; // Title is the set of Unicode title case letters.\n" 525 } 526 if len(name) > 1 { 527 varDecl += fmt.Sprintf( 528 "\t%s = _%s; // %s is the set of Unicode characters in category %s.\n", 529 name, name, name, name) 530 } 531 decl[ndecl] = varDecl 532 ndecl++ 533 if len(name) == 1 { // unified categories 534 decl := fmt.Sprintf("var _%s = &RangeTable{\n", name) 535 dumpRange( 536 decl, 537 func(code rune) bool { return categoryOp(code, name[0]) }) 538 continue 539 } 540 dumpRange( 541 fmt.Sprintf("var _%s = &RangeTable{\n", name), 542 func(code rune) bool { return chars[code].category == name }) 543 } 544 decl.Sort() 545 println("// These variables have type *RangeTable.") 546 println("var (") 547 for _, d := range decl { 548 print(d) 549 } 550 print(")\n\n") 551 } 552 553 type Op func(code rune) bool 554 555 const format = "\t\t{0x%04x, 0x%04x, %d},\n" 556 557 func dumpRange(header string, inCategory Op) { 558 print(header) 559 next := rune(0) 560 latinOffset := 0 561 print("\tR16: []Range16{\n") 562 // one Range for each iteration 563 count := &range16Count 564 size := 16 565 for { 566 // look for start of range 567 for next < rune(len(chars)) && !inCategory(next) { 568 next++ 569 } 570 if next >= rune(len(chars)) { 571 // no characters remain 572 break 573 } 574 575 // start of range 576 lo := next 577 hi := next 578 stride := rune(1) 579 // accept lo 580 next++ 581 // look for another character to set the stride 582 for next < rune(len(chars)) && !inCategory(next) { 583 next++ 584 } 585 if next >= rune(len(chars)) { 586 // no more characters 587 printf(format, lo, hi, stride) 588 break 589 } 590 // set stride 591 stride = next - lo 592 // check for length of run. next points to first jump in stride 593 for i := next; i < rune(len(chars)); i++ { 594 if inCategory(i) == (((i - lo) % stride) == 0) { 595 // accept 596 if inCategory(i) { 597 hi = i 598 } 599 } else { 600 // no more characters in this run 601 break 602 } 603 } 604 if uint32(hi) <= unicode.MaxLatin1 { 605 latinOffset++ 606 } 607 size, count = printRange(uint32(lo), uint32(hi), uint32(stride), size, count) 608 // next range: start looking where this range ends 609 next = hi + 1 610 } 611 print("\t},\n") 612 if latinOffset > 0 { 613 printf("\tLatinOffset: %d,\n", latinOffset) 614 } 615 print("}\n\n") 616 } 617 618 func printRange(lo, hi, stride uint32, size int, count *int) (int, *int) { 619 if size == 16 && hi >= 1<<16 { 620 if lo < 1<<16 { 621 if lo+stride != hi { 622 logger.Fatalf("unexpected straddle: %U %U %d", lo, hi, stride) 623 } 624 // No range contains U+FFFF as an instance, so split 625 // the range into two entries. That way we can maintain 626 // the invariant that R32 contains only >= 1<<16. 627 printf(format, lo, lo, 1) 628 lo = hi 629 stride = 1 630 *count++ 631 } 632 print("\t},\n") 633 print("\tR32: []Range32{\n") 634 size = 32 635 count = &range32Count 636 } 637 printf(format, lo, hi, stride) 638 *count++ 639 return size, count 640 } 641 642 func fullCategoryTest(list []string) { 643 for _, name := range list { 644 if _, ok := category[name]; !ok { 645 logger.Fatal("unknown category", name) 646 } 647 r, ok := unicode.Categories[name] 648 if !ok && len(name) > 1 { 649 logger.Fatalf("unknown table %q", name) 650 } 651 if len(name) == 1 { 652 verifyRange(name, func(code rune) bool { return categoryOp(code, name[0]) }, r) 653 } else { 654 verifyRange( 655 name, 656 func(code rune) bool { return chars[code].category == name }, 657 r) 658 } 659 } 660 } 661 662 func verifyRange(name string, inCategory Op, table *unicode.RangeTable) { 663 count := 0 664 for j := range chars { 665 i := rune(j) 666 web := inCategory(i) 667 pkg := unicode.Is(table, i) 668 if web != pkg { 669 fmt.Fprintf(os.Stderr, "%s: %U: web=%t pkg=%t\n", name, i, web, pkg) 670 count++ 671 if count > 10 { 672 break 673 } 674 } 675 } 676 } 677 678 func parseScript(line string, scripts map[string][]Script) { 679 comment := strings.Index(line, "#") 680 if comment >= 0 { 681 line = line[0:comment] 682 } 683 line = strings.TrimSpace(line) 684 if len(line) == 0 { 685 return 686 } 687 field := strings.Split(line, ";") 688 if len(field) != 2 { 689 logger.Fatalf("%s: %d fields (expected 2)\n", line, len(field)) 690 } 691 matches := scriptRe.FindStringSubmatch(line) 692 if len(matches) != 4 { 693 logger.Fatalf("%s: %d matches (expected 3)\n", line, len(matches)) 694 } 695 lo, err := strconv.ParseUint(matches[1], 16, 64) 696 if err != nil { 697 logger.Fatalf("%.5s...: %s", line, err) 698 } 699 hi := lo 700 if len(matches[2]) > 2 { // ignore leading .. 701 hi, err = strconv.ParseUint(matches[2][2:], 16, 64) 702 if err != nil { 703 logger.Fatalf("%.5s...: %s", line, err) 704 } 705 } 706 name := matches[3] 707 scripts[name] = append(scripts[name], Script{uint32(lo), uint32(hi), name}) 708 } 709 710 // The script tables have a lot of adjacent elements. Fold them together. 711 func foldAdjacent(r []Script) []unicode.Range32 { 712 s := make([]unicode.Range32, 0, len(r)) 713 j := 0 714 for i := 0; i < len(r); i++ { 715 if j > 0 && r[i].lo == s[j-1].Hi+1 { 716 s[j-1].Hi = r[i].hi 717 } else { 718 s = s[0 : j+1] 719 s[j] = unicode.Range32{ 720 Lo: uint32(r[i].lo), 721 Hi: uint32(r[i].hi), 722 Stride: 1, 723 } 724 j++ 725 } 726 } 727 return s 728 } 729 730 func fullScriptTest(list []string, installed map[string]*unicode.RangeTable, scripts map[string][]Script) { 731 for _, name := range list { 732 if _, ok := scripts[name]; !ok { 733 logger.Fatal("unknown script", name) 734 } 735 _, ok := installed[name] 736 if !ok { 737 logger.Fatal("unknown table", name) 738 } 739 for _, script := range scripts[name] { 740 for r := script.lo; r <= script.hi; r++ { 741 if !unicode.Is(installed[name], rune(r)) { 742 fmt.Fprintf(os.Stderr, "%U: not in script %s\n", r, name) 743 } 744 } 745 } 746 } 747 } 748 749 // PropList.txt has the same format as Scripts.txt so we can share its parser. 750 func printScriptOrProperty(doProps bool) { 751 flag := "scripts" 752 flaglist := *scriptlist 753 file := "Scripts.txt" 754 table := scripts 755 installed := unicode.Scripts 756 if doProps { 757 flag = "props" 758 flaglist = *proplist 759 file = "PropList.txt" 760 table = props 761 installed = unicode.Properties 762 } 763 if flaglist == "" { 764 return 765 } 766 input := open(*url + file) 767 scanner := bufio.NewScanner(input) 768 for scanner.Scan() { 769 parseScript(scanner.Text(), table) 770 } 771 if scanner.Err() != nil { 772 logger.Fatal(scanner.Err()) 773 } 774 input.close() 775 776 // Find out which scripts to dump 777 list := strings.Split(flaglist, ",") 778 if flaglist == "all" { 779 list = all(table) 780 } 781 if *test { 782 fullScriptTest(list, installed, table) 783 return 784 } 785 786 printf( 787 "// Generated by running\n"+ 788 "// maketables --%s=%s --url=%s\n"+ 789 "// DO NOT EDIT\n\n", 790 flag, 791 flaglist, 792 *url) 793 if flaglist == "all" { 794 if doProps { 795 println("// Properties is the set of Unicode property tables.") 796 println("var Properties = map[string] *RangeTable{") 797 } else { 798 println("// Scripts is the set of Unicode script tables.") 799 println("var Scripts = map[string] *RangeTable{") 800 } 801 for _, k := range all(table) { 802 printf("\t%q: %s,\n", k, k) 803 } 804 print("}\n\n") 805 } 806 807 decl := make(sort.StringSlice, len(list)) 808 ndecl := 0 809 for _, name := range list { 810 if doProps { 811 decl[ndecl] = fmt.Sprintf( 812 "\t%s = _%s;\t// %s is the set of Unicode characters with property %s.\n", 813 name, name, name, name) 814 } else { 815 decl[ndecl] = fmt.Sprintf( 816 "\t%s = _%s;\t// %s is the set of Unicode characters in script %s.\n", 817 name, name, name, name) 818 } 819 ndecl++ 820 printf("var _%s = &RangeTable {\n", name) 821 ranges := foldAdjacent(table[name]) 822 print("\tR16: []Range16{\n") 823 size := 16 824 count := &range16Count 825 for _, s := range ranges { 826 size, count = printRange(s.Lo, s.Hi, s.Stride, size, count) 827 } 828 print("\t},\n") 829 if off := findLatinOffset(ranges); off > 0 { 830 printf("\tLatinOffset: %d,\n", off) 831 } 832 print("}\n\n") 833 } 834 decl.Sort() 835 println("// These variables have type *RangeTable.") 836 println("var (") 837 for _, d := range decl { 838 print(d) 839 } 840 print(")\n\n") 841 } 842 843 func findLatinOffset(ranges []unicode.Range32) int { 844 i := 0 845 for i < len(ranges) && ranges[i].Hi <= unicode.MaxLatin1 { 846 i++ 847 } 848 return i 849 } 850 851 const ( 852 CaseUpper = 1 << iota 853 CaseLower 854 CaseTitle 855 CaseNone = 0 // must be zero 856 CaseMissing = -1 // character not present; not a valid case state 857 ) 858 859 type caseState struct { 860 point rune 861 _case int 862 deltaToUpper rune 863 deltaToLower rune 864 deltaToTitle rune 865 } 866 867 // Is d a continuation of the state of c? 868 func (c *caseState) adjacent(d *caseState) bool { 869 if d.point < c.point { 870 c, d = d, c 871 } 872 switch { 873 case d.point != c.point+1: // code points not adjacent (shouldn't happen) 874 return false 875 case d._case != c._case: // different cases 876 return c.upperLowerAdjacent(d) 877 case c._case == CaseNone: 878 return false 879 case c._case == CaseMissing: 880 return false 881 case d.deltaToUpper != c.deltaToUpper: 882 return false 883 case d.deltaToLower != c.deltaToLower: 884 return false 885 case d.deltaToTitle != c.deltaToTitle: 886 return false 887 } 888 return true 889 } 890 891 // Is d the same as c, but opposite in upper/lower case? this would make it 892 // an element of an UpperLower sequence. 893 func (c *caseState) upperLowerAdjacent(d *caseState) bool { 894 // check they're a matched case pair. we know they have adjacent values 895 switch { 896 case c._case == CaseUpper && d._case != CaseLower: 897 return false 898 case c._case == CaseLower && d._case != CaseUpper: 899 return false 900 } 901 // matched pair (at least in upper/lower). make the order Upper Lower 902 if c._case == CaseLower { 903 c, d = d, c 904 } 905 // for an Upper Lower sequence the deltas have to be in order 906 // c: 0 1 0 907 // d: -1 0 -1 908 switch { 909 case c.deltaToUpper != 0: 910 return false 911 case c.deltaToLower != 1: 912 return false 913 case c.deltaToTitle != 0: 914 return false 915 case d.deltaToUpper != -1: 916 return false 917 case d.deltaToLower != 0: 918 return false 919 case d.deltaToTitle != -1: 920 return false 921 } 922 return true 923 } 924 925 // Does this character start an UpperLower sequence? 926 func (c *caseState) isUpperLower() bool { 927 // for an Upper Lower sequence the deltas have to be in order 928 // c: 0 1 0 929 switch { 930 case c.deltaToUpper != 0: 931 return false 932 case c.deltaToLower != 1: 933 return false 934 case c.deltaToTitle != 0: 935 return false 936 } 937 return true 938 } 939 940 // Does this character start a LowerUpper sequence? 941 func (c *caseState) isLowerUpper() bool { 942 // for an Upper Lower sequence the deltas have to be in order 943 // c: -1 0 -1 944 switch { 945 case c.deltaToUpper != -1: 946 return false 947 case c.deltaToLower != 0: 948 return false 949 case c.deltaToTitle != -1: 950 return false 951 } 952 return true 953 } 954 955 func getCaseState(i rune) (c *caseState) { 956 c = &caseState{point: i, _case: CaseNone} 957 ch := &chars[i] 958 switch ch.codePoint { 959 case 0: 960 c._case = CaseMissing // Will get NUL wrong but that doesn't matter 961 return 962 case ch.upperCase: 963 c._case = CaseUpper 964 case ch.lowerCase: 965 c._case = CaseLower 966 case ch.titleCase: 967 c._case = CaseTitle 968 } 969 // Some things such as roman numeral U+2161 don't describe themselves 970 // as upper case, but have a lower case. Second-guess them. 971 if c._case == CaseNone && ch.lowerCase != 0 { 972 c._case = CaseUpper 973 } 974 // Same in the other direction. 975 if c._case == CaseNone && ch.upperCase != 0 { 976 c._case = CaseLower 977 } 978 979 if ch.upperCase != 0 { 980 c.deltaToUpper = ch.upperCase - i 981 } 982 if ch.lowerCase != 0 { 983 c.deltaToLower = ch.lowerCase - i 984 } 985 if ch.titleCase != 0 { 986 c.deltaToTitle = ch.titleCase - i 987 } 988 return 989 } 990 991 func printCases() { 992 if !*cases { 993 return 994 } 995 if *test { 996 fullCaseTest() 997 return 998 } 999 printf( 1000 "// Generated by running\n"+ 1001 "// maketables --data=%s --casefolding=%s\n"+ 1002 "// DO NOT EDIT\n\n"+ 1003 "// CaseRanges is the table describing case mappings for all letters with\n"+ 1004 "// non-self mappings.\n"+ 1005 "var CaseRanges = _CaseRanges\n"+ 1006 "var _CaseRanges = []CaseRange {\n", 1007 *dataURL, *casefoldingURL) 1008 1009 var startState *caseState // the start of a run; nil for not active 1010 var prevState = &caseState{} // the state of the previous character 1011 for i := range chars { 1012 state := getCaseState(rune(i)) 1013 if state.adjacent(prevState) { 1014 prevState = state 1015 continue 1016 } 1017 // end of run (possibly) 1018 printCaseRange(startState, prevState) 1019 startState = nil 1020 if state._case != CaseMissing && state._case != CaseNone { 1021 startState = state 1022 } 1023 prevState = state 1024 } 1025 print("}\n") 1026 } 1027 1028 func printCaseRange(lo, hi *caseState) { 1029 if lo == nil { 1030 return 1031 } 1032 if lo.deltaToUpper == 0 && lo.deltaToLower == 0 && lo.deltaToTitle == 0 { 1033 // character represents itself in all cases - no need to mention it 1034 return 1035 } 1036 switch { 1037 case hi.point > lo.point && lo.isUpperLower(): 1038 printf("\t{0x%04X, 0x%04X, d{UpperLower, UpperLower, UpperLower}},\n", 1039 lo.point, hi.point) 1040 case hi.point > lo.point && lo.isLowerUpper(): 1041 logger.Fatalf("LowerUpper sequence: should not happen: %U. If it's real, need to fix To()", lo.point) 1042 printf("\t{0x%04X, 0x%04X, d{LowerUpper, LowerUpper, LowerUpper}},\n", 1043 lo.point, hi.point) 1044 default: 1045 printf("\t{0x%04X, 0x%04X, d{%d, %d, %d}},\n", 1046 lo.point, hi.point, 1047 lo.deltaToUpper, lo.deltaToLower, lo.deltaToTitle) 1048 } 1049 } 1050 1051 // If the cased value in the Char is 0, it means use the rune itself. 1052 func caseIt(r, cased rune) rune { 1053 if cased == 0 { 1054 return r 1055 } 1056 return cased 1057 } 1058 1059 func fullCaseTest() { 1060 for j, c := range chars { 1061 i := rune(j) 1062 lower := unicode.ToLower(i) 1063 want := caseIt(i, c.lowerCase) 1064 if lower != want { 1065 fmt.Fprintf(os.Stderr, "lower %U should be %U is %U\n", i, want, lower) 1066 } 1067 upper := unicode.ToUpper(i) 1068 want = caseIt(i, c.upperCase) 1069 if upper != want { 1070 fmt.Fprintf(os.Stderr, "upper %U should be %U is %U\n", i, want, upper) 1071 } 1072 title := unicode.ToTitle(i) 1073 want = caseIt(i, c.titleCase) 1074 if title != want { 1075 fmt.Fprintf(os.Stderr, "title %U should be %U is %U\n", i, want, title) 1076 } 1077 } 1078 } 1079 1080 func printLatinProperties() { 1081 if *test { 1082 return 1083 } 1084 println("var properties = [MaxLatin1+1]uint8{") 1085 for code := 0; code <= unicode.MaxLatin1; code++ { 1086 var property string 1087 switch chars[code].category { 1088 case "Cc", "": // NUL has no category. 1089 property = "pC" 1090 case "Cf": // soft hyphen, unique category, not printable. 1091 property = "0" 1092 case "Ll": 1093 property = "pLl | pp" 1094 case "Lo": 1095 property = "pLo | pp" 1096 case "Lu": 1097 property = "pLu | pp" 1098 case "Nd", "No": 1099 property = "pN | pp" 1100 case "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps": 1101 property = "pP | pp" 1102 case "Sc", "Sk", "Sm", "So": 1103 property = "pS | pp" 1104 case "Zs": 1105 property = "pZ" 1106 default: 1107 logger.Fatalf("%U has unknown category %q", code, chars[code].category) 1108 } 1109 // Special case 1110 if code == ' ' { 1111 property = "pZ | pp" 1112 } 1113 printf("\t0x%02X: %s, // %q\n", code, property, code) 1114 } 1115 printf("}\n\n") 1116 } 1117 1118 type runeSlice []rune 1119 1120 func (p runeSlice) Len() int { return len(p) } 1121 func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] } 1122 func (p runeSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] } 1123 1124 func printCasefold() { 1125 // Build list of case-folding groups attached to each canonical folded char (typically lower case). 1126 var caseOrbit = make([][]rune, MaxChar+1) 1127 for j := range chars { 1128 i := rune(j) 1129 c := &chars[i] 1130 if c.foldCase == 0 { 1131 continue 1132 } 1133 orb := caseOrbit[c.foldCase] 1134 if orb == nil { 1135 orb = append(orb, c.foldCase) 1136 } 1137 caseOrbit[c.foldCase] = append(orb, i) 1138 } 1139 1140 // Insert explicit 1-element groups when assuming [lower, upper] would be wrong. 1141 for j := range chars { 1142 i := rune(j) 1143 c := &chars[i] 1144 f := c.foldCase 1145 if f == 0 { 1146 f = i 1147 } 1148 orb := caseOrbit[f] 1149 if orb == nil && (c.upperCase != 0 && c.upperCase != i || c.lowerCase != 0 && c.lowerCase != i) { 1150 // Default assumption of [upper, lower] is wrong. 1151 caseOrbit[i] = []rune{i} 1152 } 1153 } 1154 1155 // Delete the groups for which assuming [lower, upper] is right. 1156 for i, orb := range caseOrbit { 1157 if len(orb) == 2 && chars[orb[0]].upperCase == orb[1] && chars[orb[1]].lowerCase == orb[0] { 1158 caseOrbit[i] = nil 1159 } 1160 } 1161 1162 // Record orbit information in chars. 1163 for _, orb := range caseOrbit { 1164 if orb == nil { 1165 continue 1166 } 1167 sort.Sort(runeSlice(orb)) 1168 c := orb[len(orb)-1] 1169 for _, d := range orb { 1170 chars[c].caseOrbit = d 1171 c = d 1172 } 1173 } 1174 1175 printCaseOrbit() 1176 1177 // Tables of category and script folding exceptions: code points 1178 // that must be added when interpreting a particular category/script 1179 // in a case-folding context. 1180 cat := make(map[string]map[rune]bool) 1181 for name := range category { 1182 if x := foldExceptions(inCategory(name)); len(x) > 0 { 1183 cat[name] = x 1184 } 1185 } 1186 1187 scr := make(map[string]map[rune]bool) 1188 for name := range scripts { 1189 if x := foldExceptions(inScript(name)); len(x) > 0 { 1190 cat[name] = x 1191 } 1192 } 1193 1194 printCatFold("FoldCategory", cat) 1195 printCatFold("FoldScript", scr) 1196 } 1197 1198 // inCategory returns a list of all the runes in the category. 1199 func inCategory(name string) []rune { 1200 var x []rune 1201 for j := range chars { 1202 i := rune(j) 1203 c := &chars[i] 1204 if c.category == name || len(name) == 1 && len(c.category) > 1 && c.category[0] == name[0] { 1205 x = append(x, i) 1206 } 1207 } 1208 return x 1209 } 1210 1211 // inScript returns a list of all the runes in the script. 1212 func inScript(name string) []rune { 1213 var x []rune 1214 for _, s := range scripts[name] { 1215 for c := s.lo; c <= s.hi; c++ { 1216 x = append(x, rune(c)) 1217 } 1218 } 1219 return x 1220 } 1221 1222 // foldExceptions returns a list of all the runes fold-equivalent 1223 // to runes in class but not in class themselves. 1224 func foldExceptions(class []rune) map[rune]bool { 1225 // Create map containing class and all fold-equivalent chars. 1226 m := make(map[rune]bool) 1227 for _, r := range class { 1228 c := &chars[r] 1229 if c.caseOrbit == 0 { 1230 // Just upper and lower. 1231 if u := c.upperCase; u != 0 { 1232 m[u] = true 1233 } 1234 if l := c.lowerCase; l != 0 { 1235 m[l] = true 1236 } 1237 m[r] = true 1238 continue 1239 } 1240 // Otherwise walk orbit. 1241 r0 := r 1242 for { 1243 m[r] = true 1244 r = chars[r].caseOrbit 1245 if r == r0 { 1246 break 1247 } 1248 } 1249 } 1250 1251 // Remove class itself. 1252 for _, r := range class { 1253 delete(m, r) 1254 } 1255 1256 // What's left is the exceptions. 1257 return m 1258 } 1259 1260 var comment = map[string]string{ 1261 "FoldCategory": "// FoldCategory maps a category name to a table of\n" + 1262 "// code points outside the category that are equivalent under\n" + 1263 "// simple case folding to code points inside the category.\n" + 1264 "// If there is no entry for a category name, there are no such points.\n", 1265 1266 "FoldScript": "// FoldScript maps a script name to a table of\n" + 1267 "// code points outside the script that are equivalent under\n" + 1268 "// simple case folding to code points inside the script.\n" + 1269 "// If there is no entry for a script name, there are no such points.\n", 1270 } 1271 1272 func printCaseOrbit() { 1273 if *test { 1274 for j := range chars { 1275 i := rune(j) 1276 c := &chars[i] 1277 f := c.caseOrbit 1278 if f == 0 { 1279 if c.lowerCase != i && c.lowerCase != 0 { 1280 f = c.lowerCase 1281 } else if c.upperCase != i && c.upperCase != 0 { 1282 f = c.upperCase 1283 } else { 1284 f = i 1285 } 1286 } 1287 if g := unicode.SimpleFold(i); g != f { 1288 fmt.Fprintf(os.Stderr, "unicode.SimpleFold(%#U) = %#U, want %#U\n", i, g, f) 1289 } 1290 } 1291 return 1292 } 1293 1294 printf("var caseOrbit = []foldPair{\n") 1295 for i := range chars { 1296 c := &chars[i] 1297 if c.caseOrbit != 0 { 1298 printf("\t{0x%04X, 0x%04X},\n", i, c.caseOrbit) 1299 foldPairCount++ 1300 } 1301 } 1302 printf("}\n\n") 1303 } 1304 1305 func printCatFold(name string, m map[string]map[rune]bool) { 1306 if *test { 1307 var pkgMap map[string]*unicode.RangeTable 1308 if name == "FoldCategory" { 1309 pkgMap = unicode.FoldCategory 1310 } else { 1311 pkgMap = unicode.FoldScript 1312 } 1313 if len(pkgMap) != len(m) { 1314 fmt.Fprintf(os.Stderr, "unicode.%s has %d elements, want %d\n", name, len(pkgMap), len(m)) 1315 return 1316 } 1317 for k, v := range m { 1318 t, ok := pkgMap[k] 1319 if !ok { 1320 fmt.Fprintf(os.Stderr, "unicode.%s[%q] missing\n", name, k) 1321 continue 1322 } 1323 n := 0 1324 for _, r := range t.R16 { 1325 for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) { 1326 if !v[c] { 1327 fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c) 1328 } 1329 n++ 1330 } 1331 } 1332 for _, r := range t.R32 { 1333 for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) { 1334 if !v[c] { 1335 fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c) 1336 } 1337 n++ 1338 } 1339 } 1340 if n != len(v) { 1341 fmt.Fprintf(os.Stderr, "unicode.%s[%q] has %d code points, want %d\n", name, k, n, len(v)) 1342 } 1343 } 1344 return 1345 } 1346 1347 print(comment[name]) 1348 printf("var %s = map[string]*RangeTable{\n", name) 1349 for _, name := range allCatFold(m) { 1350 printf("\t%q: fold%s,\n", name, name) 1351 } 1352 printf("}\n\n") 1353 for _, name := range allCatFold(m) { 1354 class := m[name] 1355 dumpRange( 1356 fmt.Sprintf("var fold%s = &RangeTable{\n", name), 1357 func(code rune) bool { return class[code] }) 1358 } 1359 } 1360 1361 var range16Count = 0 // Number of entries in the 16-bit range tables. 1362 var range32Count = 0 // Number of entries in the 32-bit range tables. 1363 var foldPairCount = 0 // Number of fold pairs in the exception tables. 1364 1365 func printSizes() { 1366 if *test { 1367 return 1368 } 1369 println() 1370 printf("// Range entries: %d 16-bit, %d 32-bit, %d total.\n", range16Count, range32Count, range16Count+range32Count) 1371 range16Bytes := range16Count * 3 * 2 1372 range32Bytes := range32Count * 3 * 4 1373 printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", range16Bytes, range32Bytes, range16Bytes+range32Bytes) 1374 println() 1375 printf("// Fold orbit bytes: %d pairs, %d bytes\n", foldPairCount, foldPairCount*2*2) 1376 }