github.com/ader1990/go@v0.0.0-20140630135419-8c24447fa791/src/pkg/unicode/maketables.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // +build ignore 6 7 // Unicode table generator. 8 // Data read from the web. 9 10 package main 11 12 import ( 13 "bufio" 14 "flag" 15 "fmt" 16 "log" 17 "net/http" 18 "os" 19 "path/filepath" 20 "regexp" 21 "sort" 22 "strconv" 23 "strings" 24 "unicode" 25 ) 26 27 func main() { 28 flag.Parse() 29 loadChars() // always needed 30 loadCasefold() 31 printCategories() 32 printScriptOrProperty(false) 33 printScriptOrProperty(true) 34 printCases() 35 printLatinProperties() 36 printCasefold() 37 printSizes() 38 } 39 40 var dataURL = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt") 41 var casefoldingURL = flag.String("casefolding", "", "full URL for CaseFolding.txt; defaults to --url/CaseFolding.txt") 42 var url = flag.String("url", 43 "http://www.unicode.org/Public/6.3.0/ucd/", 44 "URL of Unicode database directory") 45 var tablelist = flag.String("tables", 46 "all", 47 "comma-separated list of which tables to generate; can be letter") 48 var scriptlist = flag.String("scripts", 49 "all", 50 "comma-separated list of which script tables to generate") 51 var proplist = flag.String("props", 52 "all", 53 "comma-separated list of which property tables to generate") 54 var cases = flag.Bool("cases", 55 true, 56 "generate case tables") 57 var test = flag.Bool("test", 58 false, 59 "test existing tables; can be used to compare web data with package data") 60 var localFiles = flag.Bool("local", 61 false, 62 "data files have been copied to current directory; for debugging only") 63 64 var scriptRe = regexp.MustCompile(`^([0-9A-F]+)(\.\.[0-9A-F]+)? *; ([A-Za-z_]+)$`) 65 var logger = log.New(os.Stderr, "", log.Lshortfile) 66 67 type reader struct { 68 *bufio.Reader 69 fd *os.File 70 resp *http.Response 71 } 72 73 func open(url string) *reader { 74 file := filepath.Base(url) 75 if *localFiles { 76 fd, err := os.Open(file) 77 if err != nil { 78 logger.Fatal(err) 79 } 80 return &reader{bufio.NewReader(fd), fd, nil} 81 } 82 resp, err := http.Get(url) 83 if err != nil { 84 logger.Fatal(err) 85 } 86 if resp.StatusCode != 200 { 87 logger.Fatalf("bad GET status for %s: %d", file, resp.Status) 88 } 89 return &reader{bufio.NewReader(resp.Body), nil, resp} 90 91 } 92 93 func (r *reader) close() { 94 if r.fd != nil { 95 r.fd.Close() 96 } else { 97 r.resp.Body.Close() 98 } 99 } 100 101 var category = map[string]bool{ 102 // Nd Lu etc. 103 // We use one-character names to identify merged categories 104 "L": true, // Lu Ll Lt Lm Lo 105 "P": true, // Pc Pd Ps Pe Pu Pf Po 106 "M": true, // Mn Mc Me 107 "N": true, // Nd Nl No 108 "S": true, // Sm Sc Sk So 109 "Z": true, // Zs Zl Zp 110 "C": true, // Cc Cf Cs Co Cn 111 } 112 113 // UnicodeData.txt has form: 114 // 0037;DIGIT SEVEN;Nd;0;EN;;7;7;7;N;;;;; 115 // 007A;LATIN SMALL LETTER Z;Ll;0;L;;;;;N;;;005A;;005A 116 // See http://www.unicode.org/reports/tr44/ for a full explanation 117 // The fields: 118 const ( 119 FCodePoint = iota 120 FName 121 FGeneralCategory 122 FCanonicalCombiningClass 123 FBidiClass 124 FDecompositionTypeAndMapping 125 FNumericType 126 FNumericDigit // If a decimal digit. 127 FNumericValue // Includes non-decimal, e.g. U+2155=1/5 128 FBidiMirrored 129 FUnicode1Name 130 FISOComment 131 FSimpleUppercaseMapping 132 FSimpleLowercaseMapping 133 FSimpleTitlecaseMapping 134 NumField 135 136 MaxChar = 0x10FFFF // anything above this shouldn't exist 137 ) 138 139 var fieldName = []string{ 140 FCodePoint: "CodePoint", 141 FName: "Name", 142 FGeneralCategory: "GeneralCategory", 143 FCanonicalCombiningClass: "CanonicalCombiningClass", 144 FBidiClass: "BidiClass", 145 FDecompositionTypeAndMapping: "DecompositionTypeAndMapping", 146 FNumericType: "NumericType", 147 FNumericDigit: "NumericDigit", 148 FNumericValue: "NumericValue", 149 FBidiMirrored: "BidiMirrored", 150 FUnicode1Name: "Unicode1Name", 151 FISOComment: "ISOComment", 152 FSimpleUppercaseMapping: "SimpleUppercaseMapping", 153 FSimpleLowercaseMapping: "SimpleLowercaseMapping", 154 FSimpleTitlecaseMapping: "SimpleTitlecaseMapping", 155 } 156 157 // This contains only the properties we're interested in. 158 type Char struct { 159 field []string // debugging only; could be deleted if we take out char.dump() 160 codePoint rune // if zero, this index is not a valid code point. 161 category string 162 upperCase rune 163 lowerCase rune 164 titleCase rune 165 foldCase rune // simple case folding 166 caseOrbit rune // next in simple case folding orbit 167 } 168 169 // Scripts.txt has form: 170 // A673 ; Cyrillic # Po SLAVONIC ASTERISK 171 // A67C..A67D ; Cyrillic # Mn [2] COMBINING CYRILLIC KAVYKA..COMBINING CYRILLIC PAYEROK 172 // See http://www.unicode.org/Public/5.1.0/ucd/UCD.html for full explanation 173 174 type Script struct { 175 lo, hi uint32 // range of code points 176 script string 177 } 178 179 var chars = make([]Char, MaxChar+1) 180 var scripts = make(map[string][]Script) 181 var props = make(map[string][]Script) // a property looks like a script; can share the format 182 183 var lastChar rune = 0 184 185 // In UnicodeData.txt, some ranges are marked like this: 186 // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; 187 // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;; 188 // parseCategory returns a state variable indicating the weirdness. 189 type State int 190 191 const ( 192 SNormal State = iota // known to be zero for the type 193 SFirst 194 SLast 195 SMissing 196 ) 197 198 func parseCategory(line string) (state State) { 199 field := strings.Split(line, ";") 200 if len(field) != NumField { 201 logger.Fatalf("%5s: %d fields (expected %d)\n", line, len(field), NumField) 202 } 203 point, err := strconv.ParseUint(field[FCodePoint], 16, 64) 204 if err != nil { 205 logger.Fatalf("%.5s...: %s", line, err) 206 } 207 lastChar = rune(point) 208 if point == 0 { 209 return // not interesting and we use 0 as unset 210 } 211 if point > MaxChar { 212 return 213 } 214 char := &chars[point] 215 char.field = field 216 if char.codePoint != 0 { 217 logger.Fatalf("point %U reused", point) 218 } 219 char.codePoint = lastChar 220 char.category = field[FGeneralCategory] 221 category[char.category] = true 222 switch char.category { 223 case "Nd": 224 // Decimal digit 225 _, err := strconv.Atoi(field[FNumericValue]) 226 if err != nil { 227 logger.Fatalf("%U: bad numeric field: %s", point, err) 228 } 229 case "Lu": 230 char.letter(field[FCodePoint], field[FSimpleLowercaseMapping], field[FSimpleTitlecaseMapping]) 231 case "Ll": 232 char.letter(field[FSimpleUppercaseMapping], field[FCodePoint], field[FSimpleTitlecaseMapping]) 233 case "Lt": 234 char.letter(field[FSimpleUppercaseMapping], field[FSimpleLowercaseMapping], field[FCodePoint]) 235 default: 236 char.letter(field[FSimpleUppercaseMapping], field[FSimpleLowercaseMapping], field[FSimpleTitlecaseMapping]) 237 } 238 switch { 239 case strings.Index(field[FName], ", First>") > 0: 240 state = SFirst 241 case strings.Index(field[FName], ", Last>") > 0: 242 state = SLast 243 } 244 return 245 } 246 247 func (char *Char) dump(s string) { 248 fmt.Print(s, " ") 249 for i := 0; i < len(char.field); i++ { 250 fmt.Printf("%s:%q ", fieldName[i], char.field[i]) 251 } 252 fmt.Print("\n") 253 } 254 255 func (char *Char) letter(u, l, t string) { 256 char.upperCase = char.letterValue(u, "U") 257 char.lowerCase = char.letterValue(l, "L") 258 char.titleCase = char.letterValue(t, "T") 259 } 260 261 func (char *Char) letterValue(s string, cas string) rune { 262 if s == "" { 263 return 0 264 } 265 v, err := strconv.ParseUint(s, 16, 64) 266 if err != nil { 267 char.dump(cas) 268 logger.Fatalf("%U: bad letter(%s): %s", char.codePoint, s, err) 269 } 270 return rune(v) 271 } 272 273 func allCategories() []string { 274 a := make([]string, 0, len(category)) 275 for k := range category { 276 a = append(a, k) 277 } 278 sort.Strings(a) 279 return a 280 } 281 282 func all(scripts map[string][]Script) []string { 283 a := make([]string, 0, len(scripts)) 284 for k := range scripts { 285 a = append(a, k) 286 } 287 sort.Strings(a) 288 return a 289 } 290 291 func allCatFold(m map[string]map[rune]bool) []string { 292 a := make([]string, 0, len(m)) 293 for k := range m { 294 a = append(a, k) 295 } 296 sort.Strings(a) 297 return a 298 } 299 300 // Extract the version number from the URL 301 func version() string { 302 // Break on slashes and look for the first numeric field 303 fields := strings.Split(*url, "/") 304 for _, f := range fields { 305 if len(f) > 0 && '0' <= f[0] && f[0] <= '9' { 306 return f 307 } 308 } 309 logger.Fatal("unknown version") 310 return "Unknown" 311 } 312 313 func categoryOp(code rune, class uint8) bool { 314 category := chars[code].category 315 return len(category) > 0 && category[0] == class 316 } 317 318 func loadChars() { 319 if *dataURL == "" { 320 flag.Set("data", *url+"UnicodeData.txt") 321 } 322 input := open(*dataURL) 323 defer input.close() 324 scanner := bufio.NewScanner(input) 325 var first rune = 0 326 for scanner.Scan() { 327 switch parseCategory(scanner.Text()) { 328 case SNormal: 329 if first != 0 { 330 logger.Fatalf("bad state normal at %U", lastChar) 331 } 332 case SFirst: 333 if first != 0 { 334 logger.Fatalf("bad state first at %U", lastChar) 335 } 336 first = lastChar 337 case SLast: 338 if first == 0 { 339 logger.Fatalf("bad state last at %U", lastChar) 340 } 341 for i := first + 1; i <= lastChar; i++ { 342 chars[i] = chars[first] 343 chars[i].codePoint = i 344 } 345 first = 0 346 } 347 } 348 if scanner.Err() != nil { 349 logger.Fatal(scanner.Err()) 350 } 351 } 352 353 func loadCasefold() { 354 if *casefoldingURL == "" { 355 flag.Set("casefolding", *url+"CaseFolding.txt") 356 } 357 input := open(*casefoldingURL) 358 defer input.close() 359 scanner := bufio.NewScanner(input) 360 for scanner.Scan() { 361 line := scanner.Text() 362 if len(line) == 0 || line[0] == '#' || len(strings.TrimSpace(line)) == 0 { 363 continue 364 } 365 field := strings.Split(line, "; ") 366 if len(field) != 4 { 367 logger.Fatalf("CaseFolding.txt %.5s...: %d fields (expected %d)\n", line, len(field), 4) 368 } 369 kind := field[1] 370 if kind != "C" && kind != "S" { 371 // Only care about 'common' and 'simple' foldings. 372 continue 373 } 374 p1, err := strconv.ParseUint(field[0], 16, 64) 375 if err != nil { 376 logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err) 377 } 378 p2, err := strconv.ParseUint(field[2], 16, 64) 379 if err != nil { 380 logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err) 381 } 382 chars[p1].foldCase = rune(p2) 383 } 384 if scanner.Err() != nil { 385 logger.Fatal(scanner.Err()) 386 } 387 } 388 389 const progHeader = `// Copyright 2013 The Go Authors. All rights reserved. 390 // Use of this source code is governed by a BSD-style 391 // license that can be found in the LICENSE file. 392 393 // Generated by running 394 // maketables --tables=%s --data=%s --casefolding=%s 395 // DO NOT EDIT 396 397 package unicode 398 399 ` 400 401 func printCategories() { 402 if *tablelist == "" { 403 return 404 } 405 // Find out which categories to dump 406 list := strings.Split(*tablelist, ",") 407 if *tablelist == "all" { 408 list = allCategories() 409 } 410 if *test { 411 fullCategoryTest(list) 412 return 413 } 414 fmt.Printf(progHeader, *tablelist, *dataURL, *casefoldingURL) 415 416 fmt.Println("// Version is the Unicode edition from which the tables are derived.") 417 fmt.Printf("const Version = %q\n\n", version()) 418 419 if *tablelist == "all" { 420 fmt.Println("// Categories is the set of Unicode category tables.") 421 fmt.Println("var Categories = map[string] *RangeTable {") 422 for _, k := range allCategories() { 423 fmt.Printf("\t%q: %s,\n", k, k) 424 } 425 fmt.Print("}\n\n") 426 } 427 428 decl := make(sort.StringSlice, len(list)) 429 ndecl := 0 430 for _, name := range list { 431 if _, ok := category[name]; !ok { 432 logger.Fatal("unknown category", name) 433 } 434 // We generate an UpperCase name to serve as concise documentation and an _UnderScored 435 // name to store the data. This stops godoc dumping all the tables but keeps them 436 // available to clients. 437 // Cases deserving special comments 438 varDecl := "" 439 switch name { 440 case "C": 441 varDecl = "\tOther = _C; // Other/C is the set of Unicode control and special characters, category C.\n" 442 varDecl += "\tC = _C\n" 443 case "L": 444 varDecl = "\tLetter = _L; // Letter/L is the set of Unicode letters, category L.\n" 445 varDecl += "\tL = _L\n" 446 case "M": 447 varDecl = "\tMark = _M; // Mark/M is the set of Unicode mark characters, category M.\n" 448 varDecl += "\tM = _M\n" 449 case "N": 450 varDecl = "\tNumber = _N; // Number/N is the set of Unicode number characters, category N.\n" 451 varDecl += "\tN = _N\n" 452 case "P": 453 varDecl = "\tPunct = _P; // Punct/P is the set of Unicode punctuation characters, category P.\n" 454 varDecl += "\tP = _P\n" 455 case "S": 456 varDecl = "\tSymbol = _S; // Symbol/S is the set of Unicode symbol characters, category S.\n" 457 varDecl += "\tS = _S\n" 458 case "Z": 459 varDecl = "\tSpace = _Z; // Space/Z is the set of Unicode space characters, category Z.\n" 460 varDecl += "\tZ = _Z\n" 461 case "Nd": 462 varDecl = "\tDigit = _Nd; // Digit is the set of Unicode characters with the \"decimal digit\" property.\n" 463 case "Lu": 464 varDecl = "\tUpper = _Lu; // Upper is the set of Unicode upper case letters.\n" 465 case "Ll": 466 varDecl = "\tLower = _Ll; // Lower is the set of Unicode lower case letters.\n" 467 case "Lt": 468 varDecl = "\tTitle = _Lt; // Title is the set of Unicode title case letters.\n" 469 } 470 if len(name) > 1 { 471 varDecl += fmt.Sprintf( 472 "\t%s = _%s; // %s is the set of Unicode characters in category %s.\n", 473 name, name, name, name) 474 } 475 decl[ndecl] = varDecl 476 ndecl++ 477 if len(name) == 1 { // unified categories 478 decl := fmt.Sprintf("var _%s = &RangeTable{\n", name) 479 dumpRange( 480 decl, 481 func(code rune) bool { return categoryOp(code, name[0]) }) 482 continue 483 } 484 dumpRange( 485 fmt.Sprintf("var _%s = &RangeTable{\n", name), 486 func(code rune) bool { return chars[code].category == name }) 487 } 488 decl.Sort() 489 fmt.Println("// These variables have type *RangeTable.") 490 fmt.Println("var (") 491 for _, d := range decl { 492 fmt.Print(d) 493 } 494 fmt.Print(")\n\n") 495 } 496 497 type Op func(code rune) bool 498 499 const format = "\t\t{0x%04x, 0x%04x, %d},\n" 500 501 func dumpRange(header string, inCategory Op) { 502 fmt.Print(header) 503 next := rune(0) 504 latinOffset := 0 505 fmt.Print("\tR16: []Range16{\n") 506 // one Range for each iteration 507 count := &range16Count 508 size := 16 509 for { 510 // look for start of range 511 for next < rune(len(chars)) && !inCategory(next) { 512 next++ 513 } 514 if next >= rune(len(chars)) { 515 // no characters remain 516 break 517 } 518 519 // start of range 520 lo := next 521 hi := next 522 stride := rune(1) 523 // accept lo 524 next++ 525 // look for another character to set the stride 526 for next < rune(len(chars)) && !inCategory(next) { 527 next++ 528 } 529 if next >= rune(len(chars)) { 530 // no more characters 531 fmt.Printf(format, lo, hi, stride) 532 break 533 } 534 // set stride 535 stride = next - lo 536 // check for length of run. next points to first jump in stride 537 for i := next; i < rune(len(chars)); i++ { 538 if inCategory(i) == (((i - lo) % stride) == 0) { 539 // accept 540 if inCategory(i) { 541 hi = i 542 } 543 } else { 544 // no more characters in this run 545 break 546 } 547 } 548 if uint32(hi) <= unicode.MaxLatin1 { 549 latinOffset++ 550 } 551 size, count = printRange(uint32(lo), uint32(hi), uint32(stride), size, count) 552 // next range: start looking where this range ends 553 next = hi + 1 554 } 555 fmt.Print("\t},\n") 556 if latinOffset > 0 { 557 fmt.Printf("\tLatinOffset: %d,\n", latinOffset) 558 } 559 fmt.Print("}\n\n") 560 } 561 562 func printRange(lo, hi, stride uint32, size int, count *int) (int, *int) { 563 if size == 16 && hi >= 1<<16 { 564 if lo < 1<<16 { 565 if lo+stride != hi { 566 logger.Fatalf("unexpected straddle: %U %U %d", lo, hi, stride) 567 } 568 // No range contains U+FFFF as an instance, so split 569 // the range into two entries. That way we can maintain 570 // the invariant that R32 contains only >= 1<<16. 571 fmt.Printf(format, lo, lo, 1) 572 lo = hi 573 stride = 1 574 *count++ 575 } 576 fmt.Print("\t},\n") 577 fmt.Print("\tR32: []Range32{\n") 578 size = 32 579 count = &range32Count 580 } 581 fmt.Printf(format, lo, hi, stride) 582 *count++ 583 return size, count 584 } 585 586 func fullCategoryTest(list []string) { 587 for _, name := range list { 588 if _, ok := category[name]; !ok { 589 logger.Fatal("unknown category", name) 590 } 591 r, ok := unicode.Categories[name] 592 if !ok && len(name) > 1 { 593 logger.Fatalf("unknown table %q", name) 594 } 595 if len(name) == 1 { 596 verifyRange(name, func(code rune) bool { return categoryOp(code, name[0]) }, r) 597 } else { 598 verifyRange( 599 name, 600 func(code rune) bool { return chars[code].category == name }, 601 r) 602 } 603 } 604 } 605 606 func verifyRange(name string, inCategory Op, table *unicode.RangeTable) { 607 count := 0 608 for j := range chars { 609 i := rune(j) 610 web := inCategory(i) 611 pkg := unicode.Is(table, i) 612 if web != pkg { 613 fmt.Fprintf(os.Stderr, "%s: %U: web=%t pkg=%t\n", name, i, web, pkg) 614 count++ 615 if count > 10 { 616 break 617 } 618 } 619 } 620 } 621 622 func parseScript(line string, scripts map[string][]Script) { 623 comment := strings.Index(line, "#") 624 if comment >= 0 { 625 line = line[0:comment] 626 } 627 line = strings.TrimSpace(line) 628 if len(line) == 0 { 629 return 630 } 631 field := strings.Split(line, ";") 632 if len(field) != 2 { 633 logger.Fatalf("%s: %d fields (expected 2)\n", line, len(field)) 634 } 635 matches := scriptRe.FindStringSubmatch(line) 636 if len(matches) != 4 { 637 logger.Fatalf("%s: %d matches (expected 3)\n", line, len(matches)) 638 } 639 lo, err := strconv.ParseUint(matches[1], 16, 64) 640 if err != nil { 641 logger.Fatalf("%.5s...: %s", line, err) 642 } 643 hi := lo 644 if len(matches[2]) > 2 { // ignore leading .. 645 hi, err = strconv.ParseUint(matches[2][2:], 16, 64) 646 if err != nil { 647 logger.Fatalf("%.5s...: %s", line, err) 648 } 649 } 650 name := matches[3] 651 scripts[name] = append(scripts[name], Script{uint32(lo), uint32(hi), name}) 652 } 653 654 // The script tables have a lot of adjacent elements. Fold them together. 655 func foldAdjacent(r []Script) []unicode.Range32 { 656 s := make([]unicode.Range32, 0, len(r)) 657 j := 0 658 for i := 0; i < len(r); i++ { 659 if j > 0 && r[i].lo == s[j-1].Hi+1 { 660 s[j-1].Hi = r[i].hi 661 } else { 662 s = s[0 : j+1] 663 s[j] = unicode.Range32{ 664 Lo: uint32(r[i].lo), 665 Hi: uint32(r[i].hi), 666 Stride: 1, 667 } 668 j++ 669 } 670 } 671 return s 672 } 673 674 func fullScriptTest(list []string, installed map[string]*unicode.RangeTable, scripts map[string][]Script) { 675 for _, name := range list { 676 if _, ok := scripts[name]; !ok { 677 logger.Fatal("unknown script", name) 678 } 679 _, ok := installed[name] 680 if !ok { 681 logger.Fatal("unknown table", name) 682 } 683 for _, script := range scripts[name] { 684 for r := script.lo; r <= script.hi; r++ { 685 if !unicode.Is(installed[name], rune(r)) { 686 fmt.Fprintf(os.Stderr, "%U: not in script %s\n", r, name) 687 } 688 } 689 } 690 } 691 } 692 693 // PropList.txt has the same format as Scripts.txt so we can share its parser. 694 func printScriptOrProperty(doProps bool) { 695 flag := "scripts" 696 flaglist := *scriptlist 697 file := "Scripts.txt" 698 table := scripts 699 installed := unicode.Scripts 700 if doProps { 701 flag = "props" 702 flaglist = *proplist 703 file = "PropList.txt" 704 table = props 705 installed = unicode.Properties 706 } 707 if flaglist == "" { 708 return 709 } 710 input := open(*url + file) 711 scanner := bufio.NewScanner(input) 712 for scanner.Scan() { 713 parseScript(scanner.Text(), table) 714 } 715 if scanner.Err() != nil { 716 logger.Fatal(scanner.Err()) 717 } 718 input.close() 719 720 // Find out which scripts to dump 721 list := strings.Split(flaglist, ",") 722 if flaglist == "all" { 723 list = all(table) 724 } 725 if *test { 726 fullScriptTest(list, installed, table) 727 return 728 } 729 730 fmt.Printf( 731 "// Generated by running\n"+ 732 "// maketables --%s=%s --url=%s\n"+ 733 "// DO NOT EDIT\n\n", 734 flag, 735 flaglist, 736 *url) 737 if flaglist == "all" { 738 if doProps { 739 fmt.Println("// Properties is the set of Unicode property tables.") 740 fmt.Println("var Properties = map[string] *RangeTable{") 741 } else { 742 fmt.Println("// Scripts is the set of Unicode script tables.") 743 fmt.Println("var Scripts = map[string] *RangeTable{") 744 } 745 for _, k := range all(table) { 746 fmt.Printf("\t%q: %s,\n", k, k) 747 } 748 fmt.Print("}\n\n") 749 } 750 751 decl := make(sort.StringSlice, len(list)) 752 ndecl := 0 753 for _, name := range list { 754 if doProps { 755 decl[ndecl] = fmt.Sprintf( 756 "\t%s = _%s;\t// %s is the set of Unicode characters with property %s.\n", 757 name, name, name, name) 758 } else { 759 decl[ndecl] = fmt.Sprintf( 760 "\t%s = _%s;\t// %s is the set of Unicode characters in script %s.\n", 761 name, name, name, name) 762 } 763 ndecl++ 764 fmt.Printf("var _%s = &RangeTable {\n", name) 765 ranges := foldAdjacent(table[name]) 766 fmt.Print("\tR16: []Range16{\n") 767 size := 16 768 count := &range16Count 769 for _, s := range ranges { 770 size, count = printRange(s.Lo, s.Hi, s.Stride, size, count) 771 } 772 fmt.Print("\t},\n") 773 if off := findLatinOffset(ranges); off > 0 { 774 fmt.Printf("\tLatinOffset: %d,\n", off) 775 } 776 fmt.Print("}\n\n") 777 } 778 decl.Sort() 779 fmt.Println("// These variables have type *RangeTable.") 780 fmt.Println("var (") 781 for _, d := range decl { 782 fmt.Print(d) 783 } 784 fmt.Print(")\n\n") 785 } 786 787 func findLatinOffset(ranges []unicode.Range32) int { 788 i := 0 789 for i < len(ranges) && ranges[i].Hi <= unicode.MaxLatin1 { 790 i++ 791 } 792 return i 793 } 794 795 const ( 796 CaseUpper = 1 << iota 797 CaseLower 798 CaseTitle 799 CaseNone = 0 // must be zero 800 CaseMissing = -1 // character not present; not a valid case state 801 ) 802 803 type caseState struct { 804 point rune 805 _case int 806 deltaToUpper rune 807 deltaToLower rune 808 deltaToTitle rune 809 } 810 811 // Is d a continuation of the state of c? 812 func (c *caseState) adjacent(d *caseState) bool { 813 if d.point < c.point { 814 c, d = d, c 815 } 816 switch { 817 case d.point != c.point+1: // code points not adjacent (shouldn't happen) 818 return false 819 case d._case != c._case: // different cases 820 return c.upperLowerAdjacent(d) 821 case c._case == CaseNone: 822 return false 823 case c._case == CaseMissing: 824 return false 825 case d.deltaToUpper != c.deltaToUpper: 826 return false 827 case d.deltaToLower != c.deltaToLower: 828 return false 829 case d.deltaToTitle != c.deltaToTitle: 830 return false 831 } 832 return true 833 } 834 835 // Is d the same as c, but opposite in upper/lower case? this would make it 836 // an element of an UpperLower sequence. 837 func (c *caseState) upperLowerAdjacent(d *caseState) bool { 838 // check they're a matched case pair. we know they have adjacent values 839 switch { 840 case c._case == CaseUpper && d._case != CaseLower: 841 return false 842 case c._case == CaseLower && d._case != CaseUpper: 843 return false 844 } 845 // matched pair (at least in upper/lower). make the order Upper Lower 846 if c._case == CaseLower { 847 c, d = d, c 848 } 849 // for an Upper Lower sequence the deltas have to be in order 850 // c: 0 1 0 851 // d: -1 0 -1 852 switch { 853 case c.deltaToUpper != 0: 854 return false 855 case c.deltaToLower != 1: 856 return false 857 case c.deltaToTitle != 0: 858 return false 859 case d.deltaToUpper != -1: 860 return false 861 case d.deltaToLower != 0: 862 return false 863 case d.deltaToTitle != -1: 864 return false 865 } 866 return true 867 } 868 869 // Does this character start an UpperLower sequence? 870 func (c *caseState) isUpperLower() bool { 871 // for an Upper Lower sequence the deltas have to be in order 872 // c: 0 1 0 873 switch { 874 case c.deltaToUpper != 0: 875 return false 876 case c.deltaToLower != 1: 877 return false 878 case c.deltaToTitle != 0: 879 return false 880 } 881 return true 882 } 883 884 // Does this character start a LowerUpper sequence? 885 func (c *caseState) isLowerUpper() bool { 886 // for an Upper Lower sequence the deltas have to be in order 887 // c: -1 0 -1 888 switch { 889 case c.deltaToUpper != -1: 890 return false 891 case c.deltaToLower != 0: 892 return false 893 case c.deltaToTitle != -1: 894 return false 895 } 896 return true 897 } 898 899 func getCaseState(i rune) (c *caseState) { 900 c = &caseState{point: i, _case: CaseNone} 901 ch := &chars[i] 902 switch ch.codePoint { 903 case 0: 904 c._case = CaseMissing // Will get NUL wrong but that doesn't matter 905 return 906 case ch.upperCase: 907 c._case = CaseUpper 908 case ch.lowerCase: 909 c._case = CaseLower 910 case ch.titleCase: 911 c._case = CaseTitle 912 } 913 // Some things such as roman numeral U+2161 don't describe themselves 914 // as upper case, but have a lower case. Second-guess them. 915 if c._case == CaseNone && ch.lowerCase != 0 { 916 c._case = CaseUpper 917 } 918 // Same in the other direction. 919 if c._case == CaseNone && ch.upperCase != 0 { 920 c._case = CaseLower 921 } 922 923 if ch.upperCase != 0 { 924 c.deltaToUpper = ch.upperCase - i 925 } 926 if ch.lowerCase != 0 { 927 c.deltaToLower = ch.lowerCase - i 928 } 929 if ch.titleCase != 0 { 930 c.deltaToTitle = ch.titleCase - i 931 } 932 return 933 } 934 935 func printCases() { 936 if !*cases { 937 return 938 } 939 if *test { 940 fullCaseTest() 941 return 942 } 943 fmt.Printf( 944 "// Generated by running\n"+ 945 "// maketables --data=%s --casefolding=%s\n"+ 946 "// DO NOT EDIT\n\n"+ 947 "// CaseRanges is the table describing case mappings for all letters with\n"+ 948 "// non-self mappings.\n"+ 949 "var CaseRanges = _CaseRanges\n"+ 950 "var _CaseRanges = []CaseRange {\n", 951 *dataURL, *casefoldingURL) 952 953 var startState *caseState // the start of a run; nil for not active 954 var prevState = &caseState{} // the state of the previous character 955 for i := range chars { 956 state := getCaseState(rune(i)) 957 if state.adjacent(prevState) { 958 prevState = state 959 continue 960 } 961 // end of run (possibly) 962 printCaseRange(startState, prevState) 963 startState = nil 964 if state._case != CaseMissing && state._case != CaseNone { 965 startState = state 966 } 967 prevState = state 968 } 969 fmt.Print("}\n") 970 } 971 972 func printCaseRange(lo, hi *caseState) { 973 if lo == nil { 974 return 975 } 976 if lo.deltaToUpper == 0 && lo.deltaToLower == 0 && lo.deltaToTitle == 0 { 977 // character represents itself in all cases - no need to mention it 978 return 979 } 980 switch { 981 case hi.point > lo.point && lo.isUpperLower(): 982 fmt.Printf("\t{0x%04X, 0x%04X, d{UpperLower, UpperLower, UpperLower}},\n", 983 lo.point, hi.point) 984 case hi.point > lo.point && lo.isLowerUpper(): 985 logger.Fatalf("LowerUpper sequence: should not happen: %U. If it's real, need to fix To()", lo.point) 986 fmt.Printf("\t{0x%04X, 0x%04X, d{LowerUpper, LowerUpper, LowerUpper}},\n", 987 lo.point, hi.point) 988 default: 989 fmt.Printf("\t{0x%04X, 0x%04X, d{%d, %d, %d}},\n", 990 lo.point, hi.point, 991 lo.deltaToUpper, lo.deltaToLower, lo.deltaToTitle) 992 } 993 } 994 995 // If the cased value in the Char is 0, it means use the rune itself. 996 func caseIt(r, cased rune) rune { 997 if cased == 0 { 998 return r 999 } 1000 return cased 1001 } 1002 1003 func fullCaseTest() { 1004 for j, c := range chars { 1005 i := rune(j) 1006 lower := unicode.ToLower(i) 1007 want := caseIt(i, c.lowerCase) 1008 if lower != want { 1009 fmt.Fprintf(os.Stderr, "lower %U should be %U is %U\n", i, want, lower) 1010 } 1011 upper := unicode.ToUpper(i) 1012 want = caseIt(i, c.upperCase) 1013 if upper != want { 1014 fmt.Fprintf(os.Stderr, "upper %U should be %U is %U\n", i, want, upper) 1015 } 1016 title := unicode.ToTitle(i) 1017 want = caseIt(i, c.titleCase) 1018 if title != want { 1019 fmt.Fprintf(os.Stderr, "title %U should be %U is %U\n", i, want, title) 1020 } 1021 } 1022 } 1023 1024 func printLatinProperties() { 1025 if *test { 1026 return 1027 } 1028 fmt.Println("var properties = [MaxLatin1+1]uint8{") 1029 for code := 0; code <= unicode.MaxLatin1; code++ { 1030 var property string 1031 switch chars[code].category { 1032 case "Cc", "": // NUL has no category. 1033 property = "pC" 1034 case "Cf": // soft hyphen, unique category, not printable. 1035 property = "0" 1036 case "Ll": 1037 property = "pLl | pp" 1038 case "Lo": 1039 property = "pLo | pp" 1040 case "Lu": 1041 property = "pLu | pp" 1042 case "Nd", "No": 1043 property = "pN | pp" 1044 case "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps": 1045 property = "pP | pp" 1046 case "Sc", "Sk", "Sm", "So": 1047 property = "pS | pp" 1048 case "Zs": 1049 property = "pZ" 1050 default: 1051 logger.Fatalf("%U has unknown category %q", code, chars[code].category) 1052 } 1053 // Special case 1054 if code == ' ' { 1055 property = "pZ | pp" 1056 } 1057 fmt.Printf("\t0x%02X: %s, // %q\n", code, property, code) 1058 } 1059 fmt.Printf("}\n\n") 1060 } 1061 1062 type runeSlice []rune 1063 1064 func (p runeSlice) Len() int { return len(p) } 1065 func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] } 1066 func (p runeSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] } 1067 1068 func printCasefold() { 1069 // Build list of case-folding groups attached to each canonical folded char (typically lower case). 1070 var caseOrbit = make([][]rune, MaxChar+1) 1071 for j := range chars { 1072 i := rune(j) 1073 c := &chars[i] 1074 if c.foldCase == 0 { 1075 continue 1076 } 1077 orb := caseOrbit[c.foldCase] 1078 if orb == nil { 1079 orb = append(orb, c.foldCase) 1080 } 1081 caseOrbit[c.foldCase] = append(orb, i) 1082 } 1083 1084 // Insert explicit 1-element groups when assuming [lower, upper] would be wrong. 1085 for j := range chars { 1086 i := rune(j) 1087 c := &chars[i] 1088 f := c.foldCase 1089 if f == 0 { 1090 f = i 1091 } 1092 orb := caseOrbit[f] 1093 if orb == nil && (c.upperCase != 0 && c.upperCase != i || c.lowerCase != 0 && c.lowerCase != i) { 1094 // Default assumption of [upper, lower] is wrong. 1095 caseOrbit[i] = []rune{i} 1096 } 1097 } 1098 1099 // Delete the groups for which assuming [lower, upper] is right. 1100 for i, orb := range caseOrbit { 1101 if len(orb) == 2 && chars[orb[0]].upperCase == orb[1] && chars[orb[1]].lowerCase == orb[0] { 1102 caseOrbit[i] = nil 1103 } 1104 } 1105 1106 // Record orbit information in chars. 1107 for _, orb := range caseOrbit { 1108 if orb == nil { 1109 continue 1110 } 1111 sort.Sort(runeSlice(orb)) 1112 c := orb[len(orb)-1] 1113 for _, d := range orb { 1114 chars[c].caseOrbit = d 1115 c = d 1116 } 1117 } 1118 1119 printCaseOrbit() 1120 1121 // Tables of category and script folding exceptions: code points 1122 // that must be added when interpreting a particular category/script 1123 // in a case-folding context. 1124 cat := make(map[string]map[rune]bool) 1125 for name := range category { 1126 if x := foldExceptions(inCategory(name)); len(x) > 0 { 1127 cat[name] = x 1128 } 1129 } 1130 1131 scr := make(map[string]map[rune]bool) 1132 for name := range scripts { 1133 if x := foldExceptions(inScript(name)); len(x) > 0 { 1134 cat[name] = x 1135 } 1136 } 1137 1138 printCatFold("FoldCategory", cat) 1139 printCatFold("FoldScript", scr) 1140 } 1141 1142 // inCategory returns a list of all the runes in the category. 1143 func inCategory(name string) []rune { 1144 var x []rune 1145 for j := range chars { 1146 i := rune(j) 1147 c := &chars[i] 1148 if c.category == name || len(name) == 1 && len(c.category) > 1 && c.category[0] == name[0] { 1149 x = append(x, i) 1150 } 1151 } 1152 return x 1153 } 1154 1155 // inScript returns a list of all the runes in the script. 1156 func inScript(name string) []rune { 1157 var x []rune 1158 for _, s := range scripts[name] { 1159 for c := s.lo; c <= s.hi; c++ { 1160 x = append(x, rune(c)) 1161 } 1162 } 1163 return x 1164 } 1165 1166 // foldExceptions returns a list of all the runes fold-equivalent 1167 // to runes in class but not in class themselves. 1168 func foldExceptions(class []rune) map[rune]bool { 1169 // Create map containing class and all fold-equivalent chars. 1170 m := make(map[rune]bool) 1171 for _, r := range class { 1172 c := &chars[r] 1173 if c.caseOrbit == 0 { 1174 // Just upper and lower. 1175 if u := c.upperCase; u != 0 { 1176 m[u] = true 1177 } 1178 if l := c.lowerCase; l != 0 { 1179 m[l] = true 1180 } 1181 m[r] = true 1182 continue 1183 } 1184 // Otherwise walk orbit. 1185 r0 := r 1186 for { 1187 m[r] = true 1188 r = chars[r].caseOrbit 1189 if r == r0 { 1190 break 1191 } 1192 } 1193 } 1194 1195 // Remove class itself. 1196 for _, r := range class { 1197 delete(m, r) 1198 } 1199 1200 // What's left is the exceptions. 1201 return m 1202 } 1203 1204 var comment = map[string]string{ 1205 "FoldCategory": "// FoldCategory maps a category name to a table of\n" + 1206 "// code points outside the category that are equivalent under\n" + 1207 "// simple case folding to code points inside the category.\n" + 1208 "// If there is no entry for a category name, there are no such points.\n", 1209 1210 "FoldScript": "// FoldScript maps a script name to a table of\n" + 1211 "// code points outside the script that are equivalent under\n" + 1212 "// simple case folding to code points inside the script.\n" + 1213 "// If there is no entry for a script name, there are no such points.\n", 1214 } 1215 1216 func printCaseOrbit() { 1217 if *test { 1218 for j := range chars { 1219 i := rune(j) 1220 c := &chars[i] 1221 f := c.caseOrbit 1222 if f == 0 { 1223 if c.lowerCase != i && c.lowerCase != 0 { 1224 f = c.lowerCase 1225 } else if c.upperCase != i && c.upperCase != 0 { 1226 f = c.upperCase 1227 } else { 1228 f = i 1229 } 1230 } 1231 if g := unicode.SimpleFold(i); g != f { 1232 fmt.Fprintf(os.Stderr, "unicode.SimpleFold(%#U) = %#U, want %#U\n", i, g, f) 1233 } 1234 } 1235 return 1236 } 1237 1238 fmt.Printf("var caseOrbit = []foldPair{\n") 1239 for i := range chars { 1240 c := &chars[i] 1241 if c.caseOrbit != 0 { 1242 fmt.Printf("\t{0x%04X, 0x%04X},\n", i, c.caseOrbit) 1243 foldPairCount++ 1244 } 1245 } 1246 fmt.Printf("}\n\n") 1247 } 1248 1249 func printCatFold(name string, m map[string]map[rune]bool) { 1250 if *test { 1251 var pkgMap map[string]*unicode.RangeTable 1252 if name == "FoldCategory" { 1253 pkgMap = unicode.FoldCategory 1254 } else { 1255 pkgMap = unicode.FoldScript 1256 } 1257 if len(pkgMap) != len(m) { 1258 fmt.Fprintf(os.Stderr, "unicode.%s has %d elements, want %d\n", name, len(pkgMap), len(m)) 1259 return 1260 } 1261 for k, v := range m { 1262 t, ok := pkgMap[k] 1263 if !ok { 1264 fmt.Fprintf(os.Stderr, "unicode.%s[%q] missing\n", name, k) 1265 continue 1266 } 1267 n := 0 1268 for _, r := range t.R16 { 1269 for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) { 1270 if !v[c] { 1271 fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c) 1272 } 1273 n++ 1274 } 1275 } 1276 for _, r := range t.R32 { 1277 for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) { 1278 if !v[c] { 1279 fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c) 1280 } 1281 n++ 1282 } 1283 } 1284 if n != len(v) { 1285 fmt.Fprintf(os.Stderr, "unicode.%s[%q] has %d code points, want %d\n", name, k, n, len(v)) 1286 } 1287 } 1288 return 1289 } 1290 1291 fmt.Print(comment[name]) 1292 fmt.Printf("var %s = map[string]*RangeTable{\n", name) 1293 for _, name := range allCatFold(m) { 1294 fmt.Printf("\t%q: fold%s,\n", name, name) 1295 } 1296 fmt.Printf("}\n\n") 1297 for _, name := range allCatFold(m) { 1298 class := m[name] 1299 dumpRange( 1300 fmt.Sprintf("var fold%s = &RangeTable{\n", name), 1301 func(code rune) bool { return class[code] }) 1302 } 1303 } 1304 1305 var range16Count = 0 // Number of entries in the 16-bit range tables. 1306 var range32Count = 0 // Number of entries in the 32-bit range tables. 1307 var foldPairCount = 0 // Number of fold pairs in the exception tables. 1308 1309 func printSizes() { 1310 if *test { 1311 return 1312 } 1313 fmt.Println() 1314 fmt.Printf("// Range entries: %d 16-bit, %d 32-bit, %d total.\n", range16Count, range32Count, range16Count+range32Count) 1315 range16Bytes := range16Count * 3 * 2 1316 range32Bytes := range32Count * 3 * 4 1317 fmt.Printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", range16Bytes, range32Bytes, range16Bytes+range32Bytes) 1318 fmt.Println() 1319 fmt.Printf("// Fold orbit bytes: %d pairs, %d bytes\n", foldPairCount, foldPairCount*2*2) 1320 }