github.com/tcnksm/go@v0.0.0-20141208075154-439b32936367/src/lib9/utf/mkrunetype.c (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // +build ignore 6 7 /* 8 * make is(upper|lower|title|space|alpha)rune and 9 * to(upper|lower|title)rune from a UnicodeData.txt file. 10 * these can be found at unicode.org 11 * 12 * with -c, runs a check of the existing runetype functions vs. 13 * those extracted from UnicodeData. 14 * 15 * with -p, generates tables for pairs of chars, as well as for ranges 16 * and singletons. 17 * 18 * UnicodeData defines 4 fields of interest: 19 * 1) a category 20 * 2) an upper case mapping 21 * 3) a lower case mapping 22 * 4) a title case mapping 23 * 24 * toupper, tolower, and totitle are defined directly from the mapping. 25 * 26 * isalpharune(c) is true iff c is a "letter" category 27 * isupperrune(c) is true iff c is the target of toupperrune, 28 * or is in the uppercase letter category 29 * similarly for islowerrune and istitlerune. 30 * isspacerune is true for space category chars, "C" locale white space chars, 31 * and two additions: 32 * 0085 "next line" control char 33 * feff "zero-width non-break space" 34 * isdigitrune is true iff c is a numeric-digit category. 
35 */ 36 37 #include <u.h> 38 #include <libc.h> 39 #include <stdio.h> 40 #include "utf.h" 41 #include "utfdef.h" 42 43 enum { 44 /* 45 * fields in the unicode data file 46 */ 47 FIELD_CODE, 48 FIELD_NAME, 49 FIELD_CATEGORY, 50 FIELD_COMBINING, 51 FIELD_BIDIR, 52 FIELD_DECOMP, 53 FIELD_DECIMAL_DIG, 54 FIELD_DIG, 55 FIELD_NUMERIC_VAL, 56 FIELD_MIRRORED, 57 FIELD_UNICODE_1_NAME, 58 FIELD_COMMENT, 59 FIELD_UPPER, 60 FIELD_LOWER, 61 FIELD_TITLE, 62 NFIELDS, 63 64 MAX_LINE = 1024, 65 66 TO_OFFSET = 1 << 20, 67 68 NRUNES = 1 << 21, 69 }; 70 71 #define TO_DELTA(xmapped,x) (TO_OFFSET + (xmapped) - (x)) 72 73 static char myisspace[NRUNES]; 74 static char myisalpha[NRUNES]; 75 static char myisdigit[NRUNES]; 76 static char myisupper[NRUNES]; 77 static char myislower[NRUNES]; 78 static char myistitle[NRUNES]; 79 80 static int mytoupper[NRUNES]; 81 static int mytolower[NRUNES]; 82 static int mytotitle[NRUNES]; 83 84 static void check(void); 85 static void mktables(char *src, int usepairs); 86 static void fatal(const char *fmt, ...); 87 static int mygetfields(char **fields, int nfields, char *str, const char *delim); 88 static int getunicodeline(FILE *in, char **fields, char *buf); 89 static int getcode(char *s); 90 91 static void 92 usage(void) 93 { 94 fprintf(stderr, "usage: mktables [-cp] <UnicodeData.txt>\n"); 95 exit(1); 96 } 97 98 void 99 main(int argc, char *argv[]) 100 { 101 FILE *in; 102 char buf[MAX_LINE], buf2[MAX_LINE]; 103 char *fields[NFIELDS + 1], *fields2[NFIELDS + 1]; 104 char *p; 105 int i, code, last, docheck, usepairs; 106 107 docheck = 0; 108 usepairs = 0; 109 ARGBEGIN{ 110 case 'c': 111 docheck = 1; 112 break; 113 case 'p': 114 usepairs = 1; 115 break; 116 default: 117 usage(); 118 }ARGEND 119 120 if(argc != 1){ 121 usage(); 122 } 123 124 in = fopen(argv[0], "r"); 125 if(in == NULL){ 126 fatal("can't open %s", argv[0]); 127 } 128 129 for(i = 0; i < NRUNES; i++){ 130 mytoupper[i] = i; 131 mytolower[i] = i; 132 mytotitle[i] = i; 133 } 134 135 /* 136 * make 
sure isspace has all of the "C" locale whitespace chars 137 */ 138 myisspace['\t'] = 1; 139 myisspace['\n'] = 1; 140 myisspace['\r'] = 1; 141 myisspace['\f'] = 1; 142 myisspace['\v'] = 1; 143 144 /* 145 * a couple of other exceptions 146 */ 147 myisspace[0x85] = 1; /* control char, "next line" */ 148 myisspace[0xfeff] = 1; /* zero-width non-break space */ 149 150 last = -1; 151 while(getunicodeline(in, fields, buf)){ 152 code = getcode(fields[FIELD_CODE]); 153 if (code >= NRUNES) 154 fatal("code-point value too big: %x", code); 155 if(code <= last) 156 fatal("bad code sequence: %x then %x", last, code); 157 last = code; 158 159 /* 160 * check for ranges 161 */ 162 p = fields[FIELD_CATEGORY]; 163 if(strstr(fields[FIELD_NAME], ", First>") != NULL){ 164 if(!getunicodeline(in, fields2, buf2)) 165 fatal("range start at eof"); 166 if (strstr(fields2[FIELD_NAME], ", Last>") == NULL) 167 fatal("range start not followed by range end"); 168 last = getcode(fields2[FIELD_CODE]); 169 if(last <= code) 170 fatal("range out of sequence: %x then %x", code, last); 171 if(strcmp(p, fields2[FIELD_CATEGORY]) != 0) 172 fatal("range with mismatched category"); 173 } 174 175 /* 176 * set properties and conversions 177 */ 178 for (; code <= last; code++){ 179 if(p[0] == 'L') 180 myisalpha[code] = 1; 181 if(p[0] == 'Z') 182 myisspace[code] = 1; 183 184 if(strcmp(p, "Lu") == 0) 185 myisupper[code] = 1; 186 if(strcmp(p, "Ll") == 0) 187 myislower[code] = 1; 188 189 if(strcmp(p, "Lt") == 0) 190 myistitle[code] = 1; 191 192 if(strcmp(p, "Nd") == 0) 193 myisdigit[code] = 1; 194 195 /* 196 * when finding conversions, also need to mark 197 * upper/lower case, since some chars, like 198 * "III" (0x2162), aren't defined as letters but have a 199 * lower case mapping ("iii" (0x2172)). 
200 */ 201 if(fields[FIELD_UPPER][0] != '\0'){ 202 mytoupper[code] = getcode(fields[FIELD_UPPER]); 203 } 204 if(fields[FIELD_LOWER][0] != '\0'){ 205 mytolower[code] = getcode(fields[FIELD_LOWER]); 206 } 207 if(fields[FIELD_TITLE][0] != '\0'){ 208 mytotitle[code] = getcode(fields[FIELD_TITLE]); 209 } 210 } 211 } 212 213 fclose(in); 214 215 /* 216 * check for codes with no totitle mapping but a toupper mapping. 217 * these appear in UnicodeData-2.0.14.txt, but are almost certainly 218 * erroneous. 219 */ 220 for(i = 0; i < NRUNES; i++){ 221 if(mytotitle[i] == i 222 && mytoupper[i] != i 223 && !myistitle[i]) 224 fprintf(stderr, "warning: code=%.4x not istitle, totitle is same, toupper=%.4x\n", i, mytoupper[i]); 225 } 226 227 /* 228 * make sure isupper[c] is true if for some x toupper[x] == c 229 * ditto for islower and istitle 230 */ 231 for(i = 0; i < NRUNES; i++) { 232 if(mytoupper[i] != i) 233 myisupper[mytoupper[i]] = 1; 234 if(mytolower[i] != i) 235 myislower[mytolower[i]] = 1; 236 if(mytotitle[i] != i) 237 myistitle[mytotitle[i]] = 1; 238 } 239 240 if(docheck){ 241 check(); 242 }else{ 243 mktables(argv[0], usepairs); 244 } 245 exit(0); 246 } 247 248 /* 249 * generate a properties array for ranges, clearing those cases covered. 250 * if force, generate one-entry ranges for singletons. 
251 */ 252 static int 253 mkisrange(const char* label, char* prop, int force) 254 { 255 int start, stop, some; 256 257 /* 258 * first, the ranges 259 */ 260 some = 0; 261 for(start = 0; start < NRUNES; ) { 262 if(!prop[start]){ 263 start++; 264 continue; 265 } 266 267 for(stop = start + 1; stop < NRUNES; stop++){ 268 if(!prop[stop]){ 269 break; 270 } 271 prop[stop] = 0; 272 } 273 if(force || stop != start + 1){ 274 if(!some){ 275 printf("static Rune __is%sr[] = {\n", label); 276 some = 1; 277 } 278 prop[start] = 0; 279 printf("\t0x%.4x, 0x%.4x,\n", start, stop - 1); 280 } 281 282 start = stop; 283 } 284 if(some) 285 printf("};\n\n"); 286 return some; 287 } 288 289 /* 290 * generate a mapping array for pairs with a skip between, 291 * clearing those entries covered. 292 */ 293 static int 294 mkispair(const char *label, char *prop) 295 { 296 int start, stop, some; 297 298 some = 0; 299 for(start = 0; start + 2 < NRUNES; ) { 300 if(!prop[start]){ 301 start++; 302 continue; 303 } 304 305 for(stop = start + 2; stop < NRUNES; stop += 2){ 306 if(!prop[stop]){ 307 break; 308 } 309 prop[stop] = 0; 310 } 311 if(stop != start + 2){ 312 if(!some){ 313 printf("static Rune __is%sp[] = {\n", label); 314 some = 1; 315 } 316 prop[start] = 0; 317 printf("\t0x%.4x, 0x%.4x,\n", start, stop - 2); 318 } 319 320 start = stop; 321 } 322 if(some) 323 printf("};\n\n"); 324 return some; 325 } 326 327 /* 328 * generate a properties array for singletons, clearing those cases covered. 
329 */ 330 static int 331 mkissingle(const char *label, char *prop) 332 { 333 int start, some; 334 335 some = 0; 336 for(start = 0; start < NRUNES; start++) { 337 if(!prop[start]){ 338 continue; 339 } 340 341 if(!some){ 342 printf("static Rune __is%ss[] = {\n", label); 343 some = 1; 344 } 345 prop[start] = 0; 346 printf("\t0x%.4x,\n", start); 347 } 348 if(some) 349 printf("};\n\n"); 350 return some; 351 } 352 353 /* 354 * generate tables and a function for is<label>rune 355 */ 356 static void 357 mkis(const char* label, char* prop, int usepairs) 358 { 359 int isr, isp, iss; 360 361 isr = mkisrange(label, prop, 0); 362 isp = 0; 363 if(usepairs) 364 isp = mkispair(label, prop); 365 iss = mkissingle(label, prop); 366 367 printf( 368 "int\n" 369 "is%srune(Rune c)\n" 370 "{\n" 371 " Rune *p;\n" 372 "\n", 373 label); 374 375 if(isr) 376 printf( 377 " p = rbsearch(c, __is%sr, nelem(__is%sr)/2, 2);\n" 378 " if(p && c >= p[0] && c <= p[1])\n" 379 " return 1;\n", 380 label, label); 381 382 if(isp) 383 printf( 384 " p = rbsearch(c, __is%sp, nelem(__is%sp)/2, 2);\n" 385 " if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n" 386 " return 1;\n", 387 label, label); 388 389 if(iss) 390 printf( 391 " p = rbsearch(c, __is%ss, nelem(__is%ss), 1);\n" 392 " if(p && c == p[0])\n" 393 " return 1;\n", 394 label, label); 395 396 397 printf( 398 " return 0;\n" 399 "}\n" 400 "\n" 401 ); 402 } 403 404 /* 405 * generate a mapping array for ranges, clearing those entries covered. 406 * if force, generate one-entry ranges for singletons. 
407 */ 408 static int 409 mktorange(const char* label, int* map, int force) 410 { 411 int start, stop, delta, some; 412 413 some = 0; 414 for(start = 0; start < NRUNES; ) { 415 if(map[start] == start){ 416 start++; 417 continue; 418 } 419 420 delta = TO_DELTA(map[start], start); 421 if(delta != (Rune)delta) 422 fatal("bad map delta %d", delta); 423 for(stop = start + 1; stop < NRUNES; stop++){ 424 if(TO_DELTA(map[stop], stop) != delta){ 425 break; 426 } 427 map[stop] = stop; 428 } 429 if(stop != start + 1){ 430 if(!some){ 431 printf("static Rune __to%sr[] = {\n", label); 432 some = 1; 433 } 434 map[start] = start; 435 printf("\t0x%.4x, 0x%.4x, %d,\n", start, stop - 1, delta); 436 } 437 438 start = stop; 439 } 440 if(some) 441 printf("};\n\n"); 442 return some; 443 } 444 445 /* 446 * generate a mapping array for pairs with a skip between, 447 * clearing those entries covered. 448 */ 449 static int 450 mktopair(const char* label, int* map) 451 { 452 int start, stop, delta, some; 453 454 some = 0; 455 for(start = 0; start + 2 < NRUNES; ) { 456 if(map[start] == start){ 457 start++; 458 continue; 459 } 460 461 delta = TO_DELTA(map[start], start); 462 if(delta != (Rune)delta) 463 fatal("bad map delta %d", delta); 464 for(stop = start + 2; stop < NRUNES; stop += 2){ 465 if(TO_DELTA(map[stop], stop) != delta){ 466 break; 467 } 468 map[stop] = stop; 469 } 470 if(stop != start + 2){ 471 if(!some){ 472 printf("static Rune __to%sp[] = {\n", label); 473 some = 1; 474 } 475 map[start] = start; 476 printf("\t0x%.4x, 0x%.4x, %d,\n", start, stop - 2, delta); 477 } 478 479 start = stop; 480 } 481 if(some) 482 printf("};\n\n"); 483 return some; 484 } 485 486 /* 487 * generate a mapping array for singletons, clearing those entries covered. 
488 */ 489 static int 490 mktosingle(const char* label, int* map) 491 { 492 int start, delta, some; 493 494 some = 0; 495 for(start = 0; start < NRUNES; start++) { 496 if(map[start] == start){ 497 continue; 498 } 499 500 delta = TO_DELTA(map[start], start); 501 if(delta != (Rune)delta) 502 fatal("bad map delta %d", delta); 503 if(!some){ 504 printf("static Rune __to%ss[] = {\n", label); 505 some = 1; 506 } 507 map[start] = start; 508 printf("\t0x%.4x, %d,\n", start, delta); 509 } 510 if(some) 511 printf("};\n\n"); 512 return some; 513 } 514 515 /* 516 * generate tables and a function for to<label>rune 517 */ 518 static void 519 mkto(const char* label, int* map, int usepairs) 520 { 521 int tor, top, tos; 522 523 tor = mktorange(label, map, 0); 524 top = 0; 525 if(usepairs) 526 top = mktopair(label, map); 527 tos = mktosingle(label, map); 528 529 printf( 530 "Rune\n" 531 "to%srune(Rune c)\n" 532 "{\n" 533 " Rune *p;\n" 534 "\n", 535 label); 536 537 if(tor) 538 printf( 539 " p = rbsearch(c, __to%sr, nelem(__to%sr)/3, 3);\n" 540 " if(p && c >= p[0] && c <= p[1])\n" 541 " return c + p[2] - %d;\n", 542 label, label, TO_OFFSET); 543 544 if(top) 545 printf( 546 " p = rbsearch(c, __to%sp, nelem(__to%sp)/3, 3);\n" 547 " if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n" 548 " return c + p[2] - %d;\n", 549 label, label, TO_OFFSET); 550 551 if(tos) 552 printf( 553 " p = rbsearch(c, __to%ss, nelem(__to%ss)/2, 2);\n" 554 " if(p && c == p[0])\n" 555 " return c + p[1] - %d;\n", 556 label, label, TO_OFFSET); 557 558 559 printf( 560 " return c;\n" 561 "}\n" 562 "\n" 563 ); 564 } 565 566 // Make only range tables and a function for is<label>rune. 
567 static void 568 mkisronly(const char* label, char* prop) 569 { 570 mkisrange(label, prop, 1); 571 printf( 572 "int\n" 573 "is%srune(Rune c)\n" 574 "{\n" 575 " Rune *p;\n" 576 "\n" 577 " p = rbsearch(c, __is%sr, nelem(__is%sr)/2, 2);\n" 578 " if(p && c >= p[0] && c <= p[1])\n" 579 " return 1;\n" 580 " return 0;\n" 581 "}\n" 582 "\n", 583 label, label, label); 584 } 585 586 /* 587 * generate the body of runetype. 588 * assumes there is a function Rune* rbsearch(Rune c, Rune *t, int n, int ne); 589 */ 590 static void 591 mktables(char *src, int usepairs) 592 { 593 printf("/* generated automatically by mkrunetype.c from %s */\n\n", src); 594 595 /* 596 * we special case the space and digit tables, since they are assumed 597 * to be small with several ranges. 598 */ 599 mkisronly("space", myisspace); 600 mkisronly("digit", myisdigit); 601 602 mkis("alpha", myisalpha, 0); 603 mkis("upper", myisupper, usepairs); 604 mkis("lower", myislower, usepairs); 605 mkis("title", myistitle, usepairs); 606 607 mkto("upper", mytoupper, usepairs); 608 mkto("lower", mytolower, usepairs); 609 mkto("title", mytotitle, usepairs); 610 } 611 612 /* 613 * find differences between the newly generated tables and current runetypes. 
614 */ 615 static void 616 check(void) 617 { 618 int i; 619 620 for(i = 0; i < NRUNES; i++){ 621 if(isdigitrune(i) != myisdigit[i]) 622 fprintf(stderr, "isdigit diff at %x: runetype=%x, unicode=%x\n", 623 i, isdigitrune(i), myisdigit[i]); 624 625 if(isspacerune(i) != myisspace[i]) 626 fprintf(stderr, "isspace diff at %x: runetype=%x, unicode=%x\n", 627 i, isspacerune(i), myisspace[i]); 628 629 if(isupperrune(i) != myisupper[i]) 630 fprintf(stderr, "isupper diff at %x: runetype=%x, unicode=%x\n", 631 i, isupperrune(i), myisupper[i]); 632 633 if(islowerrune(i) != myislower[i]) 634 fprintf(stderr, "islower diff at %x: runetype=%x, unicode=%x\n", 635 i, islowerrune(i), myislower[i]); 636 637 if(isalpharune(i) != myisalpha[i]) 638 fprintf(stderr, "isalpha diff at %x: runetype=%x, unicode=%x\n", 639 i, isalpharune(i), myisalpha[i]); 640 641 if(toupperrune(i) != mytoupper[i]) 642 fprintf(stderr, "toupper diff at %x: runetype=%x, unicode=%x\n", 643 i, toupperrune(i), mytoupper[i]); 644 645 if(tolowerrune(i) != mytolower[i]) 646 fprintf(stderr, "tolower diff at %x: runetype=%x, unicode=%x\n", 647 i, tolowerrune(i), mytolower[i]); 648 649 if(istitlerune(i) != myistitle[i]) 650 fprintf(stderr, "istitle diff at %x: runetype=%x, unicode=%x\n", 651 i, istitlerune(i), myistitle[i]); 652 653 if(totitlerune(i) != mytotitle[i]) 654 fprintf(stderr, "totitle diff at %x: runetype=%x, unicode=%x\n", 655 i, totitlerune(i), mytotitle[i]); 656 657 658 } 659 } 660 661 static int 662 mygetfields(char **fields, int nfields, char *str, const char *delim) 663 { 664 int nf; 665 666 fields[0] = str; 667 nf = 1; 668 if(nf >= nfields) 669 return nf; 670 671 for(; *str; str++){ 672 if(strchr(delim, *str) != NULL){ 673 *str = '\0'; 674 fields[nf++] = str + 1; 675 if(nf >= nfields) 676 break; 677 } 678 } 679 return nf; 680 } 681 682 static int 683 getunicodeline(FILE *in, char **fields, char *buf) 684 { 685 char *p; 686 687 if(fgets(buf, MAX_LINE, in) == NULL) 688 return 0; 689 690 p = strchr(buf, 
'\n'); 691 if (p == NULL) 692 fatal("line too long"); 693 *p = '\0'; 694 695 if (mygetfields(fields, NFIELDS + 1, buf, ";") != NFIELDS) 696 fatal("bad number of fields"); 697 698 return 1; 699 } 700 701 static int 702 getcode(char *s) 703 { 704 int i, code; 705 706 code = 0; 707 i = 0; 708 /* Parse a hex number */ 709 while(s[i]) { 710 code <<= 4; 711 if(s[i] >= '0' && s[i] <= '9') 712 code += s[i] - '0'; 713 else if(s[i] >= 'A' && s[i] <= 'F') 714 code += s[i] - 'A' + 10; 715 else 716 fatal("bad code char '%c'", s[i]); 717 i++; 718 } 719 return code; 720 } 721 722 static void 723 fatal(const char *fmt, ...) 724 { 725 va_list arg; 726 727 fprintf(stderr, "%s: fatal error: ", argv0); 728 va_start(arg, fmt); 729 vfprintf(stderr, fmt, arg); 730 va_end(arg); 731 fprintf(stderr, "\n"); 732 733 exit(1); 734 }