modernc.org/cc@v1.0.1/v2/testdata/_sqlite/ext/fts3/unicode/mkunicode.tcl (about) 1 2 source [file join [file dirname [info script]] parseunicode.tcl] 3 4 proc print_rd {map} { 5 global tl_lookup_table 6 set aChar [list] 7 set lRange [list] 8 9 set nRange 1 10 set iFirst [lindex $map 0 0] 11 set cPrev [lindex $map 0 1] 12 13 foreach m [lrange $map 1 end] { 14 foreach {i c} $m {} 15 16 if {$cPrev == $c} { 17 for {set j [expr $iFirst+$nRange]} {$j<$i} {incr j} { 18 if {[info exists tl_lookup_table($j)]==0} break 19 } 20 21 if {$j==$i} { 22 set nNew [expr {(1 + $i - $iFirst)}] 23 if {$nNew<=8} { 24 set nRange $nNew 25 continue 26 } 27 } 28 } 29 30 lappend lRange [list $iFirst $nRange] 31 lappend aChar $cPrev 32 33 set iFirst $i 34 set cPrev $c 35 set nRange 1 36 } 37 lappend lRange [list $iFirst $nRange] 38 lappend aChar $cPrev 39 40 puts "/*" 41 puts "** If the argument is a codepoint corresponding to a lowercase letter" 42 puts "** in the ASCII range with a diacritic added, return the codepoint" 43 puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN" 44 puts "** SMALL LETTER E WITH DIAERESIS\" - return 65 (\"LATIN SMALL LETTER" 45 puts "** E\"). The resuls of passing a codepoint that corresponds to an" 46 puts "** uppercase letter are undefined." 47 puts "*/" 48 puts "static int ${::remove_diacritic}(int c)\{" 49 puts " unsigned short aDia\[\] = \{" 50 puts -nonewline " 0, " 51 set i 1 52 foreach r $lRange { 53 foreach {iCode nRange} $r {} 54 if {($i % 8)==0} {puts "" ; puts -nonewline " " } 55 incr i 56 57 puts -nonewline [format "%5d" [expr ($iCode<<3) + $nRange-1]] 58 puts -nonewline ", " 59 } 60 puts "" 61 puts " \};" 62 puts " char aChar\[\] = \{" 63 puts -nonewline " '\\0', " 64 set i 1 65 foreach c $aChar { 66 set str "'$c', " 67 if {$c == ""} { set str "'\\0', " } 68 69 if {($i % 12)==0} {puts "" ; puts -nonewline " " } 70 incr i 71 puts -nonewline "$str" 72 } 73 puts "" 74 puts " \};" 75 puts { 76 unsigned int key = (((unsigned int)c)<<3) | 0x00000007; 77 int iRes = 0; 78 int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1; 79 int iLo = 0; 80 while( iHi>=iLo ){ 81 int iTest = (iHi + iLo) / 2; 82 if( key >= aDia[iTest] ){ 83 iRes = iTest; 84 iLo = iTest+1; 85 }else{ 86 iHi = iTest-1; 87 } 88 } 89 assert( key>=aDia[iRes] ); 90 return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);} 91 puts "\}" 92 } 93 94 proc print_isdiacritic {zFunc map} { 95 96 set lCode [list] 97 foreach m $map { 98 foreach {code char} $m {} 99 if {$code && $char == ""} { lappend lCode $code } 100 } 101 set lCode [lsort -integer $lCode] 102 set iFirst [lindex $lCode 0] 103 set iLast [lindex $lCode end] 104 105 set i1 0 106 set i2 0 107 108 foreach c $lCode { 109 set i [expr $c - $iFirst] 110 if {$i < 32} { 111 set i1 [expr {$i1 | (1<<$i)}] 112 } else { 113 set i2 [expr {$i2 | (1<<($i-32))}] 114 } 115 } 116 117 puts "/*" 118 puts "** Return true if the argument interpreted as a unicode codepoint" 119 puts "** is a diacritical modifier character." 120 puts "*/" 121 puts "int ${zFunc}\(int c)\{" 122 puts " unsigned int mask0 = [format "0x%08X" $i1];" 123 puts " unsigned int mask1 = [format "0x%08X" $i2];" 124 125 puts " if( c<$iFirst || c>$iLast ) return 0;" 126 puts " return (c < $iFirst+32) ?" 127 puts " (mask0 & (1 << (c-$iFirst))) :" 128 puts " (mask1 & (1 << (c-$iFirst-32)));" 129 puts "\}" 130 } 131 132 133 #------------------------------------------------------------------------- 134 135 proc an_load_separator_ranges {} { 136 global unicodedata.txt 137 set lSep [an_load_unicodedata_text ${unicodedata.txt}] 138 unset -nocomplain iFirst 139 unset -nocomplain nRange 140 set lRange [list] 141 foreach sep $lSep { 142 if {0==[info exists iFirst]} { 143 set iFirst $sep 144 set nRange 1 145 } elseif { $sep == ($iFirst+$nRange) } { 146 incr nRange 147 } else { 148 lappend lRange [list $iFirst $nRange] 149 set iFirst $sep 150 set nRange 1 151 } 152 } 153 lappend lRange [list $iFirst $nRange] 154 set lRange 155 } 156 157 proc an_print_range_array {lRange} { 158 set iFirstMax 0 159 set nRangeMax 0 160 foreach range $lRange { 161 foreach {iFirst nRange} $range {} 162 if {$iFirst > $iFirstMax} {set iFirstMax $iFirst} 163 if {$nRange > $nRangeMax} {set nRangeMax $nRange} 164 } 165 if {$iFirstMax >= (1<<22)} {error "first-max is too large for format"} 166 if {$nRangeMax >= (1<<10)} {error "range-max is too large for format"} 167 168 puts -nonewline " " 169 puts [string trim { 170 /* Each unsigned integer in the following array corresponds to a contiguous 171 ** range of unicode codepoints that are not either letters or numbers (i.e. 172 ** codepoints for which this function should return 0). 173 ** 174 ** The most significant 22 bits in each 32-bit value contain the first 175 ** codepoint in the range. The least significant 10 bits are used to store 176 ** the size of the range (always at least 1). In other words, the value 177 ** ((C<<22) + N) represents a range of N codepoints starting with codepoint 178 ** C. It is not possible to represent a range larger than 1023 codepoints 179 ** using this format. 180 */ 181 }] 182 puts -nonewline " static const unsigned int aEntry\[\] = \{" 183 set i 0 184 foreach range $lRange { 185 foreach {iFirst nRange} $range {} 186 set u32 [format "0x%08X" [expr ($iFirst<<10) + $nRange]] 187 188 if {($i % 5)==0} {puts "" ; puts -nonewline " "} 189 puts -nonewline " $u32," 190 incr i 191 } 192 puts "" 193 puts " \};" 194 } 195 196 proc an_print_ascii_bitmap {lRange} { 197 foreach range $lRange { 198 foreach {iFirst nRange} $range {} 199 for {set i $iFirst} {$i < ($iFirst+$nRange)} {incr i} { 200 if {$i<=127} { set a($i) 1 } 201 } 202 } 203 204 set aAscii [list 0 0 0 0] 205 foreach key [array names a] { 206 set idx [expr $key >> 5] 207 lset aAscii $idx [expr [lindex $aAscii $idx] | (1 << ($key&0x001F))] 208 } 209 210 puts " static const unsigned int aAscii\[4\] = \{" 211 puts -nonewline " " 212 foreach v $aAscii { puts -nonewline [format " 0x%08X," $v] } 213 puts "" 214 puts " \};" 215 } 216 217 proc print_isalnum {zFunc lRange} { 218 puts "/*" 219 puts "** Return true if the argument corresponds to a unicode codepoint" 220 puts "** classified as either a letter or a number. Otherwise false." 221 puts "**" 222 puts "** The results are undefined if the value passed to this function" 223 puts "** is less than zero." 224 puts "*/" 225 puts "int ${zFunc}\(int c)\{" 226 an_print_range_array $lRange 227 an_print_ascii_bitmap $lRange 228 puts { 229 if( (unsigned int)c<128 ){ 230 return ( (aAscii[c >> 5] & ((unsigned int)1 << (c & 0x001F)))==0 ); 231 }else if( (unsigned int)c<(1<<22) ){ 232 unsigned int key = (((unsigned int)c)<<10) | 0x000003FF; 233 int iRes = 0; 234 int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1; 235 int iLo = 0; 236 while( iHi>=iLo ){ 237 int iTest = (iHi + iLo) / 2; 238 if( key >= aEntry[iTest] ){ 239 iRes = iTest; 240 iLo = iTest+1; 241 }else{ 242 iHi = iTest-1; 243 } 244 } 245 assert( aEntry[0]<key ); 246 assert( key>=aEntry[iRes] ); 247 return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF))); 248 } 249 return 1;} 250 puts "\}" 251 } 252 253 proc print_test_isalnum {zFunc lRange} { 254 foreach range $lRange { 255 foreach {iFirst nRange} $range {} 256 for {set i $iFirst} {$i < ($iFirst+$nRange)} {incr i} { set a($i) 1 } 257 } 258 259 puts "static int isalnum_test(int *piCode)\{" 260 puts -nonewline " unsigned char aAlnum\[\] = \{" 261 for {set i 0} {$i < 70000} {incr i} { 262 if {($i % 32)==0} { puts "" ; puts -nonewline " " } 263 set bFlag [expr ![info exists a($i)]] 264 puts -nonewline "${bFlag}," 265 } 266 puts "" 267 puts " \};" 268 269 puts -nonewline " int aLargeSep\[\] = \{" 270 set i 0 271 foreach iSep [lsort -integer [array names a]] { 272 if {$iSep<70000} continue 273 if {($i % 8)==0} { puts "" ; puts -nonewline " " } 274 puts -nonewline " $iSep," 275 incr i 276 } 277 puts "" 278 puts " \};" 279 puts -nonewline " int aLargeOther\[\] = \{" 280 set i 0 281 foreach iSep [lsort -integer [array names a]] { 282 if {$iSep<70000} continue 283 if {[info exists a([expr $iSep-1])]==0} { 284 if {($i % 8)==0} { puts "" ; puts -nonewline " " } 285 puts -nonewline " [expr $iSep-1]," 286 incr i 287 } 288 if {[info exists a([expr $iSep+1])]==0} { 289 if {($i % 8)==0} { puts "" ; puts -nonewline " " } 290 puts -nonewline " [expr $iSep+1]," 291 incr i 292 } 293 } 294 puts "" 295 puts " \};" 296 297 puts [subst -nocommands { 298 int i; 299 for(i=0; i<sizeof(aAlnum)/sizeof(aAlnum[0]); i++){ 300 if( ${zFunc}(i)!=aAlnum[i] ){ 301 *piCode = i; 302 return 1; 303 } 304 } 305 for(i=0; i<sizeof(aLargeSep)/sizeof(aLargeSep[0]); i++){ 306 if( ${zFunc}(aLargeSep[i])!=0 ){ 307 *piCode = aLargeSep[i]; 308 return 1; 309 } 310 } 311 for(i=0; i<sizeof(aLargeOther)/sizeof(aLargeOther[0]); i++){ 312 if( ${zFunc}(aLargeOther[i])!=1 ){ 313 *piCode = aLargeOther[i]; 314 return 1; 315 } 316 } 317 }] 318 puts " return 0;" 319 puts "\}" 320 } 321 322 #------------------------------------------------------------------------- 323 324 proc tl_create_records {} { 325 global tl_lookup_table 326 327 set iFirst "" 328 set nOff 0 329 set nRange 0 330 set nIncr 0 331 332 set lRecord [list] 333 foreach code [lsort -integer [array names tl_lookup_table]] { 334 set mapping $tl_lookup_table($code) 335 if {$iFirst == ""} { 336 set iFirst $code 337 set nOff [expr $mapping - $code] 338 set nRange 1 339 set nIncr 1 340 } else { 341 set diff [expr $code - ($iFirst + ($nIncr * ($nRange - 1)))] 342 if { $nRange==1 && ($diff==1 || $diff==2) } { 343 set nIncr $diff 344 } 345 346 if {$diff != $nIncr || ($mapping - $code)!=$nOff} { 347 if { $nRange==1 } {set nIncr 1} 348 lappend lRecord [list $iFirst $nIncr $nRange $nOff] 349 set iFirst $code 350 set nOff [expr $mapping - $code] 351 set nRange 1 352 set nIncr 1 353 } else { 354 incr nRange 355 } 356 } 357 } 358 359 lappend lRecord [list $iFirst $nIncr $nRange $nOff] 360 361 set lRecord 362 } 363 364 proc tl_print_table_header {} { 365 puts -nonewline " " 366 puts [string trim { 367 /* Each entry in the following array defines a rule for folding a range 368 ** of codepoints to lower case. The rule applies to a range of nRange 369 ** codepoints starting at codepoint iCode. 370 ** 371 ** If the least significant bit in flags is clear, then the rule applies 372 ** to all nRange codepoints (i.e. all nRange codepoints are upper case and 373 ** need to be folded). Or, if it is set, then the rule only applies to 374 ** every second codepoint in the range, starting with codepoint C. 375 ** 376 ** The 7 most significant bits in flags are an index into the aiOff[] 377 ** array. If a specific codepoint C does require folding, then its lower 378 ** case equivalent is ((C + aiOff[flags>>1]) & 0xFFFF). 379 ** 380 ** The contents of this array are generated by parsing the CaseFolding.txt 381 ** file distributed as part of the "Unicode Character Database". See 382 ** http://www.unicode.org for details. 383 */ 384 }] 385 puts " static const struct TableEntry \{" 386 puts " unsigned short iCode;" 387 puts " unsigned char flags;" 388 puts " unsigned char nRange;" 389 puts " \} aEntry\[\] = \{" 390 } 391 392 proc tl_print_table_entry {togglevar entry liOff} { 393 upvar $togglevar t 394 foreach {iFirst nIncr nRange nOff} $entry {} 395 396 if {$iFirst > (1<<16)} { return 1 } 397 398 if {[info exists t]==0} {set t 0} 399 if {$t==0} { puts -nonewline " " } 400 401 set flags 0 402 if {$nIncr==2} { set flags 1 ; set nRange [expr $nRange * 2]} 403 if {$nOff<0} { incr nOff [expr (1<<16)] } 404 405 set idx [lsearch $liOff $nOff] 406 if {$idx<0} {error "malfunction generating aiOff"} 407 set flags [expr $flags + $idx*2] 408 409 set txt "{$iFirst, $flags, $nRange}," 410 if {$t==2} { 411 puts $txt 412 } else { 413 puts -nonewline [format "% -23s" $txt] 414 } 415 set t [expr ($t+1)%3] 416 417 return 0 418 } 419 420 proc tl_print_table_footer {togglevar} { 421 upvar $togglevar t 422 if {$t!=0} {puts ""} 423 puts " \};" 424 } 425 426 proc tl_print_if_entry {entry} { 427 foreach {iFirst nIncr nRange nOff} $entry {} 428 if {$nIncr==2} {error "tl_print_if_entry needs improvement!"} 429 430 puts " else if( c>=$iFirst && c<[expr $iFirst+$nRange] )\{" 431 puts " ret = c + $nOff;" 432 puts " \}" 433 } 434 435 proc tl_generate_ioff_table {lRecord} { 436 foreach entry $lRecord { 437 foreach {iFirst nIncr nRange iOff} $entry {} 438 if {$iOff<0} { incr iOff [expr (1<<16)] } 439 if {[info exists a($iOff)]} continue 440 set a($iOff) 1 441 } 442 443 set liOff [lsort -integer [array names a]] 444 if {[llength $liOff]>128} { error "Too many distinct ioffs" } 445 return $liOff 446 } 447 448 proc tl_print_ioff_table {liOff} { 449 puts -nonewline " static const unsigned short aiOff\[\] = \{" 450 set i 0 451 foreach off $liOff { 452 if {($i % 8)==0} {puts "" ; puts -nonewline " "} 453 puts -nonewline [format "% -7s" "$off,"] 454 incr i 455 } 456 puts "" 457 puts " \};" 458 459 } 460 461 proc print_fold {zFunc} { 462 463 set lRecord [tl_create_records] 464 465 set lHigh [list] 466 puts "/*" 467 puts "** Interpret the argument as a unicode codepoint. If the codepoint" 468 puts "** is an upper case character that has a lower case equivalent," 469 puts "** return the codepoint corresponding to the lower case version." 470 puts "** Otherwise, return a copy of the argument." 471 puts "**" 472 puts "** The results are undefined if the value passed to this function" 473 puts "** is less than zero." 474 puts "*/" 475 puts "int ${zFunc}\(int c, int bRemoveDiacritic)\{" 476 477 set liOff [tl_generate_ioff_table $lRecord] 478 tl_print_table_header 479 foreach entry $lRecord { 480 if {[tl_print_table_entry toggle $entry $liOff]} { 481 lappend lHigh $entry 482 } 483 } 484 tl_print_table_footer toggle 485 tl_print_ioff_table $liOff 486 487 puts [subst -nocommands { 488 int ret = c; 489 490 assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 ); 491 492 if( c<128 ){ 493 if( c>='A' && c<='Z' ) ret = c + ('a' - 'A'); 494 }else if( c<65536 ){ 495 const struct TableEntry *p; 496 int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1; 497 int iLo = 0; 498 int iRes = -1; 499 500 assert( c>aEntry[0].iCode ); 501 while( iHi>=iLo ){ 502 int iTest = (iHi + iLo) / 2; 503 int cmp = (c - aEntry[iTest].iCode); 504 if( cmp>=0 ){ 505 iRes = iTest; 506 iLo = iTest+1; 507 }else{ 508 iHi = iTest-1; 509 } 510 } 511 512 assert( iRes>=0 && c>=aEntry[iRes].iCode ); 513 p = &aEntry[iRes]; 514 if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){ 515 ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF; 516 assert( ret>0 ); 517 } 518 519 if( bRemoveDiacritic ) ret = ${::remove_diacritic}(ret); 520 } 521 }] 522 523 foreach entry $lHigh { 524 tl_print_if_entry $entry 525 } 526 527 puts "" 528 puts " return ret;" 529 puts "\}" 530 } 531 532 proc print_fold_test {zFunc mappings} { 533 global tl_lookup_table 534 535 foreach m $mappings { 536 set c [lindex $m 1] 537 if {$c == ""} { 538 set extra([lindex $m 0]) 0 539 } else { 540 scan $c %c i 541 set extra([lindex $m 0]) $i 542 } 543 } 544 545 puts "static int fold_test(int *piCode)\{" 546 puts -nonewline " static int aLookup\[\] = \{" 547 for {set i 0} {$i < 70000} {incr i} { 548 549 set expected $i 550 catch { set expected $tl_lookup_table($i) } 551 set expected2 $expected 552 catch { set expected2 $extra($expected2) } 553 554 if {($i % 4)==0} { puts "" ; puts -nonewline " " } 555 puts -nonewline "$expected, $expected2, " 556 } 557 puts " \};" 558 puts " int i;" 559 puts " for(i=0; i<sizeof(aLookup)/sizeof(aLookup\[0\]); i++)\{" 560 puts " int iCode = (i/2);" 561 puts " int bFlag = i & 0x0001;" 562 puts " if( ${zFunc}\(iCode, bFlag)!=aLookup\[i\] )\{" 563 puts " *piCode = iCode;" 564 puts " return 1;" 565 puts " \}" 566 puts " \}" 567 puts " return 0;" 568 puts "\}" 569 } 570 571 572 proc print_fileheader {} { 573 puts [string trim { 574 /* 575 ** 2012 May 25 576 ** 577 ** The author disclaims copyright to this source code. In place of 578 ** a legal notice, here is a blessing: 579 ** 580 ** May you do good and not evil. 581 ** May you find forgiveness for yourself and forgive others. 582 ** May you share freely, never taking more than you give. 583 ** 584 ****************************************************************************** 585 */ 586 587 /* 588 ** DO NOT EDIT THIS MACHINE GENERATED FILE. 589 */ 590 }] 591 puts "" 592 if {$::generate_fts5_code} { 593 # no-op 594 } else { 595 puts "#ifndef SQLITE_DISABLE_FTS3_UNICODE" 596 puts "#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)" 597 } 598 puts "" 599 puts "#include <assert.h>" 600 puts "" 601 } 602 603 proc print_test_main {} { 604 puts "" 605 puts "#include <stdio.h>" 606 puts "" 607 puts "int main(int argc, char **argv)\{" 608 puts " int r1, r2;" 609 puts " int code;" 610 puts " r1 = isalnum_test(&code);" 611 puts " if( r1 ) printf(\"isalnum(): Problem with code %d\\n\",code);" 612 puts " else printf(\"isalnum(): test passed\\n\");" 613 puts " r2 = fold_test(&code);" 614 puts " if( r2 ) printf(\"fold(): Problem with code %d\\n\",code);" 615 puts " else printf(\"fold(): test passed\\n\");" 616 puts " return (r1 || r2);" 617 puts "\}" 618 } 619 620 # Proces the command line arguments. Exit early if they are not to 621 # our liking. 622 # 623 proc usage {} { 624 puts -nonewline stderr "Usage: $::argv0 ?-test? ?-fts5? " 625 puts stderr "<CaseFolding.txt file> <UnicodeData.txt file>" 626 exit 1 627 } 628 if {[llength $argv]<2} usage 629 set unicodedata.txt [lindex $argv end] 630 set casefolding.txt [lindex $argv end-1] 631 632 set remove_diacritic remove_diacritic 633 set generate_test_code 0 634 set generate_fts5_code 0 635 set function_prefix "sqlite3Fts" 636 for {set i 0} {$i < [llength $argv]-2} {incr i} { 637 switch -- [lindex $argv $i] { 638 -test { 639 set generate_test_code 1 640 } 641 -fts5 { 642 set function_prefix sqlite3Fts5 643 set generate_fts5_code 1 644 set remove_diacritic fts5_remove_diacritic 645 } 646 default { 647 usage 648 } 649 } 650 } 651 652 print_fileheader 653 654 # Print the isalnum() function to stdout. 655 # 656 set lRange [an_load_separator_ranges] 657 print_isalnum ${function_prefix}UnicodeIsalnum $lRange 658 659 # Leave a gap between the two generated C functions. 660 # 661 puts "" 662 puts "" 663 664 # Load the fold data. This is used by the [rd_XXX] commands 665 # as well as [print_fold]. 666 tl_load_casefolding_txt ${casefolding.txt} 667 668 set mappings [rd_load_unicodedata_text ${unicodedata.txt}] 669 print_rd $mappings 670 puts "" 671 puts "" 672 print_isdiacritic ${function_prefix}UnicodeIsdiacritic $mappings 673 puts "" 674 puts "" 675 676 # Print the fold() function to stdout. 677 # 678 print_fold ${function_prefix}UnicodeFold 679 680 # Print the test routines and main() function to stdout, if -test 681 # was specified. 682 # 683 if {$::generate_test_code} { 684 print_test_isalnum ${function_prefix}UnicodeIsalnum $lRange 685 print_fold_test ${function_prefix}UnicodeFold $mappings 686 print_test_main 687 } 688 689 if {$generate_fts5_code} { 690 # no-op 691 } else { 692 puts "#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */" 693 puts "#endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */" 694 }