modernc.org/cc@v1.0.1/v2/testdata/_sqlite/ext/fts3/unicode/parseunicode.tcl (about)

     1  
     2  #--------------------------------------------------------------------------
     3  # Parameter $zName must be a path to the file UnicodeData.txt. This command
     4  # reads the file and returns a list of mappings required to remove all
     5  # diacritical marks from a unicode string. Each mapping is itself a list
     6  # consisting of two elements - the unicode codepoint and the single ASCII
     7  # character that it should be replaced with, or an empty string if the 
     8  # codepoint should simply be removed from the input. Examples:
     9  #
    10  #   { 224 a  }     (replace codepoint 224 to "a")
    11  #   { 769 "" }     (remove codepoint 769 from input)
    12  #
    13  # Mappings are only returned for non-upper case codepoints. It is assumed
    14  # that the input has already been folded to lower case.
    15  #
    16  proc rd_load_unicodedata_text {zName} {
    17    global tl_lookup_table
    18  
    19    set fd [open $zName]
    20    set lField {
    21      code
    22      character_name
    23      general_category
    24      canonical_combining_classes
    25      bidirectional_category
    26      character_decomposition_mapping
    27      decimal_digit_value
    28      digit_value
    29      numeric_value
    30      mirrored
    31      unicode_1_name
    32      iso10646_comment_field
    33      uppercase_mapping
    34      lowercase_mapping
    35      titlecase_mapping
    36    }
    37    set lRet [list]
    38  
    39    while { ![eof $fd] } {
    40      set line [gets $fd]
    41      if {$line == ""} continue
    42  
    43      set fields [split $line ";"]
    44      if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
    45      foreach $lField $fields {}
    46      if { [llength $character_decomposition_mapping]!=2
    47        || [string is xdigit [lindex $character_decomposition_mapping 0]]==0
    48      } {
    49        continue
    50      }
    51  
    52      set iCode  [expr "0x$code"]
    53      set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
    54      set iDia   [expr "0x[lindex $character_decomposition_mapping 1]"]
    55  
    56      if {[info exists tl_lookup_table($iCode)]} continue
    57  
    58      if { ($iAscii >= 97 && $iAscii <= 122)
    59        || ($iAscii >= 65 && $iAscii <= 90)
    60      } {
    61        lappend lRet [list $iCode [string tolower [format %c $iAscii]]]
    62        set dia($iDia) 1
    63      }
    64    }
    65  
    66    foreach d [array names dia] {
    67      lappend lRet [list $d ""]
    68    }
    69    set lRet [lsort -integer -index 0 $lRet]
    70  
    71    close $fd
    72    set lRet
    73  }
    74  
    75  #-------------------------------------------------------------------------
    76  # Parameter $zName must be a path to the file UnicodeData.txt. This command
    77  # reads the file and returns a list of codepoints (integers). The list
    78  # contains all codepoints in the UnicodeData.txt assigned to any "General
    79  # Category" that is not a "Letter" or "Number".
    80  #
    81  proc an_load_unicodedata_text {zName} {
    82    set fd [open $zName]
    83    set lField {
    84      code
    85      character_name
    86      general_category
    87      canonical_combining_classes
    88      bidirectional_category
    89      character_decomposition_mapping
    90      decimal_digit_value
    91      digit_value
    92      numeric_value
    93      mirrored
    94      unicode_1_name
    95      iso10646_comment_field
    96      uppercase_mapping
    97      lowercase_mapping
    98      titlecase_mapping
    99    }
   100    set lRet [list]
   101  
   102    while { ![eof $fd] } {
   103      set line [gets $fd]
   104      if {$line == ""} continue
   105  
   106      set fields [split $line ";"]
   107      if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
   108      foreach $lField $fields {}
   109  
   110      set iCode [expr "0x$code"]
   111      set bAlnum [expr {
   112           [lsearch {L N} [string range $general_category 0 0]] >= 0
   113        || $general_category=="Co"
   114      }]
   115  
   116      if { !$bAlnum } { lappend lRet $iCode }
   117    }
   118  
   119    close $fd
   120    set lRet
   121  }
   122  
   123  proc tl_load_casefolding_txt {zName} {
   124    global tl_lookup_table
   125  
   126    set fd [open $zName]
   127    while { ![eof $fd] } {
   128      set line [gets $fd]
   129      if {[string range $line 0 0] == "#"} continue
   130      if {$line == ""} continue
   131  
   132      foreach x {a b c d} {unset -nocomplain $x}
   133      foreach {a b c d} [split $line ";"] {}
   134  
   135      set a2 [list]
   136      set c2 [list]
   137      foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] }
   138      foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] }
   139      set b [string trim $b]
   140      set d [string trim $d]
   141  
   142      if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 }
   143    }
   144  }
   145  
   146