/*
** 2012 May 25
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
******************************************************************************
*/

/*
** DO NOT EDIT THIS MACHINE GENERATED FILE.
*/


#include <assert.h>

/*
** Return true if the argument corresponds to a unicode codepoint
** classified as either a letter or a number. Otherwise false.
**
** Codepoints of (1<<22) or greater are always reported as alphanumeric.
**
** The results are undefined if the value passed to this function
** is less than zero.
*/
int sqlite3Fts5UnicodeIsalnum(int c){
  /* Each unsigned integer in the following array corresponds to a contiguous
  ** range of unicode codepoints that are not either letters or numbers (i.e.
  ** codepoints for which this function should return 0).
  **
  ** The most significant 22 bits in each 32-bit value contain the first
  ** codepoint in the range. The least significant 10 bits are used to store
  ** the size of the range (always at least 1). In other words, the value
  ** ((C<<10) + N) represents a range of N codepoints starting with codepoint
  ** C. It is not possible to represent a range larger than 1023 codepoints
  ** using this format.
  */
  static const unsigned int aEntry[] = {
    0x00000030, 0x0000E807, 0x00016C06, 0x0001EC2F, 0x0002AC07,
    0x0002D001, 0x0002D803, 0x0002EC01, 0x0002FC01, 0x00035C01,
    0x0003DC01, 0x000B0804, 0x000B480E, 0x000B9407, 0x000BB401,
    0x000BBC81, 0x000DD401, 0x000DF801, 0x000E1002, 0x000E1C01,
    0x000FD801, 0x00120808, 0x00156806, 0x00162402, 0x00163C01,
    0x00164437, 0x0017CC02, 0x00180005, 0x00181816, 0x00187802,
    0x00192C15, 0x0019A804, 0x0019C001, 0x001B5001, 0x001B580F,
    0x001B9C07, 0x001BF402, 0x001C000E, 0x001C3C01, 0x001C4401,
    0x001CC01B, 0x001E980B, 0x001FAC09, 0x001FD804, 0x00205804,
    0x00206C09, 0x00209403, 0x0020A405, 0x0020C00F, 0x00216403,
    0x00217801, 0x0023901B, 0x00240004, 0x0024E803, 0x0024F812,
    0x00254407, 0x00258804, 0x0025C001, 0x00260403, 0x0026F001,
    0x0026F807, 0x00271C02, 0x00272C03, 0x00275C01, 0x00278802,
    0x0027C802, 0x0027E802, 0x00280403, 0x0028F001, 0x0028F805,
    0x00291C02, 0x00292C03, 0x00294401, 0x0029C002, 0x0029D401,
    0x002A0403, 0x002AF001, 0x002AF808, 0x002B1C03, 0x002B2C03,
    0x002B8802, 0x002BC002, 0x002C0403, 0x002CF001, 0x002CF807,
    0x002D1C02, 0x002D2C03, 0x002D5802, 0x002D8802, 0x002DC001,
    0x002E0801, 0x002EF805, 0x002F1803, 0x002F2804, 0x002F5C01,
    0x002FCC08, 0x00300403, 0x0030F807, 0x00311803, 0x00312804,
    0x00315402, 0x00318802, 0x0031FC01, 0x00320802, 0x0032F001,
    0x0032F807, 0x00331803, 0x00332804, 0x00335402, 0x00338802,
    0x00340802, 0x0034F807, 0x00351803, 0x00352804, 0x00355C01,
    0x00358802, 0x0035E401, 0x00360802, 0x00372801, 0x00373C06,
    0x00375801, 0x00376008, 0x0037C803, 0x0038C401, 0x0038D007,
    0x0038FC01, 0x00391C09, 0x00396802, 0x003AC401, 0x003AD006,
    0x003AEC02, 0x003B2006, 0x003C041F, 0x003CD00C, 0x003DC417,
    0x003E340B, 0x003E6424, 0x003EF80F, 0x003F380D, 0x0040AC14,
    0x00412806, 0x00415804, 0x00417803, 0x00418803, 0x00419C07,
    0x0041C404, 0x0042080C, 0x00423C01, 0x00426806, 0x0043EC01,
    0x004D740C, 0x004E400A, 0x00500001, 0x0059B402, 0x005A0001,
    0x005A6C02, 0x005BAC03, 0x005C4803, 0x005CC805, 0x005D4802,
    0x005DC802, 0x005ED023, 0x005F6004, 0x005F7401, 0x0060000F,
    0x0062A401, 0x0064800C, 0x0064C00C, 0x00650001, 0x00651002,
    0x0066C011, 0x00672002, 0x00677822, 0x00685C05, 0x00687802,
    0x0069540A, 0x0069801D, 0x0069FC01, 0x006A8007, 0x006AA006,
    0x006C0005, 0x006CD011, 0x006D6823, 0x006E0003, 0x006E840D,
    0x006F980E, 0x006FF004, 0x00709014, 0x0070EC05, 0x0071F802,
    0x00730008, 0x00734019, 0x0073B401, 0x0073C803, 0x00770027,
    0x0077F004, 0x007EF401, 0x007EFC03, 0x007F3403, 0x007F7403,
    0x007FB403, 0x007FF402, 0x00800065, 0x0081A806, 0x0081E805,
    0x00822805, 0x0082801A, 0x00834021, 0x00840002, 0x00840C04,
    0x00842002, 0x00845001, 0x00845803, 0x00847806, 0x00849401,
    0x00849C01, 0x0084A401, 0x0084B801, 0x0084E802, 0x00850005,
    0x00852804, 0x00853C01, 0x00864264, 0x00900027, 0x0091000B,
    0x0092704E, 0x00940200, 0x009C0475, 0x009E53B9, 0x00AD400A,
    0x00B39406, 0x00B3BC03, 0x00B3E404, 0x00B3F802, 0x00B5C001,
    0x00B5FC01, 0x00B7804F, 0x00B8C00C, 0x00BA001A, 0x00BA6C59,
    0x00BC00D6, 0x00BFC00C, 0x00C00005, 0x00C02019, 0x00C0A807,
    0x00C0D802, 0x00C0F403, 0x00C26404, 0x00C28001, 0x00C3EC01,
    0x00C64002, 0x00C6580A, 0x00C70024, 0x00C8001F, 0x00C8A81E,
    0x00C94001, 0x00C98020, 0x00CA2827, 0x00CB003F, 0x00CC0100,
    0x01370040, 0x02924037, 0x0293F802, 0x02983403, 0x0299BC10,
    0x029A7C01, 0x029BC008, 0x029C0017, 0x029C8002, 0x029E2402,
    0x02A00801, 0x02A01801, 0x02A02C01, 0x02A08C09, 0x02A0D804,
    0x02A1D004, 0x02A20002, 0x02A2D011, 0x02A33802, 0x02A38012,
    0x02A3E003, 0x02A4980A, 0x02A51C0D, 0x02A57C01, 0x02A60004,
    0x02A6CC1B, 0x02A77802, 0x02A8A40E, 0x02A90C01, 0x02A93002,
    0x02A97004, 0x02A9DC03, 0x02A9EC01, 0x02AAC001, 0x02AAC803,
    0x02AADC02, 0x02AAF802, 0x02AB0401, 0x02AB7802, 0x02ABAC07,
    0x02ABD402, 0x02AF8C0B, 0x03600001, 0x036DFC02, 0x036FFC02,
    0x037FFC01, 0x03EC7801, 0x03ECA401, 0x03EEC810, 0x03F4F802,
    0x03F7F002, 0x03F8001A, 0x03F88007, 0x03F8C023, 0x03F95013,
    0x03F9A004, 0x03FBFC01, 0x03FC040F, 0x03FC6807, 0x03FCEC06,
    0x03FD6C0B, 0x03FF8007, 0x03FFA007, 0x03FFE405, 0x04040003,
    0x0404DC09, 0x0405E411, 0x0406400C, 0x0407402E, 0x040E7C01,
    0x040F4001, 0x04215C01, 0x04247C01, 0x0424FC01, 0x04280403,
    0x04281402, 0x04283004, 0x0428E003, 0x0428FC01, 0x04294009,
    0x0429FC01, 0x042CE407, 0x04400003, 0x0440E016, 0x04420003,
    0x0442C012, 0x04440003, 0x04449C0E, 0x04450004, 0x04460003,
    0x0446CC0E, 0x04471404, 0x045AAC0D, 0x0491C004, 0x05BD442E,
    0x05BE3C04, 0x074000F6, 0x07440027, 0x0744A4B5, 0x07480046,
    0x074C0057, 0x075B0401, 0x075B6C01, 0x075BEC01, 0x075C5401,
    0x075CD401, 0x075D3C01, 0x075DBC01, 0x075E2401, 0x075EA401,
    0x075F0C01, 0x07BBC002, 0x07C0002C, 0x07C0C064, 0x07C2800F,
    0x07C2C40E, 0x07C3040F, 0x07C3440F, 0x07C4401F, 0x07C4C03C,
    0x07C5C02B, 0x07C7981D, 0x07C8402B, 0x07C90009, 0x07C94002,
    0x07CC0021, 0x07CCC006, 0x07CCDC46, 0x07CE0014, 0x07CE8025,
    0x07CF1805, 0x07CF8011, 0x07D0003F, 0x07D10001, 0x07D108B6,
    0x07D3E404, 0x07D4003E, 0x07D50004, 0x07D54018, 0x07D7EC46,
    0x07D9140B, 0x07DA0046, 0x07DC0074, 0x38000401, 0x38008060,
    0x380400F0,
  };

  /* One bit per ASCII codepoint (four 32-bit words cover 0..127). A set
  ** bit marks a codepoint that is NOT a letter or a number. */
  static const unsigned int aAscii[4] = {
    0xFFFFFFFF, 0xFC00FFFF, 0xF8000001, 0xF8000001,
  };

  if( (unsigned int)c<128 ){
    /* The shifted constant must be unsigned: (c & 0x1F) may be 31, and
    ** shifting a signed 1 into the sign bit is undefined behaviour. */
    return ( (aAscii[c >> 5] & (1u << (c & 0x001F)))==0 );
  }else if( (unsigned int)c<(1<<22) ){
    /* Setting the low 10 bits of the key makes it compare greater than or
    ** equal to every entry whose first codepoint is <= c, so the binary
    ** search finds the last range that starts at or before c. */
    unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
    int iRes = 0;
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;
    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      if( key >= aEntry[iTest] ){
        iRes = iTest;
        iLo = iTest+1;
      }else{
        iHi = iTest-1;
      }
    }
    assert( aEntry[0]<key );
    assert( key>=aEntry[iRes] );
    /* c is alphanumeric iff it lies at or beyond the end of that range. */
    return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
  }
  return 1;
}


/*
** If the argument is a codepoint corresponding to a lowercase letter
** in the ASCII range with a diacritic added, return the codepoint
** of the ASCII letter only. For example, if passed 235 - "LATIN
** SMALL LETTER E WITH DIAERESIS" - return 101 ("LATIN SMALL LETTER
** E"). The results of passing a codepoint that corresponds to an
** uppercase letter are undefined.
**
** Codepoints that are not covered by any range in aDia[] are returned
** unchanged. A covered codepoint whose aChar[] entry is '\0' yields 0.
*/
static int fts5_remove_diacritic(int c){
  /* Each aDia[] entry encodes a range of codepoints: the upper 13 bits
  ** hold the first codepoint and the low 3 bits hold the number of
  ** codepoints that follow it in the range. aChar[i] is the replacement
  ** for every codepoint in range i. Both tables are read-only, so they
  ** are static const rather than being rebuilt on every call. */
  static const unsigned short aDia[] = {
        0,  1797,  1848,  1859,  1891,  1928,  1940,  1995,
     2024,  2040,  2060,  2110,  2168,  2206,  2264,  2286,
     2344,  2383,  2472,  2488,  2516,  2596,  2668,  2732,
     2782,  2842,  2894,  2954,  2984,  3000,  3028,  3336,
     3456,  3696,  3712,  3728,  3744,  3896,  3912,  3928,
     3968,  4008,  4040,  4106,  4138,  4170,  4202,  4234,
     4266,  4296,  4312,  4344,  4408,  4424,  4472,  4504,
     6148,  6198,  6264,  6280,  6360,  6429,  6505,  6529,
    61448, 61468, 61534, 61592, 61642, 61688, 61704, 61726,
    61784, 61800, 61836, 61880, 61914, 61948, 61998, 62122,
    62154, 62200, 62218, 62302, 62364, 62442, 62478, 62536,
    62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730,
    62924, 63050, 63082, 63274, 63390,
  };
  static const char aChar[] = {
    '\0', 'a',  'c',  'e',  'i',  'n',  'o',  'u',  'y',  'y',  'a',  'c',
    'd',  'e',  'e',  'g',  'h',  'i',  'j',  'k',  'l',  'n',  'o',  'r',
    's',  't',  'u',  'u',  'w',  'y',  'z',  'o',  'u',  'a',  'i',  'o',
    'u',  'g',  'k',  'o',  'j',  'g',  'n',  'a',  'e',  'i',  'o',  'r',
    'u',  's',  't',  'h',  'a',  'e',  'o',  'y',  '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', 'a',  'b',  'd',  'd',  'e',  'f',  'g',  'h',
    'h',  'i',  'k',  'l',  'l',  'm',  'n',  'p',  'r',  'r',  's',  't',
    'u',  'v',  'w',  'w',  'x',  'y',  'z',  'h',  't',  'w',  'y',  'a',
    'e',  'i',  'o',  'u',  'y',
  };

  /* With the low 3 bits set, the key compares >= every entry whose first
  ** codepoint is <= c; the search finds the last such entry. */
  unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
  int iRes = 0;
  int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
  int iLo = 0;
  while( iHi>=iLo ){
    int iTest = (iHi + iLo) / 2;
    if( key >= aDia[iTest] ){
      iRes = iTest;
      iLo = iTest+1;
    }else{
      iHi = iTest-1;
    }
  }
  assert( key>=aDia[iRes] );
  /* If c lies past the end of the selected range, leave it untouched. */
  return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);
}


/*
** Return true (non-zero, not necessarily 1) if the argument interpreted
** as a unicode codepoint is a diacritical modifier character. Only
** codepoints in the range 768..817 are ever reported as diacritics.
*/
int sqlite3Fts5UnicodeIsdiacritic(int c){
  /* Bit i of mask0 describes codepoint 768+i; bit i of mask1 describes
  ** codepoint 800+i. */
  unsigned int mask0 = 0x08029FDF;
  unsigned int mask1 = 0x000361F8;
  if( c<768 || c>817 ) return 0;
  /* Use an unsigned 1: (c-768) may be 31, and shifting a signed 1 into
  ** the sign bit is undefined behaviour. */
  return (c < 768+32) ?
      (mask0 & (1u << (c-768))) :
      (mask1 & (1u << (c-768-32)));
}


/*
** Interpret the argument as a unicode codepoint. If the codepoint
** is an upper case character that has a lower case equivalent,
** return the codepoint corresponding to the lower case version.
** Otherwise, return a copy of the argument.
**
** If bRemoveDiacritic is non-zero and c is in the range 128..65535, the
** (possibly folded) result is additionally passed through
** fts5_remove_diacritic().
**
** The results are undefined if the value passed to this function
** is less than zero.
*/
int sqlite3Fts5UnicodeFold(int c, int bRemoveDiacritic){
  /* Each entry in the following array defines a rule for folding a range
  ** of codepoints to lower case. The rule applies to a range of nRange
  ** codepoints starting at codepoint iCode.
  **
  ** If the least significant bit in flags is clear, then the rule applies
  ** to all nRange codepoints (i.e. all nRange codepoints are upper case and
  ** need to be folded). Or, if it is set, then the rule only applies to
  ** every second codepoint in the range, starting with codepoint C.
  **
  ** The 7 most significant bits in flags are an index into the aiOff[]
  ** array. If a specific codepoint C does require folding, then its lower
  ** case equivalent is ((C + aiOff[flags>>1]) & 0xFFFF).
  **
  ** The contents of this array are generated by parsing the CaseFolding.txt
  ** file distributed as part of the "Unicode Character Database". See
  ** http://www.unicode.org for details.
  */
  static const struct TableEntry {
    unsigned short iCode;
    unsigned char flags;
    unsigned char nRange;
  } aEntry[] = {
    {65, 14, 26},          {181, 64, 1},          {192, 14, 23},
    {216, 14, 7},          {256, 1, 48},          {306, 1, 6},
    {313, 1, 16},          {330, 1, 46},          {376, 116, 1},
    {377, 1, 6},           {383, 104, 1},         {385, 50, 1},
    {386, 1, 4},           {390, 44, 1},          {391, 0, 1},
    {393, 42, 2},          {395, 0, 1},           {398, 32, 1},
    {399, 38, 1},          {400, 40, 1},          {401, 0, 1},
    {403, 42, 1},          {404, 46, 1},          {406, 52, 1},
    {407, 48, 1},          {408, 0, 1},           {412, 52, 1},
    {413, 54, 1},          {415, 56, 1},          {416, 1, 6},
    {422, 60, 1},          {423, 0, 1},           {425, 60, 1},
    {428, 0, 1},           {430, 60, 1},          {431, 0, 1},
    {433, 58, 2},          {435, 1, 4},           {439, 62, 1},
    {440, 0, 1},           {444, 0, 1},           {452, 2, 1},
    {453, 0, 1},           {455, 2, 1},           {456, 0, 1},
    {458, 2, 1},           {459, 1, 18},          {478, 1, 18},
    {497, 2, 1},           {498, 1, 4},           {502, 122, 1},
    {503, 134, 1},         {504, 1, 40},          {544, 110, 1},
    {546, 1, 18},          {570, 70, 1},          {571, 0, 1},
    {573, 108, 1},         {574, 68, 1},          {577, 0, 1},
    {579, 106, 1},         {580, 28, 1},          {581, 30, 1},
    {582, 1, 10},          {837, 36, 1},          {880, 1, 4},
    {886, 0, 1},           {902, 18, 1},          {904, 16, 3},
    {908, 26, 1},          {910, 24, 2},          {913, 14, 17},
    {931, 14, 9},          {962, 0, 1},           {975, 4, 1},
    {976, 140, 1},         {977, 142, 1},         {981, 146, 1},
    {982, 144, 1},         {984, 1, 24},          {1008, 136, 1},
    {1009, 138, 1},        {1012, 130, 1},        {1013, 128, 1},
    {1015, 0, 1},          {1017, 152, 1},        {1018, 0, 1},
    {1021, 110, 3},        {1024, 34, 16},        {1040, 14, 32},
    {1120, 1, 34},         {1162, 1, 54},         {1216, 6, 1},
    {1217, 1, 14},         {1232, 1, 88},         {1329, 22, 38},
    {4256, 66, 38},        {4295, 66, 1},         {4301, 66, 1},
    {7680, 1, 150},        {7835, 132, 1},        {7838, 96, 1},
    {7840, 1, 96},         {7944, 150, 8},        {7960, 150, 6},
    {7976, 150, 8},        {7992, 150, 8},        {8008, 150, 6},
    {8025, 151, 8},        {8040, 150, 8},        {8072, 150, 8},
    {8088, 150, 8},        {8104, 150, 8},        {8120, 150, 2},
    {8122, 126, 2},        {8124, 148, 1},        {8126, 100, 1},
    {8136, 124, 4},        {8140, 148, 1},        {8152, 150, 2},
    {8154, 120, 2},        {8168, 150, 2},        {8170, 118, 2},
    {8172, 152, 1},        {8184, 112, 2},        {8186, 114, 2},
    {8188, 148, 1},        {8486, 98, 1},         {8490, 92, 1},
    {8491, 94, 1},         {8498, 12, 1},         {8544, 8, 16},
    {8579, 0, 1},          {9398, 10, 26},        {11264, 22, 47},
    {11360, 0, 1},         {11362, 88, 1},        {11363, 102, 1},
    {11364, 90, 1},        {11367, 1, 6},         {11373, 84, 1},
    {11374, 86, 1},        {11375, 80, 1},        {11376, 82, 1},
    {11378, 0, 1},         {11381, 0, 1},         {11390, 78, 2},
    {11392, 1, 100},       {11499, 1, 4},         {11506, 0, 1},
    {42560, 1, 46},        {42624, 1, 24},        {42786, 1, 14},
    {42802, 1, 62},        {42873, 1, 4},         {42877, 76, 1},
    {42878, 1, 10},        {42891, 0, 1},         {42893, 74, 1},
    {42896, 1, 4},         {42912, 1, 10},        {42922, 72, 1},
    {65313, 14, 26},
  };
  static const unsigned short aiOff[] = {
       1,     2,     8,    15,    16,    26,    28,    32,
      37,    38,    40,    48,    63,    64,    69,    71,
      79,    80,   116,   202,   203,   205,   206,   207,
     209,   210,   211,   213,   214,   217,   218,   219,
     775,  7264, 10792, 10795, 23228, 23256, 30204, 54721,
   54753, 54754, 54756, 54787, 54793, 54809, 57153, 57274,
   57921, 58019, 58363, 61722, 65268, 65341, 65373, 65406,
   65408, 65410, 65415, 65424, 65436, 65439, 65450, 65462,
   65472, 65476, 65478, 65480, 65482, 65488, 65506, 65511,
   65514, 65521, 65527, 65528, 65529,
  };

  int ret = c;

  assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );

  if( c<128 ){
    /* ASCII fast path: only 'A'..'Z' need folding. */
    if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
  }else if( c<65536 ){
    const struct TableEntry *p;
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;
    int iRes = -1;

    /* Binary search for the last rule whose iCode is <= c. */
    assert( c>aEntry[0].iCode );
    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      int cmp = (c - aEntry[iTest].iCode);
      if( cmp>=0 ){
        iRes = iTest;
        iLo = iTest+1;
      }else{
        iHi = iTest-1;
      }
    }

    assert( iRes>=0 && c>=aEntry[iRes].iCode );
    p = &aEntry[iRes];
    /* Apply the rule if c is inside its range and, for "every second
    ** codepoint" rules (flags bit 0 set), c has the same parity as iCode. */
    if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
      ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
      assert( ret>0 );
    }

    if( bRemoveDiacritic ) ret = fts5_remove_diacritic(ret);
  }

  else if( c>=66560 && c<66600 ){
    /* Codepoints 66560..66599 fold by a fixed offset of 40. */
    ret = c + 40;
  }

  return ret;
}