modernc.org/cc@v1.0.1/v2/testdata/_sqlite/ext/fts5/fts5_unicode2.c (about)

     1  /*
     2  ** 2012 May 25
     3  **
     4  ** The author disclaims copyright to this source code.  In place of
     5  ** a legal notice, here is a blessing:
     6  **
     7  **    May you do good and not evil.
     8  **    May you find forgiveness for yourself and forgive others.
     9  **    May you share freely, never taking more than you give.
    10  **
    11  ******************************************************************************
    12  */
    13  
    14  /*
    15  ** DO NOT EDIT THIS MACHINE GENERATED FILE.
    16  */
    17  
    18  
    19  #include <assert.h>
    20  
    21  /*
    22  ** Return true if the argument corresponds to a unicode codepoint
    23  ** classified as either a letter or a number. Otherwise false.
    24  **
    25  ** The results are undefined if the value passed to this function
    26  ** is less than zero.
    27  */
    28  int sqlite3Fts5UnicodeIsalnum(int c){
    29    /* Each unsigned integer in the following array corresponds to a contiguous
    30    ** range of unicode codepoints that are not either letters or numbers (i.e.
    31    ** codepoints for which this function should return 0).
    32    **
    33    ** The most significant 22 bits in each 32-bit value contain the first 
    34    ** codepoint in the range. The least significant 10 bits are used to store
    35    ** the size of the range (always at least 1). In other words, the value 
    36    ** ((C<<22) + N) represents a range of N codepoints starting with codepoint 
    37    ** C. It is not possible to represent a range larger than 1023 codepoints 
    38    ** using this format.
    39    */
    40    static const unsigned int aEntry[] = {
    41      0x00000030, 0x0000E807, 0x00016C06, 0x0001EC2F, 0x0002AC07,
    42      0x0002D001, 0x0002D803, 0x0002EC01, 0x0002FC01, 0x00035C01,
    43      0x0003DC01, 0x000B0804, 0x000B480E, 0x000B9407, 0x000BB401,
    44      0x000BBC81, 0x000DD401, 0x000DF801, 0x000E1002, 0x000E1C01,
    45      0x000FD801, 0x00120808, 0x00156806, 0x00162402, 0x00163C01,
    46      0x00164437, 0x0017CC02, 0x00180005, 0x00181816, 0x00187802,
    47      0x00192C15, 0x0019A804, 0x0019C001, 0x001B5001, 0x001B580F,
    48      0x001B9C07, 0x001BF402, 0x001C000E, 0x001C3C01, 0x001C4401,
    49      0x001CC01B, 0x001E980B, 0x001FAC09, 0x001FD804, 0x00205804,
    50      0x00206C09, 0x00209403, 0x0020A405, 0x0020C00F, 0x00216403,
    51      0x00217801, 0x0023901B, 0x00240004, 0x0024E803, 0x0024F812,
    52      0x00254407, 0x00258804, 0x0025C001, 0x00260403, 0x0026F001,
    53      0x0026F807, 0x00271C02, 0x00272C03, 0x00275C01, 0x00278802,
    54      0x0027C802, 0x0027E802, 0x00280403, 0x0028F001, 0x0028F805,
    55      0x00291C02, 0x00292C03, 0x00294401, 0x0029C002, 0x0029D401,
    56      0x002A0403, 0x002AF001, 0x002AF808, 0x002B1C03, 0x002B2C03,
    57      0x002B8802, 0x002BC002, 0x002C0403, 0x002CF001, 0x002CF807,
    58      0x002D1C02, 0x002D2C03, 0x002D5802, 0x002D8802, 0x002DC001,
    59      0x002E0801, 0x002EF805, 0x002F1803, 0x002F2804, 0x002F5C01,
    60      0x002FCC08, 0x00300403, 0x0030F807, 0x00311803, 0x00312804,
    61      0x00315402, 0x00318802, 0x0031FC01, 0x00320802, 0x0032F001,
    62      0x0032F807, 0x00331803, 0x00332804, 0x00335402, 0x00338802,
    63      0x00340802, 0x0034F807, 0x00351803, 0x00352804, 0x00355C01,
    64      0x00358802, 0x0035E401, 0x00360802, 0x00372801, 0x00373C06,
    65      0x00375801, 0x00376008, 0x0037C803, 0x0038C401, 0x0038D007,
    66      0x0038FC01, 0x00391C09, 0x00396802, 0x003AC401, 0x003AD006,
    67      0x003AEC02, 0x003B2006, 0x003C041F, 0x003CD00C, 0x003DC417,
    68      0x003E340B, 0x003E6424, 0x003EF80F, 0x003F380D, 0x0040AC14,
    69      0x00412806, 0x00415804, 0x00417803, 0x00418803, 0x00419C07,
    70      0x0041C404, 0x0042080C, 0x00423C01, 0x00426806, 0x0043EC01,
    71      0x004D740C, 0x004E400A, 0x00500001, 0x0059B402, 0x005A0001,
    72      0x005A6C02, 0x005BAC03, 0x005C4803, 0x005CC805, 0x005D4802,
    73      0x005DC802, 0x005ED023, 0x005F6004, 0x005F7401, 0x0060000F,
    74      0x0062A401, 0x0064800C, 0x0064C00C, 0x00650001, 0x00651002,
    75      0x0066C011, 0x00672002, 0x00677822, 0x00685C05, 0x00687802,
    76      0x0069540A, 0x0069801D, 0x0069FC01, 0x006A8007, 0x006AA006,
    77      0x006C0005, 0x006CD011, 0x006D6823, 0x006E0003, 0x006E840D,
    78      0x006F980E, 0x006FF004, 0x00709014, 0x0070EC05, 0x0071F802,
    79      0x00730008, 0x00734019, 0x0073B401, 0x0073C803, 0x00770027,
    80      0x0077F004, 0x007EF401, 0x007EFC03, 0x007F3403, 0x007F7403,
    81      0x007FB403, 0x007FF402, 0x00800065, 0x0081A806, 0x0081E805,
    82      0x00822805, 0x0082801A, 0x00834021, 0x00840002, 0x00840C04,
    83      0x00842002, 0x00845001, 0x00845803, 0x00847806, 0x00849401,
    84      0x00849C01, 0x0084A401, 0x0084B801, 0x0084E802, 0x00850005,
    85      0x00852804, 0x00853C01, 0x00864264, 0x00900027, 0x0091000B,
    86      0x0092704E, 0x00940200, 0x009C0475, 0x009E53B9, 0x00AD400A,
    87      0x00B39406, 0x00B3BC03, 0x00B3E404, 0x00B3F802, 0x00B5C001,
    88      0x00B5FC01, 0x00B7804F, 0x00B8C00C, 0x00BA001A, 0x00BA6C59,
    89      0x00BC00D6, 0x00BFC00C, 0x00C00005, 0x00C02019, 0x00C0A807,
    90      0x00C0D802, 0x00C0F403, 0x00C26404, 0x00C28001, 0x00C3EC01,
    91      0x00C64002, 0x00C6580A, 0x00C70024, 0x00C8001F, 0x00C8A81E,
    92      0x00C94001, 0x00C98020, 0x00CA2827, 0x00CB003F, 0x00CC0100,
    93      0x01370040, 0x02924037, 0x0293F802, 0x02983403, 0x0299BC10,
    94      0x029A7C01, 0x029BC008, 0x029C0017, 0x029C8002, 0x029E2402,
    95      0x02A00801, 0x02A01801, 0x02A02C01, 0x02A08C09, 0x02A0D804,
    96      0x02A1D004, 0x02A20002, 0x02A2D011, 0x02A33802, 0x02A38012,
    97      0x02A3E003, 0x02A4980A, 0x02A51C0D, 0x02A57C01, 0x02A60004,
    98      0x02A6CC1B, 0x02A77802, 0x02A8A40E, 0x02A90C01, 0x02A93002,
    99      0x02A97004, 0x02A9DC03, 0x02A9EC01, 0x02AAC001, 0x02AAC803,
   100      0x02AADC02, 0x02AAF802, 0x02AB0401, 0x02AB7802, 0x02ABAC07,
   101      0x02ABD402, 0x02AF8C0B, 0x03600001, 0x036DFC02, 0x036FFC02,
   102      0x037FFC01, 0x03EC7801, 0x03ECA401, 0x03EEC810, 0x03F4F802,
   103      0x03F7F002, 0x03F8001A, 0x03F88007, 0x03F8C023, 0x03F95013,
   104      0x03F9A004, 0x03FBFC01, 0x03FC040F, 0x03FC6807, 0x03FCEC06,
   105      0x03FD6C0B, 0x03FF8007, 0x03FFA007, 0x03FFE405, 0x04040003,
   106      0x0404DC09, 0x0405E411, 0x0406400C, 0x0407402E, 0x040E7C01,
   107      0x040F4001, 0x04215C01, 0x04247C01, 0x0424FC01, 0x04280403,
   108      0x04281402, 0x04283004, 0x0428E003, 0x0428FC01, 0x04294009,
   109      0x0429FC01, 0x042CE407, 0x04400003, 0x0440E016, 0x04420003,
   110      0x0442C012, 0x04440003, 0x04449C0E, 0x04450004, 0x04460003,
   111      0x0446CC0E, 0x04471404, 0x045AAC0D, 0x0491C004, 0x05BD442E,
   112      0x05BE3C04, 0x074000F6, 0x07440027, 0x0744A4B5, 0x07480046,
   113      0x074C0057, 0x075B0401, 0x075B6C01, 0x075BEC01, 0x075C5401,
   114      0x075CD401, 0x075D3C01, 0x075DBC01, 0x075E2401, 0x075EA401,
   115      0x075F0C01, 0x07BBC002, 0x07C0002C, 0x07C0C064, 0x07C2800F,
   116      0x07C2C40E, 0x07C3040F, 0x07C3440F, 0x07C4401F, 0x07C4C03C,
   117      0x07C5C02B, 0x07C7981D, 0x07C8402B, 0x07C90009, 0x07C94002,
   118      0x07CC0021, 0x07CCC006, 0x07CCDC46, 0x07CE0014, 0x07CE8025,
   119      0x07CF1805, 0x07CF8011, 0x07D0003F, 0x07D10001, 0x07D108B6,
   120      0x07D3E404, 0x07D4003E, 0x07D50004, 0x07D54018, 0x07D7EC46,
   121      0x07D9140B, 0x07DA0046, 0x07DC0074, 0x38000401, 0x38008060,
   122      0x380400F0,
   123    };
   124    static const unsigned int aAscii[4] = {
   125      0xFFFFFFFF, 0xFC00FFFF, 0xF8000001, 0xF8000001,
   126    };
   127  
   128    if( (unsigned int)c<128 ){
   129      return ( (aAscii[c >> 5] & (1 << (c & 0x001F)))==0 );
   130    }else if( (unsigned int)c<(1<<22) ){
   131      unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
   132      int iRes = 0;
   133      int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
   134      int iLo = 0;
   135      while( iHi>=iLo ){
   136        int iTest = (iHi + iLo) / 2;
   137        if( key >= aEntry[iTest] ){
   138          iRes = iTest;
   139          iLo = iTest+1;
   140        }else{
   141          iHi = iTest-1;
   142        }
   143      }
   144      assert( aEntry[0]<key );
   145      assert( key>=aEntry[iRes] );
   146      return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
   147    }
   148    return 1;
   149  }
   150  
   151  
   152  /*
   153  ** If the argument is a codepoint corresponding to a lowercase letter
   154  ** in the ASCII range with a diacritic added, return the codepoint
   155  ** of the ASCII letter only. For example, if passed 235 - "LATIN
   156  ** SMALL LETTER E WITH DIAERESIS" - return 65 ("LATIN SMALL LETTER
   157  ** E"). The resuls of passing a codepoint that corresponds to an
   158  ** uppercase letter are undefined.
   159  */
   160  static int fts5_remove_diacritic(int c){
   161    unsigned short aDia[] = {
   162          0,  1797,  1848,  1859,  1891,  1928,  1940,  1995, 
   163       2024,  2040,  2060,  2110,  2168,  2206,  2264,  2286, 
   164       2344,  2383,  2472,  2488,  2516,  2596,  2668,  2732, 
   165       2782,  2842,  2894,  2954,  2984,  3000,  3028,  3336, 
   166       3456,  3696,  3712,  3728,  3744,  3896,  3912,  3928, 
   167       3968,  4008,  4040,  4106,  4138,  4170,  4202,  4234, 
   168       4266,  4296,  4312,  4344,  4408,  4424,  4472,  4504, 
   169       6148,  6198,  6264,  6280,  6360,  6429,  6505,  6529, 
   170      61448, 61468, 61534, 61592, 61642, 61688, 61704, 61726, 
   171      61784, 61800, 61836, 61880, 61914, 61948, 61998, 62122, 
   172      62154, 62200, 62218, 62302, 62364, 62442, 62478, 62536, 
   173      62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730, 
   174      62924, 63050, 63082, 63274, 63390, 
   175    };
   176    char aChar[] = {
   177      '\0', 'a',  'c',  'e',  'i',  'n',  'o',  'u',  'y',  'y',  'a',  'c',  
   178      'd',  'e',  'e',  'g',  'h',  'i',  'j',  'k',  'l',  'n',  'o',  'r',  
   179      's',  't',  'u',  'u',  'w',  'y',  'z',  'o',  'u',  'a',  'i',  'o',  
   180      'u',  'g',  'k',  'o',  'j',  'g',  'n',  'a',  'e',  'i',  'o',  'r',  
   181      'u',  's',  't',  'h',  'a',  'e',  'o',  'y',  '\0', '\0', '\0', '\0', 
   182      '\0', '\0', '\0', '\0', 'a',  'b',  'd',  'd',  'e',  'f',  'g',  'h',  
   183      'h',  'i',  'k',  'l',  'l',  'm',  'n',  'p',  'r',  'r',  's',  't',  
   184      'u',  'v',  'w',  'w',  'x',  'y',  'z',  'h',  't',  'w',  'y',  'a',  
   185      'e',  'i',  'o',  'u',  'y',  
   186    };
   187  
   188    unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
   189    int iRes = 0;
   190    int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
   191    int iLo = 0;
   192    while( iHi>=iLo ){
   193      int iTest = (iHi + iLo) / 2;
   194      if( key >= aDia[iTest] ){
   195        iRes = iTest;
   196        iLo = iTest+1;
   197      }else{
   198        iHi = iTest-1;
   199      }
   200    }
   201    assert( key>=aDia[iRes] );
   202    return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);
   203  }
   204  
   205  
   206  /*
   207  ** Return true if the argument interpreted as a unicode codepoint
   208  ** is a diacritical modifier character.
   209  */
   210  int sqlite3Fts5UnicodeIsdiacritic(int c){
   211    unsigned int mask0 = 0x08029FDF;
   212    unsigned int mask1 = 0x000361F8;
   213    if( c<768 || c>817 ) return 0;
   214    return (c < 768+32) ?
   215        (mask0 & (1 << (c-768))) :
   216        (mask1 & (1 << (c-768-32)));
   217  }
   218  
   219  
   220  /*
   221  ** Interpret the argument as a unicode codepoint. If the codepoint
   222  ** is an upper case character that has a lower case equivalent,
   223  ** return the codepoint corresponding to the lower case version.
   224  ** Otherwise, return a copy of the argument.
   225  **
   226  ** The results are undefined if the value passed to this function
   227  ** is less than zero.
   228  */
   229  int sqlite3Fts5UnicodeFold(int c, int bRemoveDiacritic){
   230    /* Each entry in the following array defines a rule for folding a range
   231    ** of codepoints to lower case. The rule applies to a range of nRange
   232    ** codepoints starting at codepoint iCode.
   233    **
   234    ** If the least significant bit in flags is clear, then the rule applies
   235    ** to all nRange codepoints (i.e. all nRange codepoints are upper case and
   236    ** need to be folded). Or, if it is set, then the rule only applies to
   237    ** every second codepoint in the range, starting with codepoint C.
   238    **
   239    ** The 7 most significant bits in flags are an index into the aiOff[]
   240    ** array. If a specific codepoint C does require folding, then its lower
   241    ** case equivalent is ((C + aiOff[flags>>1]) & 0xFFFF).
   242    **
   243    ** The contents of this array are generated by parsing the CaseFolding.txt
   244    ** file distributed as part of the "Unicode Character Database". See
   245    ** http://www.unicode.org for details.
   246    */
   247    static const struct TableEntry {
   248      unsigned short iCode;
   249      unsigned char flags;
   250      unsigned char nRange;
   251    } aEntry[] = {
   252      {65, 14, 26},          {181, 64, 1},          {192, 14, 23},
   253      {216, 14, 7},          {256, 1, 48},          {306, 1, 6},
   254      {313, 1, 16},          {330, 1, 46},          {376, 116, 1},
   255      {377, 1, 6},           {383, 104, 1},         {385, 50, 1},
   256      {386, 1, 4},           {390, 44, 1},          {391, 0, 1},
   257      {393, 42, 2},          {395, 0, 1},           {398, 32, 1},
   258      {399, 38, 1},          {400, 40, 1},          {401, 0, 1},
   259      {403, 42, 1},          {404, 46, 1},          {406, 52, 1},
   260      {407, 48, 1},          {408, 0, 1},           {412, 52, 1},
   261      {413, 54, 1},          {415, 56, 1},          {416, 1, 6},
   262      {422, 60, 1},          {423, 0, 1},           {425, 60, 1},
   263      {428, 0, 1},           {430, 60, 1},          {431, 0, 1},
   264      {433, 58, 2},          {435, 1, 4},           {439, 62, 1},
   265      {440, 0, 1},           {444, 0, 1},           {452, 2, 1},
   266      {453, 0, 1},           {455, 2, 1},           {456, 0, 1},
   267      {458, 2, 1},           {459, 1, 18},          {478, 1, 18},
   268      {497, 2, 1},           {498, 1, 4},           {502, 122, 1},
   269      {503, 134, 1},         {504, 1, 40},          {544, 110, 1},
   270      {546, 1, 18},          {570, 70, 1},          {571, 0, 1},
   271      {573, 108, 1},         {574, 68, 1},          {577, 0, 1},
   272      {579, 106, 1},         {580, 28, 1},          {581, 30, 1},
   273      {582, 1, 10},          {837, 36, 1},          {880, 1, 4},
   274      {886, 0, 1},           {902, 18, 1},          {904, 16, 3},
   275      {908, 26, 1},          {910, 24, 2},          {913, 14, 17},
   276      {931, 14, 9},          {962, 0, 1},           {975, 4, 1},
   277      {976, 140, 1},         {977, 142, 1},         {981, 146, 1},
   278      {982, 144, 1},         {984, 1, 24},          {1008, 136, 1},
   279      {1009, 138, 1},        {1012, 130, 1},        {1013, 128, 1},
   280      {1015, 0, 1},          {1017, 152, 1},        {1018, 0, 1},
   281      {1021, 110, 3},        {1024, 34, 16},        {1040, 14, 32},
   282      {1120, 1, 34},         {1162, 1, 54},         {1216, 6, 1},
   283      {1217, 1, 14},         {1232, 1, 88},         {1329, 22, 38},
   284      {4256, 66, 38},        {4295, 66, 1},         {4301, 66, 1},
   285      {7680, 1, 150},        {7835, 132, 1},        {7838, 96, 1},
   286      {7840, 1, 96},         {7944, 150, 8},        {7960, 150, 6},
   287      {7976, 150, 8},        {7992, 150, 8},        {8008, 150, 6},
   288      {8025, 151, 8},        {8040, 150, 8},        {8072, 150, 8},
   289      {8088, 150, 8},        {8104, 150, 8},        {8120, 150, 2},
   290      {8122, 126, 2},        {8124, 148, 1},        {8126, 100, 1},
   291      {8136, 124, 4},        {8140, 148, 1},        {8152, 150, 2},
   292      {8154, 120, 2},        {8168, 150, 2},        {8170, 118, 2},
   293      {8172, 152, 1},        {8184, 112, 2},        {8186, 114, 2},
   294      {8188, 148, 1},        {8486, 98, 1},         {8490, 92, 1},
   295      {8491, 94, 1},         {8498, 12, 1},         {8544, 8, 16},
   296      {8579, 0, 1},          {9398, 10, 26},        {11264, 22, 47},
   297      {11360, 0, 1},         {11362, 88, 1},        {11363, 102, 1},
   298      {11364, 90, 1},        {11367, 1, 6},         {11373, 84, 1},
   299      {11374, 86, 1},        {11375, 80, 1},        {11376, 82, 1},
   300      {11378, 0, 1},         {11381, 0, 1},         {11390, 78, 2},
   301      {11392, 1, 100},       {11499, 1, 4},         {11506, 0, 1},
   302      {42560, 1, 46},        {42624, 1, 24},        {42786, 1, 14},
   303      {42802, 1, 62},        {42873, 1, 4},         {42877, 76, 1},
   304      {42878, 1, 10},        {42891, 0, 1},         {42893, 74, 1},
   305      {42896, 1, 4},         {42912, 1, 10},        {42922, 72, 1},
   306      {65313, 14, 26},       
   307    };
   308    static const unsigned short aiOff[] = {
   309     1,     2,     8,     15,    16,    26,    28,    32,    
   310     37,    38,    40,    48,    63,    64,    69,    71,    
   311     79,    80,    116,   202,   203,   205,   206,   207,   
   312     209,   210,   211,   213,   214,   217,   218,   219,   
   313     775,   7264,  10792, 10795, 23228, 23256, 30204, 54721, 
   314     54753, 54754, 54756, 54787, 54793, 54809, 57153, 57274, 
   315     57921, 58019, 58363, 61722, 65268, 65341, 65373, 65406, 
   316     65408, 65410, 65415, 65424, 65436, 65439, 65450, 65462, 
   317     65472, 65476, 65478, 65480, 65482, 65488, 65506, 65511, 
   318     65514, 65521, 65527, 65528, 65529, 
   319    };
   320  
   321    int ret = c;
   322  
   323    assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );
   324  
   325    if( c<128 ){
   326      if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
   327    }else if( c<65536 ){
   328      const struct TableEntry *p;
   329      int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
   330      int iLo = 0;
   331      int iRes = -1;
   332  
   333      assert( c>aEntry[0].iCode );
   334      while( iHi>=iLo ){
   335        int iTest = (iHi + iLo) / 2;
   336        int cmp = (c - aEntry[iTest].iCode);
   337        if( cmp>=0 ){
   338          iRes = iTest;
   339          iLo = iTest+1;
   340        }else{
   341          iHi = iTest-1;
   342        }
   343      }
   344  
   345      assert( iRes>=0 && c>=aEntry[iRes].iCode );
   346      p = &aEntry[iRes];
   347      if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
   348        ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
   349        assert( ret>0 );
   350      }
   351  
   352      if( bRemoveDiacritic ) ret = fts5_remove_diacritic(ret);
   353    }
   354    
   355    else if( c>=66560 && c<66600 ){
   356      ret = c + 40;
   357    }
   358  
   359    return ret;
   360  }