modernc.org/cc@v1.0.1/v2/testdata/_sqlite/ext/fts3/fts3_unicode.c

modernc.org/cc@v1.0.1/v2/testdata/_sqlite/ext/fts3/fts3_unicode.c (about)

     1  /*
     2  ** 2012 May 24
     3  **
     4  ** The author disclaims copyright to this source code.  In place of
     5  ** a legal notice, here is a blessing:
     6  **
     7  **    May you do good and not evil.
     8  **    May you find forgiveness for yourself and forgive others.
     9  **    May you share freely, never taking more than you give.
    10  **
    11  ******************************************************************************
    12  **
    13  ** Implementation of the "unicode" full-text-search tokenizer.
    14  */
    15  
    16  #ifndef SQLITE_DISABLE_FTS3_UNICODE
    17  
    18  #include "fts3Int.h"
    19  #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
    20  
    21  #include <assert.h>
    22  #include <stdlib.h>
    23  #include <stdio.h>
    24  #include <string.h>
    25  
    26  #include "fts3_tokenizer.h"
    27  
    28  /*
    29  ** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
    30  ** from the sqlite3 source file utf.c. If this file is compiled as part
    31  ** of the amalgamation, they are not required.
    32  */
    33  #ifndef SQLITE_AMALGAMATION
    34  
    35  static const unsigned char sqlite3Utf8Trans1[] = {
    36    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    37    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    38    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    39    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    40    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    41    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    42    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    43    0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
    44  };
    45  
    46  #define READ_UTF8(zIn, zTerm, c)                           \
    47    c = *(zIn++);                                            \
    48    if( c>=0xc0 ){                                           \
    49      c = sqlite3Utf8Trans1[c-0xc0];                         \
    50      while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){            \
    51        c = (c<<6) + (0x3f & *(zIn++));                      \
    52      }                                                      \
    53      if( c<0x80                                             \
    54          || (c&0xFFFFF800)==0xD800                          \
    55          || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }        \
    56    }
    57  
    58  #define WRITE_UTF8(zOut, c) {                          \
    59    if( c<0x00080 ){                                     \
    60      *zOut++ = (u8)(c&0xFF);                            \
    61    }                                                    \
    62    else if( c<0x00800 ){                                \
    63      *zOut++ = 0xC0 + (u8)((c>>6)&0x1F);                \
    64      *zOut++ = 0x80 + (u8)(c & 0x3F);                   \
    65    }                                                    \
    66    else if( c<0x10000 ){                                \
    67      *zOut++ = 0xE0 + (u8)((c>>12)&0x0F);               \
    68      *zOut++ = 0x80 + (u8)((c>>6) & 0x3F);              \
    69      *zOut++ = 0x80 + (u8)(c & 0x3F);                   \
    70    }else{                                               \
    71      *zOut++ = 0xF0 + (u8)((c>>18) & 0x07);             \
    72      *zOut++ = 0x80 + (u8)((c>>12) & 0x3F);             \
    73      *zOut++ = 0x80 + (u8)((c>>6) & 0x3F);              \
    74      *zOut++ = 0x80 + (u8)(c & 0x3F);                   \
    75    }                                                    \
    76  }
    77  
    78  #endif /* ifndef SQLITE_AMALGAMATION */
    79  
/* Typedefs that let the two structure types used by this tokenizer be
** named without the "struct" keyword. */
typedef struct unicode_tokenizer unicode_tokenizer;
typedef struct unicode_cursor unicode_cursor;
    82  
/*
** A tokenizer instance as created by unicodeCreate(). Code in this file
** casts between sqlite3_tokenizer* and unicode_tokenizer*, which is why
** the base object is the first member.
*/
struct unicode_tokenizer {
  sqlite3_tokenizer base;         /* Base object; keep as first member */
  int bRemoveDiacritic;           /* Passed to sqlite3FtsUnicodeFold(); set by
                                  ** remove_diacritics=0/1, defaults to 1 */
  int nException;                 /* Number of entries in aiException[] */
  int *aiException;               /* Codepoints whose Isalnum() result is
                                  ** inverted; kept in ascending order so it
                                  ** can be binary-searched. Freed by
                                  ** unicodeDestroy(). */
};
    89  
/*
** State of one tokenization pass over a single input string. Allocated by
** unicodeOpen() and released by unicodeClose(). Code in this file casts
** between sqlite3_tokenizer_cursor* and unicode_cursor*, which is why the
** base object is the first member.
*/
struct unicode_cursor {
  sqlite3_tokenizer_cursor base;  /* Base object; keep as first member */
  const unsigned char *aInput;    /* Input text being tokenized (not copied) */
  int nInput;                     /* Size of aInput[] in bytes */
  int iOff;                       /* Current offset within aInput[] */
  int iToken;                     /* Index of next token to be returned */
  char *zToken;                   /* Storage for current token; grown by
                                  ** unicodeNext(), freed by unicodeClose() */
  int nAlloc;                     /* Space allocated at zToken, in bytes */
};
    99  
   100  
   101  /*
   102  ** Destroy a tokenizer allocated by unicodeCreate().
   103  */
   104  static int unicodeDestroy(sqlite3_tokenizer *pTokenizer){
   105    if( pTokenizer ){
   106      unicode_tokenizer *p = (unicode_tokenizer *)pTokenizer;
   107      sqlite3_free(p->aiException);
   108      sqlite3_free(p);
   109    }
   110    return SQLITE_OK;
   111  }
   112  
   113  /*
   114  ** As part of a tokenchars= or separators= option, the CREATE VIRTUAL TABLE
   115  ** statement has specified that the tokenizer for this table shall consider
   116  ** all characters in string zIn/nIn to be separators (if bAlnum==0) or
   117  ** token characters (if bAlnum==1).
   118  **
   119  ** For each codepoint in the zIn/nIn string, this function checks if the
   120  ** sqlite3FtsUnicodeIsalnum() function already returns the desired result.
   121  ** If so, no action is taken. Otherwise, the codepoint is added to the 
   122  ** unicode_tokenizer.aiException[] array. For the purposes of tokenization,
   123  ** the return value of sqlite3FtsUnicodeIsalnum() is inverted for all
   124  ** codepoints in the aiException[] array.
   125  **
   126  ** If a standalone diacritic mark (one that sqlite3FtsUnicodeIsdiacritic()
   127  ** identifies as a diacritic) occurs in the zIn/nIn string it is ignored.
   128  ** It is not possible to change the behavior of the tokenizer with respect
   129  ** to these codepoints.
   130  */
   131  static int unicodeAddExceptions(
   132    unicode_tokenizer *p,           /* Tokenizer to add exceptions to */
   133    int bAlnum,                     /* Replace Isalnum() return value with this */
   134    const char *zIn,                /* Array of characters to make exceptions */
   135    int nIn                         /* Length of z in bytes */
   136  ){
   137    const unsigned char *z = (const unsigned char *)zIn;
   138    const unsigned char *zTerm = &z[nIn];
   139    unsigned int iCode;
   140    int nEntry = 0;
   141  
   142    assert( bAlnum==0 || bAlnum==1 );
   143  
   144    while( z<zTerm ){
   145      READ_UTF8(z, zTerm, iCode);
   146      assert( (sqlite3FtsUnicodeIsalnum((int)iCode) & 0xFFFFFFFE)==0 );
   147      if( sqlite3FtsUnicodeIsalnum((int)iCode)!=bAlnum 
   148       && sqlite3FtsUnicodeIsdiacritic((int)iCode)==0 
   149      ){
   150        nEntry++;
   151      }
   152    }
   153  
   154    if( nEntry ){
   155      int *aNew;                    /* New aiException[] array */
   156      int nNew;                     /* Number of valid entries in array aNew[] */
   157  
   158      aNew = sqlite3_realloc(p->aiException, (p->nException+nEntry)*sizeof(int));
   159      if( aNew==0 ) return SQLITE_NOMEM;
   160      nNew = p->nException;
   161  
   162      z = (const unsigned char *)zIn;
   163      while( z<zTerm ){
   164        READ_UTF8(z, zTerm, iCode);
   165        if( sqlite3FtsUnicodeIsalnum((int)iCode)!=bAlnum 
   166         && sqlite3FtsUnicodeIsdiacritic((int)iCode)==0
   167        ){
   168          int i, j;
   169          for(i=0; i<nNew && aNew[i]<(int)iCode; i++);
   170          for(j=nNew; j>i; j--) aNew[j] = aNew[j-1];
   171          aNew[i] = (int)iCode;
   172          nNew++;
   173        }
   174      }
   175      p->aiException = aNew;
   176      p->nException = nNew;
   177    }
   178  
   179    return SQLITE_OK;
   180  }
   181  
   182  /*
   183  ** Return true if the p->aiException[] array contains the value iCode.
   184  */
   185  static int unicodeIsException(unicode_tokenizer *p, int iCode){
   186    if( p->nException>0 ){
   187      int *a = p->aiException;
   188      int iLo = 0;
   189      int iHi = p->nException-1;
   190  
   191      while( iHi>=iLo ){
   192        int iTest = (iHi + iLo) / 2;
   193        if( iCode==a[iTest] ){
   194          return 1;
   195        }else if( iCode>a[iTest] ){
   196          iLo = iTest+1;
   197        }else{
   198          iHi = iTest-1;
   199        }
   200      }
   201    }
   202  
   203    return 0;
   204  }
   205  
   206  /*
   207  ** Return true if, for the purposes of tokenization, codepoint iCode is
   208  ** considered a token character (not a separator).
   209  */
   210  static int unicodeIsAlnum(unicode_tokenizer *p, int iCode){
   211    assert( (sqlite3FtsUnicodeIsalnum(iCode) & 0xFFFFFFFE)==0 );
   212    return sqlite3FtsUnicodeIsalnum(iCode) ^ unicodeIsException(p, iCode);
   213  }
   214  
   215  /*
   216  ** Create a new tokenizer instance.
   217  */
   218  static int unicodeCreate(
   219    int nArg,                       /* Size of array argv[] */
   220    const char * const *azArg,      /* Tokenizer creation arguments */
   221    sqlite3_tokenizer **pp          /* OUT: New tokenizer handle */
   222  ){
   223    unicode_tokenizer *pNew;        /* New tokenizer object */
   224    int i;
   225    int rc = SQLITE_OK;
   226  
   227    pNew = (unicode_tokenizer *) sqlite3_malloc(sizeof(unicode_tokenizer));
   228    if( pNew==NULL ) return SQLITE_NOMEM;
   229    memset(pNew, 0, sizeof(unicode_tokenizer));
   230    pNew->bRemoveDiacritic = 1;
   231  
   232    for(i=0; rc==SQLITE_OK && i<nArg; i++){
   233      const char *z = azArg[i];
   234      int n = (int)strlen(z);
   235  
   236      if( n==19 && memcmp("remove_diacritics=1", z, 19)==0 ){
   237        pNew->bRemoveDiacritic = 1;
   238      }
   239      else if( n==19 && memcmp("remove_diacritics=0", z, 19)==0 ){
   240        pNew->bRemoveDiacritic = 0;
   241      }
   242      else if( n>=11 && memcmp("tokenchars=", z, 11)==0 ){
   243        rc = unicodeAddExceptions(pNew, 1, &z[11], n-11);
   244      }
   245      else if( n>=11 && memcmp("separators=", z, 11)==0 ){
   246        rc = unicodeAddExceptions(pNew, 0, &z[11], n-11);
   247      }
   248      else{
   249        /* Unrecognized argument */
   250        rc  = SQLITE_ERROR;
   251      }
   252    }
   253  
   254    if( rc!=SQLITE_OK ){
   255      unicodeDestroy((sqlite3_tokenizer *)pNew);
   256      pNew = 0;
   257    }
   258    *pp = (sqlite3_tokenizer *)pNew;
   259    return rc;
   260  }
   261  
   262  /*
   263  ** Prepare to begin tokenizing a particular string.  The input
   264  ** string to be tokenized is pInput[0..nBytes-1].  A cursor
   265  ** used to incrementally tokenize this string is returned in 
   266  ** *ppCursor.
   267  */
   268  static int unicodeOpen(
   269    sqlite3_tokenizer *p,           /* The tokenizer */
   270    const char *aInput,             /* Input string */
   271    int nInput,                     /* Size of string aInput in bytes */
   272    sqlite3_tokenizer_cursor **pp   /* OUT: New cursor object */
   273  ){
   274    unicode_cursor *pCsr;
   275  
   276    pCsr = (unicode_cursor *)sqlite3_malloc(sizeof(unicode_cursor));
   277    if( pCsr==0 ){
   278      return SQLITE_NOMEM;
   279    }
   280    memset(pCsr, 0, sizeof(unicode_cursor));
   281  
   282    pCsr->aInput = (const unsigned char *)aInput;
   283    if( aInput==0 ){
   284      pCsr->nInput = 0;
   285    }else if( nInput<0 ){
   286      pCsr->nInput = (int)strlen(aInput);
   287    }else{
   288      pCsr->nInput = nInput;
   289    }
   290  
   291    *pp = &pCsr->base;
   292    UNUSED_PARAMETER(p);
   293    return SQLITE_OK;
   294  }
   295  
   296  /*
   297  ** Close a tokenization cursor previously opened by a call to
   298  ** simpleOpen() above.
   299  */
   300  static int unicodeClose(sqlite3_tokenizer_cursor *pCursor){
   301    unicode_cursor *pCsr = (unicode_cursor *) pCursor;
   302    sqlite3_free(pCsr->zToken);
   303    sqlite3_free(pCsr);
   304    return SQLITE_OK;
   305  }
   306  
   307  /*
   308  ** Extract the next token from a tokenization cursor.  The cursor must
   309  ** have been opened by a prior call to simpleOpen().
   310  */
   311  static int unicodeNext(
   312    sqlite3_tokenizer_cursor *pC,   /* Cursor returned by simpleOpen */
   313    const char **paToken,           /* OUT: Token text */
   314    int *pnToken,                   /* OUT: Number of bytes at *paToken */
   315    int *piStart,                   /* OUT: Starting offset of token */
   316    int *piEnd,                     /* OUT: Ending offset of token */
   317    int *piPos                      /* OUT: Position integer of token */
   318  ){
   319    unicode_cursor *pCsr = (unicode_cursor *)pC;
   320    unicode_tokenizer *p = ((unicode_tokenizer *)pCsr->base.pTokenizer);
   321    unsigned int iCode = 0;
   322    char *zOut;
   323    const unsigned char *z = &pCsr->aInput[pCsr->iOff];
   324    const unsigned char *zStart = z;
   325    const unsigned char *zEnd;
   326    const unsigned char *zTerm = &pCsr->aInput[pCsr->nInput];
   327  
   328    /* Scan past any delimiter characters before the start of the next token.
   329    ** Return SQLITE_DONE early if this takes us all the way to the end of 
   330    ** the input.  */
   331    while( z<zTerm ){
   332      READ_UTF8(z, zTerm, iCode);
   333      if( unicodeIsAlnum(p, (int)iCode) ) break;
   334      zStart = z;
   335    }
   336    if( zStart>=zTerm ) return SQLITE_DONE;
   337  
   338    zOut = pCsr->zToken;
   339    do {
   340      int iOut;
   341  
   342      /* Grow the output buffer if required. */
   343      if( (zOut-pCsr->zToken)>=(pCsr->nAlloc-4) ){
   344        char *zNew = sqlite3_realloc(pCsr->zToken, pCsr->nAlloc+64);
   345        if( !zNew ) return SQLITE_NOMEM;
   346        zOut = &zNew[zOut - pCsr->zToken];
   347        pCsr->zToken = zNew;
   348        pCsr->nAlloc += 64;
   349      }
   350  
   351      /* Write the folded case of the last character read to the output */
   352      zEnd = z;
   353      iOut = sqlite3FtsUnicodeFold((int)iCode, p->bRemoveDiacritic);
   354      if( iOut ){
   355        WRITE_UTF8(zOut, iOut);
   356      }
   357  
   358      /* If the cursor is not at EOF, read the next character */
   359      if( z>=zTerm ) break;
   360      READ_UTF8(z, zTerm, iCode);
   361    }while( unicodeIsAlnum(p, (int)iCode) 
   362         || sqlite3FtsUnicodeIsdiacritic((int)iCode)
   363    );
   364  
   365    /* Set the output variables and return. */
   366    pCsr->iOff = (int)(z - pCsr->aInput);
   367    *paToken = pCsr->zToken;
   368    *pnToken = (int)(zOut - pCsr->zToken);
   369    *piStart = (int)(zStart - pCsr->aInput);
   370    *piEnd = (int)(zEnd - pCsr->aInput);
   371    *piPos = pCsr->iToken++;
   372    return SQLITE_OK;
   373  }
   374  
/*
** Set *ppModule to a pointer to the sqlite3_tokenizer_module
** structure for the unicode tokenizer.
**
** The structure has static storage duration, so the pointer stays valid
** after this function returns. The initializer is positional: the order
** of the entries must match the member order of sqlite3_tokenizer_module
** as declared in fts3_tokenizer.h.
*/
void sqlite3Fts3UnicodeTokenizer(sqlite3_tokenizer_module const **ppModule){
  static const sqlite3_tokenizer_module module = {
    0,                            /* NOTE(review): presumably the interface
                                  ** version field - confirm in header */
    unicodeCreate,                /* Create a tokenizer instance */
    unicodeDestroy,               /* Destroy a tokenizer instance */
    unicodeOpen,                  /* Open a cursor on an input string */
    unicodeClose,                 /* Close a cursor */
    unicodeNext,                  /* Return the next token */
    0,                            /* NOTE(review): trailing slot left null;
                                  ** presumably an optional method - confirm
                                  ** in header */
  };
  *ppModule = &module;
}
   391  
   392  #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
   393  #endif /* ifndef SQLITE_DISABLE_FTS3_UNICODE */