modernc.org/cc@v1.0.1/v2/testdata/_sqlite/ext/fts3/fts3_icu.c (about)

     1  /*
     2  ** 2007 June 22
     3  **
     4  ** The author disclaims copyright to this source code.  In place of
     5  ** a legal notice, here is a blessing:
     6  **
     7  **    May you do good and not evil.
     8  **    May you find forgiveness for yourself and forgive others.
     9  **    May you share freely, never taking more than you give.
    10  **
    11  *************************************************************************
    12  ** This file implements a tokenizer for fts3 based on the ICU library.
    13  */
    14  #include "fts3Int.h"
    15  #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
    16  #ifdef SQLITE_ENABLE_ICU
    17  
    18  #include <assert.h>
    19  #include <string.h>
    20  #include "fts3_tokenizer.h"
    21  
    22  #include <unicode/ubrk.h>
    23  #include <unicode/ucol.h>
    24  #include <unicode/ustring.h>
    25  #include <unicode/utf16.h>
    26  
    27  typedef struct IcuTokenizer IcuTokenizer;   /* Tokenizer instance built by icuCreate() */
    28  typedef struct IcuCursor IcuCursor;         /* Per-string cursor built by icuOpen() */
    29  /* An ICU tokenizer: the generic tokenizer header plus an optional locale. */
    30  struct IcuTokenizer {
    31    sqlite3_tokenizer base;     /* Generic header; icuCreate()/icuDestroy() cast between the two types */
    32    char *zLocale;              /* Locale handed to ubrk_open(); NULL when no argument was given */
    33  };
    34  
    35  struct IcuCursor {
    36    sqlite3_tokenizer_cursor base;   /* Generic cursor header; icuOpen() returns the cursor as this type */
    37  /* icuOpen() places aChar[] and aOffset[] in the same allocation as the cursor. */
    38    UBreakIterator *pIter;      /* ICU break-iterator object */
    39    int nChar;                  /* Number of UChar elements stored in aChar[] */
    40    UChar *aChar;               /* Case-folded copy of input using utf-16 encoding */
    41    int *aOffset;               /* aOffset[i]: byte offset in utf-8 input matching aChar[i] */
    42  /* Output buffer reused by successive icuNext() calls; released by icuClose(). */
    43    int nBuffer;                /* Size in bytes of the allocation at zBuffer */
    44    char *zBuffer;              /* utf-8 text of the most recently returned token */
    45  
    46    int iToken;                 /* Position reported for the next token; starts at 0 */
    47  };
    48  
    49  /*
    50  ** Create a new tokenizer instance.
    51  */
    52  static int icuCreate(
    53    int argc,                            /* Number of entries in argv[] */
    54    const char * const *argv,            /* Tokenizer creation arguments */
    55    sqlite3_tokenizer **ppTokenizer      /* OUT: Created tokenizer */
    56  ){
    57    IcuTokenizer *p;
    58    int n = 0;
    59  
    60    if( argc>0 ){
    61      n = strlen(argv[0])+1;
    62    }
    63    p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);
    64    if( !p ){
    65      return SQLITE_NOMEM;
    66    }
    67    memset(p, 0, sizeof(IcuTokenizer));
    68  
    69    if( n ){
    70      p->zLocale = (char *)&p[1];
    71      memcpy(p->zLocale, argv[0], n);
    72    }
    73  
    74    *ppTokenizer = (sqlite3_tokenizer *)p;
    75  
    76    return SQLITE_OK;
    77  }
    78  
    79  /*
    80  ** Destroy a tokenizer
    81  */
    82  static int icuDestroy(sqlite3_tokenizer *pTokenizer){
    83    IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
    84    sqlite3_free(p);
    85    return SQLITE_OK;
    86  }
    87  
    88  /*
    89  ** Prepare to begin tokenizing a particular string.  The input
    90  ** string to be tokenized is pInput[0..nBytes-1].  A cursor
    91  ** used to incrementally tokenize this string is returned in 
    92  ** *ppCursor.
    93  */
    94  static int icuOpen(
    95    sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
    96    const char *zInput,                    /* Input string */
    97    int nInput,                            /* Length of zInput in bytes */
    98    sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
    99  ){
   100    IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
   101    IcuCursor *pCsr;
   102  
   103    const int32_t opt = U_FOLD_CASE_DEFAULT;
   104    UErrorCode status = U_ZERO_ERROR;
   105    int nChar;
   106  
   107    UChar32 c;
   108    int iInput = 0;
   109    int iOut = 0;
   110  
   111    *ppCursor = 0;
   112  
   113    if( zInput==0 ){
   114      nInput = 0;
   115      zInput = "";
   116    }else if( nInput<0 ){
   117      nInput = strlen(zInput);
   118    }
   119    nChar = nInput+1;
   120    pCsr = (IcuCursor *)sqlite3_malloc(
   121        sizeof(IcuCursor) +                /* IcuCursor */
   122        ((nChar+3)&~3) * sizeof(UChar) +   /* IcuCursor.aChar[] */
   123        (nChar+1) * sizeof(int)            /* IcuCursor.aOffset[] */
   124    );
   125    if( !pCsr ){
   126      return SQLITE_NOMEM;
   127    }
   128    memset(pCsr, 0, sizeof(IcuCursor));
   129    pCsr->aChar = (UChar *)&pCsr[1];
   130    pCsr->aOffset = (int *)&pCsr->aChar[(nChar+3)&~3];
   131  
   132    pCsr->aOffset[iOut] = iInput;
   133    U8_NEXT(zInput, iInput, nInput, c); 
   134    while( c>0 ){
   135      int isError = 0;
   136      c = u_foldCase(c, opt);
   137      U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
   138      if( isError ){
   139        sqlite3_free(pCsr);
   140        return SQLITE_ERROR;
   141      }
   142      pCsr->aOffset[iOut] = iInput;
   143  
   144      if( iInput<nInput ){
   145        U8_NEXT(zInput, iInput, nInput, c);
   146      }else{
   147        c = 0;
   148      }
   149    }
   150  
   151    pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
   152    if( !U_SUCCESS(status) ){
   153      sqlite3_free(pCsr);
   154      return SQLITE_ERROR;
   155    }
   156    pCsr->nChar = iOut;
   157  
   158    ubrk_first(pCsr->pIter);
   159    *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
   160    return SQLITE_OK;
   161  }
   162  
   163  /*
   164  ** Close a tokenization cursor previously opened by a call to icuOpen().
   165  */
   166  static int icuClose(sqlite3_tokenizer_cursor *pCursor){
   167    IcuCursor *pCsr = (IcuCursor *)pCursor;
   168    ubrk_close(pCsr->pIter);
   169    sqlite3_free(pCsr->zBuffer);
   170    sqlite3_free(pCsr);
   171    return SQLITE_OK;
   172  }
   173  
   174  /*
   175  ** Extract the next token from a tokenization cursor.
   176  */
   177  static int icuNext(
   178    sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */
   179    const char **ppToken,               /* OUT: *ppToken is the token text */
   180    int *pnBytes,                       /* OUT: Number of bytes in token */
   181    int *piStartOffset,                 /* OUT: Starting offset of token */
   182    int *piEndOffset,                   /* OUT: Ending offset of token */
   183    int *piPosition                     /* OUT: Position integer of token */
   184  ){
   185    IcuCursor *pCsr = (IcuCursor *)pCursor;
   186  
   187    int iStart = 0;
   188    int iEnd = 0;
   189    int nByte = 0;
   190  
   191    while( iStart==iEnd ){
   192      UChar32 c;
   193  
   194      iStart = ubrk_current(pCsr->pIter);
   195      iEnd = ubrk_next(pCsr->pIter);
   196      if( iEnd==UBRK_DONE ){
   197        return SQLITE_DONE;
   198      }
   199  
   200      while( iStart<iEnd ){
   201        int iWhite = iStart;
   202        U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
   203        if( u_isspace(c) ){
   204          iStart = iWhite;
   205        }else{
   206          break;
   207        }
   208      }
   209      assert(iStart<=iEnd);
   210    }
   211  
   212    do {
   213      UErrorCode status = U_ZERO_ERROR;
   214      if( nByte ){
   215        char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
   216        if( !zNew ){
   217          return SQLITE_NOMEM;
   218        }
   219        pCsr->zBuffer = zNew;
   220        pCsr->nBuffer = nByte;
   221      }
   222  
   223      u_strToUTF8(
   224          pCsr->zBuffer, pCsr->nBuffer, &nByte,    /* Output vars */
   225          &pCsr->aChar[iStart], iEnd-iStart,       /* Input vars */
   226          &status                                  /* Output success/failure */
   227      );
   228    } while( nByte>pCsr->nBuffer );
   229  
   230    *ppToken = pCsr->zBuffer;
   231    *pnBytes = nByte;
   232    *piStartOffset = pCsr->aOffset[iStart];
   233    *piEndOffset = pCsr->aOffset[iEnd];
   234    *piPosition = pCsr->iToken++;
   235  
   236    return SQLITE_OK;
   237  }
   238  
   239  /*
   240  ** The set of routines that implement the ICU tokenizer
   241  */
   242  static const sqlite3_tokenizer_module icuTokenizerModule = {
   243    0,                           /* iVersion    */
   244    icuCreate,                   /* xCreate     */
   245    icuDestroy,                  /* xDestroy    */
   246    icuOpen,                     /* xOpen       */
   247    icuClose,                    /* xClose      */
   248    icuNext,                     /* xNext       */
   249    0,                           /* xLanguageid (none supplied) */
   250  };
   251  
   252  /*
   253  ** Set *ppModule to point at the implementation of the ICU tokenizer.
   254  */
   255  void sqlite3Fts3IcuTokenizerModule(
   256    sqlite3_tokenizer_module const**ppModule
   257  ){
   258    *ppModule = &icuTokenizerModule;
   259  }
   260  
   261  #endif /* defined(SQLITE_ENABLE_ICU) */
   262  #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */