modernc.org/cc@v1.0.1/v2/testdata/_sqlite/ext/fts2/fts2_icu.c (about)

     1  /*
     2  ** 2007 June 22
     3  **
     4  ** The author disclaims copyright to this source code.  In place of
     5  ** a legal notice, here is a blessing:
     6  **
     7  **    May you do good and not evil.
     8  **    May you find forgiveness for yourself and forgive others.
     9  **    May you share freely, never taking more than you give.
    10  **
    11  *************************************************************************
    12  ** This file implements a tokenizer for fts2 based on the ICU library.
    13  ** 
    14  ** $Id: fts2_icu.c,v 1.3 2008/12/18 05:30:26 danielk1977 Exp $
    15  */
    16  
    17  #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
    18  #ifdef SQLITE_ENABLE_ICU
    19  
    20  #include <assert.h>
    21  #include <string.h>
    22  #include "fts2_tokenizer.h"
    23  
    24  #include <unicode/ubrk.h>
    25  #include <unicode/ucol.h>
    26  #include <unicode/ustring.h>
    27  #include <unicode/utf16.h>
    28  
/*
** Forward declarations so that both structures can be referred to by
** their bare names (without the "struct" keyword) throughout this file.
*/
typedef struct IcuTokenizer IcuTokenizer;
typedef struct IcuCursor IcuCursor;
    31  
/*
** An ICU tokenizer instance.  Objects of this type are allocated by
** icuCreate() and released by icuDestroy().
*/
struct IcuTokenizer {
  sqlite3_tokenizer base;     /* Base object; first member so that pointers to
                              ** this struct can be cast to and from
                              ** sqlite3_tokenizer* */
  char *zLocale;              /* Locale name copied from argv[0] by icuCreate(),
                              ** or NULL if no argument was supplied.  Passed
                              ** to ubrk_open() by icuOpen(). */
};
    36  
/*
** A cursor used to iterate over the tokens of a single input string.
** Allocated by icuOpen() and released by icuClose().  The aChar[] and
** aOffset[] arrays live in the same allocation as the structure itself.
*/
struct IcuCursor {
  sqlite3_tokenizer_cursor base;  /* Base object; must be the first member */

  UBreakIterator *pIter;      /* ICU break-iterator object */
  int nChar;                  /* Number of UChar elements used in aChar[] */
  UChar *aChar;               /* Case-folded copy of input in utf-16 encoding */
  int *aOffset;               /* aOffset[i] is the byte offset in the utf-8
                              ** input corresponding to aChar[i] */

  int nBuffer;                /* Size in bytes of the zBuffer allocation */
  char *zBuffer;              /* utf-8 text of the most recently returned
                              ** token; grown by icuNext() as required */

  int iToken;                 /* Position value for the next token returned */
};
    50  
    51  /*
    52  ** Create a new tokenizer instance.
    53  */
    54  static int icuCreate(
    55    int argc,                            /* Number of entries in argv[] */
    56    const char * const *argv,            /* Tokenizer creation arguments */
    57    sqlite3_tokenizer **ppTokenizer      /* OUT: Created tokenizer */
    58  ){
    59    IcuTokenizer *p;
    60    int n = 0;
    61  
    62    if( argc>0 ){
    63      n = strlen(argv[0])+1;
    64    }
    65    p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);
    66    if( !p ){
    67      return SQLITE_NOMEM;
    68    }
    69    memset(p, 0, sizeof(IcuTokenizer));
    70  
    71    if( n ){
    72      p->zLocale = (char *)&p[1];
    73      memcpy(p->zLocale, argv[0], n);
    74    }
    75  
    76    *ppTokenizer = (sqlite3_tokenizer *)p;
    77  
    78    return SQLITE_OK;
    79  }
    80  
    81  /*
    82  ** Destroy a tokenizer
    83  */
    84  static int icuDestroy(sqlite3_tokenizer *pTokenizer){
    85    IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
    86    sqlite3_free(p);
    87    return SQLITE_OK;
    88  }
    89  
    90  /*
    91  ** Prepare to begin tokenizing a particular string.  The input
    92  ** string to be tokenized is pInput[0..nBytes-1].  A cursor
    93  ** used to incrementally tokenize this string is returned in 
    94  ** *ppCursor.
    95  */
    96  static int icuOpen(
    97    sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
    98    const char *zInput,                    /* Input string */
    99    int nInput,                            /* Length of zInput in bytes */
   100    sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
   101  ){
   102    IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
   103    IcuCursor *pCsr;
   104  
   105    const int32_t opt = U_FOLD_CASE_DEFAULT;
   106    UErrorCode status = U_ZERO_ERROR;
   107    int nChar;
   108  
   109    UChar32 c;
   110    int iInput = 0;
   111    int iOut = 0;
   112  
   113    *ppCursor = 0;
   114  
   115    if( nInput<0 ){
   116      nInput = strlen(zInput);
   117    }
   118    nChar = nInput+1;
   119    pCsr = (IcuCursor *)sqlite3_malloc(
   120        sizeof(IcuCursor) +                /* IcuCursor */
   121        ((nChar+3)&~3) * sizeof(UChar) +   /* IcuCursor.aChar[] */
   122        (nChar+1) * sizeof(int)            /* IcuCursor.aOffset[] */
   123    );
   124    if( !pCsr ){
   125      return SQLITE_NOMEM;
   126    }
   127    memset(pCsr, 0, sizeof(IcuCursor));
   128    pCsr->aChar = (UChar *)&pCsr[1];
   129    pCsr->aOffset = (int *)&pCsr->aChar[(nChar+3)&~3];
   130  
   131    pCsr->aOffset[iOut] = iInput;
   132    U8_NEXT(zInput, iInput, nInput, c); 
   133    while( c>0 ){
   134      int isError = 0;
   135      c = u_foldCase(c, opt);
   136      U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
   137      if( isError ){
   138        sqlite3_free(pCsr);
   139        return SQLITE_ERROR;
   140      }
   141      pCsr->aOffset[iOut] = iInput;
   142  
   143      if( iInput<nInput ){
   144        U8_NEXT(zInput, iInput, nInput, c);
   145      }else{
   146        c = 0;
   147      }
   148    }
   149  
   150    pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
   151    if( !U_SUCCESS(status) ){
   152      sqlite3_free(pCsr);
   153      return SQLITE_ERROR;
   154    }
   155    pCsr->nChar = iOut;
   156  
   157    ubrk_first(pCsr->pIter);
   158    *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
   159    return SQLITE_OK;
   160  }
   161  
   162  /*
   163  ** Close a tokenization cursor previously opened by a call to icuOpen().
   164  */
   165  static int icuClose(sqlite3_tokenizer_cursor *pCursor){
   166    IcuCursor *pCsr = (IcuCursor *)pCursor;
   167    ubrk_close(pCsr->pIter);
   168    sqlite3_free(pCsr->zBuffer);
   169    sqlite3_free(pCsr);
   170    return SQLITE_OK;
   171  }
   172  
   173  /*
   174  ** Extract the next token from a tokenization cursor.
   175  */
   176  static int icuNext(
   177    sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */
   178    const char **ppToken,               /* OUT: *ppToken is the token text */
   179    int *pnBytes,                       /* OUT: Number of bytes in token */
   180    int *piStartOffset,                 /* OUT: Starting offset of token */
   181    int *piEndOffset,                   /* OUT: Ending offset of token */
   182    int *piPosition                     /* OUT: Position integer of token */
   183  ){
   184    IcuCursor *pCsr = (IcuCursor *)pCursor;
   185  
   186    int iStart = 0;
   187    int iEnd = 0;
   188    int nByte = 0;
   189  
   190    while( iStart==iEnd ){
   191      UChar32 c;
   192  
   193      iStart = ubrk_current(pCsr->pIter);
   194      iEnd = ubrk_next(pCsr->pIter);
   195      if( iEnd==UBRK_DONE ){
   196        return SQLITE_DONE;
   197      }
   198  
   199      while( iStart<iEnd ){
   200        int iWhite = iStart;
   201        U8_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
   202        if( u_isspace(c) ){
   203          iStart = iWhite;
   204        }else{
   205          break;
   206        }
   207      }
   208      assert(iStart<=iEnd);
   209    }
   210  
   211    do {
   212      UErrorCode status = U_ZERO_ERROR;
   213      if( nByte ){
   214        char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
   215        if( !zNew ){
   216          return SQLITE_NOMEM;
   217        }
   218        pCsr->zBuffer = zNew;
   219        pCsr->nBuffer = nByte;
   220      }
   221  
   222      u_strToUTF8(
   223          pCsr->zBuffer, pCsr->nBuffer, &nByte,    /* Output vars */
   224          &pCsr->aChar[iStart], iEnd-iStart,       /* Input vars */
   225          &status                                  /* Output success/failure */
   226      );
   227    } while( nByte>pCsr->nBuffer );
   228  
   229    *ppToken = pCsr->zBuffer;
   230    *pnBytes = nByte;
   231    *piStartOffset = pCsr->aOffset[iStart];
   232    *piEndOffset = pCsr->aOffset[iEnd];
   233    *piPosition = pCsr->iToken++;
   234  
   235    return SQLITE_OK;
   236  }
   237  
/*
** The set of routines that implement the ICU tokenizer
*/
static const sqlite3_tokenizer_module icuTokenizerModule = {
  0,                           /* iVersion */
  icuCreate,                   /* xCreate  */
  icuDestroy,                  /* xDestroy */
  icuOpen,                     /* xOpen    */
  icuClose,                    /* xClose   */
  icuNext,                     /* xNext    */
};
   249  
   250  /*
   251  ** Set *ppModule to point at the implementation of the ICU tokenizer.
   252  */
   253  void sqlite3Fts2IcuTokenizerModule(
   254    sqlite3_tokenizer_module const**ppModule
   255  ){
   256    *ppModule = &icuTokenizerModule;
   257  }
   258  
   259  #endif /* defined(SQLITE_ENABLE_ICU) */
   260  #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */