modernc.org/cc@v1.0.1/v2/testdata/_sqlite/ext/fts3/fts3_icu.c (about) 1 /* 2 ** 2007 June 22 3 ** 4 ** The author disclaims copyright to this source code. In place of 5 ** a legal notice, here is a blessing: 6 ** 7 ** May you do good and not evil. 8 ** May you find forgiveness for yourself and forgive others. 9 ** May you share freely, never taking more than you give. 10 ** 11 ************************************************************************* 12 ** This file implements a tokenizer for fts3 based on the ICU library. 13 */ 14 #include "fts3Int.h" 15 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) 16 #ifdef SQLITE_ENABLE_ICU 17 18 #include <assert.h> 19 #include <string.h> 20 #include "fts3_tokenizer.h" 21 22 #include <unicode/ubrk.h> 23 #include <unicode/ucol.h> 24 #include <unicode/ustring.h> 25 #include <unicode/utf16.h> 26 27 typedef struct IcuTokenizer IcuTokenizer; 28 typedef struct IcuCursor IcuCursor; 29 30 struct IcuTokenizer { 31 sqlite3_tokenizer base; 32 char *zLocale; 33 }; 34 35 struct IcuCursor { 36 sqlite3_tokenizer_cursor base; 37 38 UBreakIterator *pIter; /* ICU break-iterator object */ 39 int nChar; /* Number of UChar elements in pInput */ 40 UChar *aChar; /* Copy of input using utf-16 encoding */ 41 int *aOffset; /* Offsets of each character in utf-8 input */ 42 43 int nBuffer; 44 char *zBuffer; 45 46 int iToken; 47 }; 48 49 /* 50 ** Create a new tokenizer instance. 51 */ 52 static int icuCreate( 53 int argc, /* Number of entries in argv[] */ 54 const char * const *argv, /* Tokenizer creation arguments */ 55 sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */ 56 ){ 57 IcuTokenizer *p; 58 int n = 0; 59 60 if( argc>0 ){ 61 n = strlen(argv[0])+1; 62 } 63 p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n); 64 if( !p ){ 65 return SQLITE_NOMEM; 66 } 67 memset(p, 0, sizeof(IcuTokenizer)); 68 69 if( n ){ 70 p->zLocale = (char *)&p[1]; 71 memcpy(p->zLocale, argv[0], n); 72 } 73 74 *ppTokenizer = (sqlite3_tokenizer *)p; 75 76 return SQLITE_OK; 77 } 78 79 /* 80 ** Destroy a tokenizer 81 */ 82 static int icuDestroy(sqlite3_tokenizer *pTokenizer){ 83 IcuTokenizer *p = (IcuTokenizer *)pTokenizer; 84 sqlite3_free(p); 85 return SQLITE_OK; 86 } 87 88 /* 89 ** Prepare to begin tokenizing a particular string. The input 90 ** string to be tokenized is pInput[0..nBytes-1]. A cursor 91 ** used to incrementally tokenize this string is returned in 92 ** *ppCursor. 93 */ 94 static int icuOpen( 95 sqlite3_tokenizer *pTokenizer, /* The tokenizer */ 96 const char *zInput, /* Input string */ 97 int nInput, /* Length of zInput in bytes */ 98 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ 99 ){ 100 IcuTokenizer *p = (IcuTokenizer *)pTokenizer; 101 IcuCursor *pCsr; 102 103 const int32_t opt = U_FOLD_CASE_DEFAULT; 104 UErrorCode status = U_ZERO_ERROR; 105 int nChar; 106 107 UChar32 c; 108 int iInput = 0; 109 int iOut = 0; 110 111 *ppCursor = 0; 112 113 if( zInput==0 ){ 114 nInput = 0; 115 zInput = ""; 116 }else if( nInput<0 ){ 117 nInput = strlen(zInput); 118 } 119 nChar = nInput+1; 120 pCsr = (IcuCursor *)sqlite3_malloc( 121 sizeof(IcuCursor) + /* IcuCursor */ 122 ((nChar+3)&~3) * sizeof(UChar) + /* IcuCursor.aChar[] */ 123 (nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */ 124 ); 125 if( !pCsr ){ 126 return SQLITE_NOMEM; 127 } 128 memset(pCsr, 0, sizeof(IcuCursor)); 129 pCsr->aChar = (UChar *)&pCsr[1]; 130 pCsr->aOffset = (int *)&pCsr->aChar[(nChar+3)&~3]; 131 132 pCsr->aOffset[iOut] = iInput; 133 U8_NEXT(zInput, iInput, nInput, c); 134 while( c>0 ){ 135 int isError = 0; 136 c = u_foldCase(c, opt); 137 U16_APPEND(pCsr->aChar, iOut, nChar, c, isError); 138 if( isError ){ 139 sqlite3_free(pCsr); 140 return SQLITE_ERROR; 141 } 142 pCsr->aOffset[iOut] = iInput; 143 144 if( iInput<nInput ){ 145 U8_NEXT(zInput, iInput, nInput, c); 146 }else{ 147 c = 0; 148 } 149 } 150 151 pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status); 152 if( !U_SUCCESS(status) ){ 153 sqlite3_free(pCsr); 154 return SQLITE_ERROR; 155 } 156 pCsr->nChar = iOut; 157 158 ubrk_first(pCsr->pIter); 159 *ppCursor = (sqlite3_tokenizer_cursor *)pCsr; 160 return SQLITE_OK; 161 } 162 163 /* 164 ** Close a tokenization cursor previously opened by a call to icuOpen(). 165 */ 166 static int icuClose(sqlite3_tokenizer_cursor *pCursor){ 167 IcuCursor *pCsr = (IcuCursor *)pCursor; 168 ubrk_close(pCsr->pIter); 169 sqlite3_free(pCsr->zBuffer); 170 sqlite3_free(pCsr); 171 return SQLITE_OK; 172 } 173 174 /* 175 ** Extract the next token from a tokenization cursor. 176 */ 177 static int icuNext( 178 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */ 179 const char **ppToken, /* OUT: *ppToken is the token text */ 180 int *pnBytes, /* OUT: Number of bytes in token */ 181 int *piStartOffset, /* OUT: Starting offset of token */ 182 int *piEndOffset, /* OUT: Ending offset of token */ 183 int *piPosition /* OUT: Position integer of token */ 184 ){ 185 IcuCursor *pCsr = (IcuCursor *)pCursor; 186 187 int iStart = 0; 188 int iEnd = 0; 189 int nByte = 0; 190 191 while( iStart==iEnd ){ 192 UChar32 c; 193 194 iStart = ubrk_current(pCsr->pIter); 195 iEnd = ubrk_next(pCsr->pIter); 196 if( iEnd==UBRK_DONE ){ 197 return SQLITE_DONE; 198 } 199 200 while( iStart<iEnd ){ 201 int iWhite = iStart; 202 U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c); 203 if( u_isspace(c) ){ 204 iStart = iWhite; 205 }else{ 206 break; 207 } 208 } 209 assert(iStart<=iEnd); 210 } 211 212 do { 213 UErrorCode status = U_ZERO_ERROR; 214 if( nByte ){ 215 char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte); 216 if( !zNew ){ 217 return SQLITE_NOMEM; 218 } 219 pCsr->zBuffer = zNew; 220 pCsr->nBuffer = nByte; 221 } 222 223 u_strToUTF8( 224 pCsr->zBuffer, pCsr->nBuffer, &nByte, /* Output vars */ 225 &pCsr->aChar[iStart], iEnd-iStart, /* Input vars */ 226 &status /* Output success/failure */ 227 ); 228 } while( nByte>pCsr->nBuffer ); 229 230 *ppToken = pCsr->zBuffer; 231 *pnBytes = nByte; 232 *piStartOffset = pCsr->aOffset[iStart]; 233 *piEndOffset = pCsr->aOffset[iEnd]; 234 *piPosition = pCsr->iToken++; 235 236 return SQLITE_OK; 237 } 238 239 /* 240 ** The set of routines that implement the simple tokenizer 241 */ 242 static const sqlite3_tokenizer_module icuTokenizerModule = { 243 0, /* iVersion */ 244 icuCreate, /* xCreate */ 245 icuDestroy, /* xCreate */ 246 icuOpen, /* xOpen */ 247 icuClose, /* xClose */ 248 icuNext, /* xNext */ 249 0, /* xLanguageid */ 250 }; 251 252 /* 253 ** Set *ppModule to point at the implementation of the ICU tokenizer. 254 */ 255 void sqlite3Fts3IcuTokenizerModule( 256 sqlite3_tokenizer_module const**ppModule 257 ){ 258 *ppModule = &icuTokenizerModule; 259 } 260 261 #endif /* defined(SQLITE_ENABLE_ICU) */ 262 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */