/*
** 2011 Jun 13
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
******************************************************************************
**
** This file is not part of the production FTS code. It is only used for
** testing. It contains a Tcl command that can be used to test if a document
** matches an FTS NEAR expression.
**
** As of March 2012, it also contains a version 1 tokenizer used for testing
** that the sqlite3_tokenizer_module.xLanguage() method is invoked correctly.
*/

#if defined(INCLUDE_SQLITE_TCL_H)
# include "sqlite_tcl.h"
#else
# include "tcl.h"
# ifndef SQLITE_TCLAPI
#  define SQLITE_TCLAPI
# endif
#endif
#include <string.h>
#include <assert.h>

#if defined(SQLITE_TEST)
#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)

/* Required so that the "ifdef SQLITE_ENABLE_FTS3" below works */
#include "fts3Int.h"

/* Maximum number of tokens permitted in a single phrase of a NEAR
** expression passed to [fts3_near_match]. */
#define NM_MAX_TOKEN 12

typedef struct NearPhrase NearPhrase;
typedef struct NearDocument NearDocument;
typedef struct NearToken NearToken;

/* A tokenized document to match NEAR expressions against. */
struct NearDocument {
  int nToken;                     /* Number of entries in aToken[] */
  NearToken *aToken;              /* Token array */
};

/* A single token: a pointer into externally owned text plus a length.
** The text is NOT nul-terminated; always use n. */
struct NearToken {
  int n;                          /* Length of token in bytes */
  const char *z;                  /* Pointer to token string */
};

/* One phrase of a parsed NEAR expression.  nNear is the NEAR distance
** between this phrase and the one before it (unused for phrase 0). */
struct NearPhrase {
  int nNear;                      /* Preceding NEAR value */
  int nToken;                     /* Number of tokens in this phrase */
  NearToken aToken[NM_MAX_TOKEN]; /* Array of tokens in this phrase */
};

/*
** Return 1 if the p->nToken document tokens starting at aToken[0] match
** phrase p, or 0 otherwise.  A phrase token whose last byte is '*' is
** treated as a prefix match; all other tokens must match exactly.
*/
static int nm_phrase_match(
  NearPhrase *p,                  /* Phrase to test */
  NearToken *aToken               /* First candidate document token */
){
  int ii;

  for(ii=0; ii<p->nToken; ii++){
    NearToken *pToken = &p->aToken[ii];
    if( pToken->n>0 && pToken->z[pToken->n-1]=='*' ){
      /* Prefix match: compare all bytes except the trailing '*'. */
      if( aToken[ii].n<(pToken->n-1) ) return 0;
      if( memcmp(aToken[ii].z, pToken->z, pToken->n-1) ) return 0;
    }else{
      /* Exact match: lengths and bytes must both be identical. */
      if( aToken[ii].n!=pToken->n ) return 0;
      if( memcmp(aToken[ii].z, pToken->z, pToken->n) ) return 0;
    }
  }

  return 1;
}

/*
** Phrase aPhrase[iPhrase] has been found at token offset iPos of document
** pDoc.  Recursively verify that every phrase in direction iDir (+1 for
** the phrases following iPhrase, -1 for those preceding it) can be found
** within the applicable NEAR distance of its neighbour.  Return 1 if the
** entire chain matches, or 0 if not.
*/
static int nm_near_chain(
  int iDir,                       /* Direction to iterate through aPhrase[] */
  NearDocument *pDoc,             /* Document to match against */
  int iPos,                       /* Position at which iPhrase was found */
  int nPhrase,                    /* Size of phrase array */
  NearPhrase *aPhrase,            /* Phrase array */
  int iPhrase                     /* Index of phrase found */
){
  int iStart;
  int iStop;
  int ii;
  int nNear;
  int iPhrase2;
  NearPhrase *p;
  NearPhrase *pPrev;

  assert( iDir==1 || iDir==-1 );

  if( iDir==1 ){
    if( (iPhrase+1)==nPhrase ) return 1;   /* No phrases to the right */
    nNear = aPhrase[iPhrase+1].nNear;
  }else{
    if( iPhrase==0 ) return 1;             /* No phrases to the left */
    nNear = aPhrase[iPhrase].nNear;
  }
  pPrev = &aPhrase[iPhrase];
  iPhrase2 = iPhrase+iDir;
  p = &aPhrase[iPhrase2];

  /* Candidate start offsets for phrase iPhrase2 such that it lies within
  ** nNear tokens of the occurrence of iPhrase at iPos. */
  iStart = iPos - nNear - p->nToken;
  iStop = iPos + nNear + pPrev->nToken;

  /* Clamp the range to positions at which a full phrase fits. */
  if( iStart<0 ) iStart = 0;
  if( iStop > pDoc->nToken - p->nToken ) iStop = pDoc->nToken - p->nToken;

  for(ii=iStart; ii<=iStop; ii++){
    if( nm_phrase_match(p, &pDoc->aToken[ii]) ){
      /* Found iPhrase2 here; continue chaining in the same direction. */
      if( nm_near_chain(iDir, pDoc, ii, nPhrase, aPhrase, iPhrase2) ) return 1;
    }
  }

  return 0;
}

/*
** Count the number of occurrences of phrase aPhrase[iPhrase] in document
** pDoc that also satisfy the NEAR constraints with respect to all other
** phrases in aPhrase[] (checked in both directions).
*/
static int nm_match_count(
  NearDocument *pDoc,             /* Document to match against */
  int nPhrase,                    /* Size of phrase array */
  NearPhrase *aPhrase,            /* Phrase array */
  int iPhrase                     /* Index of phrase to count matches for */
){
  int nOcc = 0;
  int ii;
  NearPhrase *p = &aPhrase[iPhrase];

  for(ii=0; ii<(pDoc->nToken + 1 - p->nToken); ii++){
    if( nm_phrase_match(p, &pDoc->aToken[ii]) ){
      /* Test forward NEAR chain (i>iPhrase) */
      if( 0==nm_near_chain(1, pDoc, ii, nPhrase, aPhrase, iPhrase) ) continue;

      /* Test reverse NEAR chain (i<iPhrase) */
      if( 0==nm_near_chain(-1, pDoc, ii, nPhrase, aPhrase, iPhrase) ) continue;

      /* This is a real match. Increment the counter. */
      nOcc++;
    }
  }

  return nOcc;
}

/*
** Tclcmd: fts3_near_match DOCUMENT EXPR ?OPTIONS?
**
** DOCUMENT is a Tcl list of document tokens.  EXPR is a Tcl list of the
** form {PHRASE ?NEAR PHRASE?...}, where each PHRASE is itself a list of
** tokens and each NEAR is an integer distance.  The Tcl result is a
** boolean: true if the document matches the NEAR expression.  The only
** supported option is -phrasecountvar VARNAME, which stores a list of
** per-phrase match counts in the named Tcl variable.
*/
static int SQLITE_TCLAPI fts3_near_match_cmd(
  ClientData clientData,
  Tcl_Interp *interp,
  int objc,
  Tcl_Obj *CONST objv[]
){
  int nTotal = 0;                 /* Sum of all per-phrase match counts */
  int rc;
  int ii;
  int nPhrase;
  NearPhrase *aPhrase = 0;        /* Parsed EXPR (ckalloc'd, freed below) */
  NearDocument doc = {0, 0};      /* Parsed DOCUMENT (aToken ckalloc'd) */
  Tcl_Obj **apDocToken;
  Tcl_Obj *pRet;
  Tcl_Obj *pPhrasecount = 0;      /* Value of -phrasecountvar, if present */

  Tcl_Obj **apExprToken;
  int nExprToken;

  UNUSED_PARAMETER(clientData);

  /* Must have 3 or more arguments. */
  if( objc<3 || (objc%2)==0 ){
    Tcl_WrongNumArgs(interp, 1, objv, "DOCUMENT EXPR ?OPTION VALUE?...");
    rc = TCL_ERROR;
    goto near_match_out;
  }

  /* Parse the trailing OPTION/VALUE pairs.  The early return below is
  ** safe: nothing has been allocated at this point. */
  for(ii=3; ii<objc; ii+=2){
    enum NM_enum { NM_PHRASECOUNTS };
    struct TestnmSubcmd {
      char *zName;
      enum NM_enum eOpt;
    } aOpt[] = {
      { "-phrasecountvar", NM_PHRASECOUNTS },
      { 0, 0 }
    };
    int iOpt;
    if( Tcl_GetIndexFromObjStruct(
        interp, objv[ii], aOpt, sizeof(aOpt[0]), "option", 0, &iOpt)
    ){
      return TCL_ERROR;
    }

    switch( aOpt[iOpt].eOpt ){
      case NM_PHRASECOUNTS:
        pPhrasecount = objv[ii+1];
        break;
    }
  }

  /* Tokenize the DOCUMENT argument.  The NearToken entries point into
  ** the Tcl string reps, which remain valid for the command's lifetime. */
  rc = Tcl_ListObjGetElements(interp, objv[1], &doc.nToken, &apDocToken);
  if( rc!=TCL_OK ) goto near_match_out;
  doc.aToken = (NearToken *)ckalloc(doc.nToken*sizeof(NearToken));
  for(ii=0; ii<doc.nToken; ii++){
    doc.aToken[ii].z = Tcl_GetStringFromObj(apDocToken[ii], &doc.aToken[ii].n);
  }

  /* Parse the EXPR argument: alternating phrase lists and NEAR values. */
  rc = Tcl_ListObjGetElements(interp, objv[2], &nExprToken, &apExprToken);
  if( rc!=TCL_OK ) goto near_match_out;

  nPhrase = (nExprToken + 1) / 2;
  aPhrase = (NearPhrase *)ckalloc(nPhrase * sizeof(NearPhrase));
  memset(aPhrase, 0, nPhrase * sizeof(NearPhrase));
  for(ii=0; ii<nPhrase; ii++){
    Tcl_Obj *pPhrase = apExprToken[ii*2];
    Tcl_Obj **apToken;
    int nToken;
    int jj;

    rc = Tcl_ListObjGetElements(interp, pPhrase, &nToken, &apToken);
    if( rc!=TCL_OK ) goto near_match_out;
    if( nToken>NM_MAX_TOKEN ){
      Tcl_AppendResult(interp, "Too many tokens in phrase", 0);
      rc = TCL_ERROR;
      goto near_match_out;
    }
    for(jj=0; jj<nToken; jj++){
      NearToken *pT = &aPhrase[ii].aToken[jj];
      pT->z = Tcl_GetStringFromObj(apToken[jj], &pT->n);
    }
    aPhrase[ii].nToken = nToken;
  }
  /* The NEAR value at odd index 2*ii-1 precedes phrase ii. */
  for(ii=1; ii<nPhrase; ii++){
    Tcl_Obj *pNear = apExprToken[2*ii-1];
    int nNear;
    rc = Tcl_GetIntFromObj(interp, pNear, &nNear);
    if( rc!=TCL_OK ) goto near_match_out;
    aPhrase[ii].nNear = nNear;
  }

  /* Count matches for each phrase; the document matches iff any phrase
  ** has at least one NEAR-consistent occurrence. */
  pRet = Tcl_NewObj();
  Tcl_IncrRefCount(pRet);
  for(ii=0; ii<nPhrase; ii++){
    int nOcc = nm_match_count(&doc, nPhrase, aPhrase, ii);
    Tcl_ListObjAppendElement(interp, pRet, Tcl_NewIntObj(nOcc));
    nTotal += nOcc;
  }
  if( pPhrasecount ){
    Tcl_ObjSetVar2(interp, pPhrasecount, 0, pRet, 0);
  }
  Tcl_DecrRefCount(pRet);
  Tcl_SetObjResult(interp, Tcl_NewBooleanObj(nTotal>0));

 near_match_out:
  ckfree((char *)aPhrase);
  ckfree((char *)doc.aToken);
  return rc;
}

/*
** Tclcmd: fts3_configure_incr_load ?CHUNKSIZE THRESHOLD?
**
** Normally, FTS uses hard-coded values to determine the minimum doclist
** size eligible for incremental loading, and the size of the chunks loaded
** when a doclist is incrementally loaded. This command allows the built-in
** values to be overridden for testing purposes.
**
** If present, the first argument is the chunksize in bytes to load doclists
** in. The second argument is the minimum doclist size in bytes to use
** incremental loading with.
274 ** 275 ** Whether or not the arguments are present, this command returns a list of 276 ** two integers - the initial chunksize and threshold when the command is 277 ** invoked. This can be used to restore the default behavior after running 278 ** tests. For example: 279 ** 280 ** # Override incr-load settings for testing: 281 ** set cfg [fts3_configure_incr_load $new_chunksize $new_threshold] 282 ** 283 ** .... run tests .... 284 ** 285 ** # Restore initial incr-load settings: 286 ** eval fts3_configure_incr_load $cfg 287 */ 288 static int SQLITE_TCLAPI fts3_configure_incr_load_cmd( 289 ClientData clientData, 290 Tcl_Interp *interp, 291 int objc, 292 Tcl_Obj *CONST objv[] 293 ){ 294 #ifdef SQLITE_ENABLE_FTS3 295 extern int test_fts3_node_chunksize; 296 extern int test_fts3_node_chunk_threshold; 297 Tcl_Obj *pRet; 298 299 if( objc!=1 && objc!=3 ){ 300 Tcl_WrongNumArgs(interp, 1, objv, "?CHUNKSIZE THRESHOLD?"); 301 return TCL_ERROR; 302 } 303 304 pRet = Tcl_NewObj(); 305 Tcl_IncrRefCount(pRet); 306 Tcl_ListObjAppendElement( 307 interp, pRet, Tcl_NewIntObj(test_fts3_node_chunksize)); 308 Tcl_ListObjAppendElement( 309 interp, pRet, Tcl_NewIntObj(test_fts3_node_chunk_threshold)); 310 311 if( objc==3 ){ 312 int iArg1; 313 int iArg2; 314 if( Tcl_GetIntFromObj(interp, objv[1], &iArg1) 315 || Tcl_GetIntFromObj(interp, objv[2], &iArg2) 316 ){ 317 Tcl_DecrRefCount(pRet); 318 return TCL_ERROR; 319 } 320 test_fts3_node_chunksize = iArg1; 321 test_fts3_node_chunk_threshold = iArg2; 322 } 323 324 Tcl_SetObjResult(interp, pRet); 325 Tcl_DecrRefCount(pRet); 326 #endif 327 UNUSED_PARAMETER(clientData); 328 return TCL_OK; 329 } 330 331 #ifdef SQLITE_ENABLE_FTS3 332 /************************************************************************** 333 ** Beginning of test tokenizer code. 334 ** 335 ** For language 0, this tokenizer is similar to the default 'simple' 336 ** tokenizer. For other languages L, the following: 337 ** 338 ** * Odd numbered languages are case-sensitive. 
Even numbered 339 ** languages are not. 340 ** 341 ** * Language ids 100 or greater are considered an error. 342 ** 343 ** The implementation assumes that the input contains only ASCII characters 344 ** (i.e. those that may be encoded in UTF-8 using a single byte). 345 */ 346 typedef struct test_tokenizer { 347 sqlite3_tokenizer base; 348 } test_tokenizer; 349 350 typedef struct test_tokenizer_cursor { 351 sqlite3_tokenizer_cursor base; 352 const char *aInput; /* Input being tokenized */ 353 int nInput; /* Size of the input in bytes */ 354 int iInput; /* Current offset in aInput */ 355 int iToken; /* Index of next token to be returned */ 356 char *aBuffer; /* Buffer containing current token */ 357 int nBuffer; /* Number of bytes allocated at pToken */ 358 int iLangid; /* Configured language id */ 359 } test_tokenizer_cursor; 360 361 static int testTokenizerCreate( 362 int argc, const char * const *argv, 363 sqlite3_tokenizer **ppTokenizer 364 ){ 365 test_tokenizer *pNew; 366 UNUSED_PARAMETER(argc); 367 UNUSED_PARAMETER(argv); 368 369 pNew = sqlite3_malloc(sizeof(test_tokenizer)); 370 if( !pNew ) return SQLITE_NOMEM; 371 memset(pNew, 0, sizeof(test_tokenizer)); 372 373 *ppTokenizer = (sqlite3_tokenizer *)pNew; 374 return SQLITE_OK; 375 } 376 377 static int testTokenizerDestroy(sqlite3_tokenizer *pTokenizer){ 378 test_tokenizer *p = (test_tokenizer *)pTokenizer; 379 sqlite3_free(p); 380 return SQLITE_OK; 381 } 382 383 static int testTokenizerOpen( 384 sqlite3_tokenizer *pTokenizer, /* The tokenizer */ 385 const char *pInput, int nBytes, /* String to be tokenized */ 386 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ 387 ){ 388 int rc = SQLITE_OK; /* Return code */ 389 test_tokenizer_cursor *pCsr; /* New cursor object */ 390 391 UNUSED_PARAMETER(pTokenizer); 392 393 pCsr = (test_tokenizer_cursor *)sqlite3_malloc(sizeof(test_tokenizer_cursor)); 394 if( pCsr==0 ){ 395 rc = SQLITE_NOMEM; 396 }else{ 397 memset(pCsr, 0, sizeof(test_tokenizer_cursor)); 
398 pCsr->aInput = pInput; 399 if( nBytes<0 ){ 400 pCsr->nInput = (int)strlen(pInput); 401 }else{ 402 pCsr->nInput = nBytes; 403 } 404 } 405 406 *ppCursor = (sqlite3_tokenizer_cursor *)pCsr; 407 return rc; 408 } 409 410 static int testTokenizerClose(sqlite3_tokenizer_cursor *pCursor){ 411 test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor; 412 sqlite3_free(pCsr->aBuffer); 413 sqlite3_free(pCsr); 414 return SQLITE_OK; 415 } 416 417 static int testIsTokenChar(char c){ 418 return (c>='a' && c<='z') || (c>='A' && c<='Z'); 419 } 420 static int testTolower(char c){ 421 char ret = c; 422 if( ret>='A' && ret<='Z') ret = ret - ('A'-'a'); 423 return ret; 424 } 425 426 static int testTokenizerNext( 427 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by testTokenizerOpen */ 428 const char **ppToken, /* OUT: *ppToken is the token text */ 429 int *pnBytes, /* OUT: Number of bytes in token */ 430 int *piStartOffset, /* OUT: Starting offset of token */ 431 int *piEndOffset, /* OUT: Ending offset of token */ 432 int *piPosition /* OUT: Position integer of token */ 433 ){ 434 test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor; 435 int rc = SQLITE_OK; 436 const char *p; 437 const char *pEnd; 438 439 p = &pCsr->aInput[pCsr->iInput]; 440 pEnd = &pCsr->aInput[pCsr->nInput]; 441 442 /* Skip past any white-space */ 443 assert( p<=pEnd ); 444 while( p<pEnd && testIsTokenChar(*p)==0 ) p++; 445 446 if( p==pEnd ){ 447 rc = SQLITE_DONE; 448 }else{ 449 /* Advance to the end of the token */ 450 const char *pToken = p; 451 int nToken; 452 while( p<pEnd && testIsTokenChar(*p) ) p++; 453 nToken = (int)(p-pToken); 454 455 /* Copy the token into the buffer */ 456 if( nToken>pCsr->nBuffer ){ 457 sqlite3_free(pCsr->aBuffer); 458 pCsr->aBuffer = sqlite3_malloc(nToken); 459 } 460 if( pCsr->aBuffer==0 ){ 461 rc = SQLITE_NOMEM; 462 }else{ 463 int i; 464 465 if( pCsr->iLangid & 0x00000001 ){ 466 for(i=0; i<nToken; i++) pCsr->aBuffer[i] = pToken[i]; 467 }else{ 468 for(i=0; 
i<nToken; i++) pCsr->aBuffer[i] = (char)testTolower(pToken[i]); 469 } 470 pCsr->iToken++; 471 pCsr->iInput = (int)(p - pCsr->aInput); 472 473 *ppToken = pCsr->aBuffer; 474 *pnBytes = nToken; 475 *piStartOffset = (int)(pToken - pCsr->aInput); 476 *piEndOffset = (int)(p - pCsr->aInput); 477 *piPosition = pCsr->iToken; 478 } 479 } 480 481 return rc; 482 } 483 484 static int testTokenizerLanguage( 485 sqlite3_tokenizer_cursor *pCursor, 486 int iLangid 487 ){ 488 int rc = SQLITE_OK; 489 test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor; 490 pCsr->iLangid = iLangid; 491 if( pCsr->iLangid>=100 ){ 492 rc = SQLITE_ERROR; 493 } 494 return rc; 495 } 496 #endif 497 498 static int SQLITE_TCLAPI fts3_test_tokenizer_cmd( 499 ClientData clientData, 500 Tcl_Interp *interp, 501 int objc, 502 Tcl_Obj *CONST objv[] 503 ){ 504 #ifdef SQLITE_ENABLE_FTS3 505 static const sqlite3_tokenizer_module testTokenizerModule = { 506 1, 507 testTokenizerCreate, 508 testTokenizerDestroy, 509 testTokenizerOpen, 510 testTokenizerClose, 511 testTokenizerNext, 512 testTokenizerLanguage 513 }; 514 const sqlite3_tokenizer_module *pPtr = &testTokenizerModule; 515 if( objc!=1 ){ 516 Tcl_WrongNumArgs(interp, 1, objv, ""); 517 return TCL_ERROR; 518 } 519 Tcl_SetObjResult(interp, Tcl_NewByteArrayObj( 520 (const unsigned char *)&pPtr, sizeof(sqlite3_tokenizer_module *) 521 )); 522 #endif 523 UNUSED_PARAMETER(clientData); 524 return TCL_OK; 525 } 526 527 static int SQLITE_TCLAPI fts3_test_varint_cmd( 528 ClientData clientData, 529 Tcl_Interp *interp, 530 int objc, 531 Tcl_Obj *CONST objv[] 532 ){ 533 #ifdef SQLITE_ENABLE_FTS3 534 char aBuf[24]; 535 int rc; 536 Tcl_WideInt w; 537 sqlite3_int64 w2; 538 int nByte, nByte2; 539 540 if( objc!=2 ){ 541 Tcl_WrongNumArgs(interp, 1, objv, "INTEGER"); 542 return TCL_ERROR; 543 } 544 545 rc = Tcl_GetWideIntFromObj(interp, objv[1], &w); 546 if( rc!=TCL_OK ) return rc; 547 548 nByte = sqlite3Fts3PutVarint(aBuf, w); 549 nByte2 = sqlite3Fts3GetVarint(aBuf, &w2); 
550 if( w!=w2 || nByte!=nByte2 ){ 551 char *zErr = sqlite3_mprintf("error testing %lld", w); 552 Tcl_ResetResult(interp); 553 Tcl_AppendResult(interp, zErr, 0); 554 return TCL_ERROR; 555 } 556 557 if( w<=2147483647 && w>=0 ){ 558 int i; 559 nByte2 = fts3GetVarint32(aBuf, &i); 560 if( (int)w!=i || nByte!=nByte2 ){ 561 char *zErr = sqlite3_mprintf("error testing %lld (32-bit)", w); 562 Tcl_ResetResult(interp); 563 Tcl_AppendResult(interp, zErr, 0); 564 return TCL_ERROR; 565 } 566 } 567 568 #endif 569 UNUSED_PARAMETER(clientData); 570 return TCL_OK; 571 } 572 573 /* 574 ** End of tokenizer code. 575 **************************************************************************/ 576 577 int Sqlitetestfts3_Init(Tcl_Interp *interp){ 578 Tcl_CreateObjCommand(interp, "fts3_near_match", fts3_near_match_cmd, 0, 0); 579 Tcl_CreateObjCommand(interp, 580 "fts3_configure_incr_load", fts3_configure_incr_load_cmd, 0, 0 581 ); 582 Tcl_CreateObjCommand( 583 interp, "fts3_test_tokenizer", fts3_test_tokenizer_cmd, 0, 0 584 ); 585 586 Tcl_CreateObjCommand( 587 interp, "fts3_test_varint", fts3_test_varint_cmd, 0, 0 588 ); 589 return TCL_OK; 590 } 591 #endif /* SQLITE_ENABLE_FTS3 || SQLITE_ENABLE_FTS4 */ 592 #endif /* ifdef SQLITE_TEST */