modernc.org/cc@v1.0.1/v2/testdata/_sqlite/ext/fts3/fts3_snippet.c (about) 1 /* 2 ** 2009 Oct 23 3 ** 4 ** The author disclaims copyright to this source code. In place of 5 ** a legal notice, here is a blessing: 6 ** 7 ** May you do good and not evil. 8 ** May you find forgiveness for yourself and forgive others. 9 ** May you share freely, never taking more than you give. 10 ** 11 ****************************************************************************** 12 */ 13 14 #include "fts3Int.h" 15 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) 16 17 #include <string.h> 18 #include <assert.h> 19 20 /* 21 ** Characters that may appear in the second argument to matchinfo(). 22 */ 23 #define FTS3_MATCHINFO_NPHRASE 'p' /* 1 value */ 24 #define FTS3_MATCHINFO_NCOL 'c' /* 1 value */ 25 #define FTS3_MATCHINFO_NDOC 'n' /* 1 value */ 26 #define FTS3_MATCHINFO_AVGLENGTH 'a' /* nCol values */ 27 #define FTS3_MATCHINFO_LENGTH 'l' /* nCol values */ 28 #define FTS3_MATCHINFO_LCS 's' /* nCol values */ 29 #define FTS3_MATCHINFO_HITS 'x' /* 3*nCol*nPhrase values */ 30 #define FTS3_MATCHINFO_LHITS 'y' /* nCol*nPhrase values */ 31 #define FTS3_MATCHINFO_LHITS_BM 'b' /* nCol*nPhrase values */ 32 33 /* 34 ** The default value for the second argument to matchinfo(). 35 */ 36 #define FTS3_MATCHINFO_DEFAULT "pcx" 37 38 39 /* 40 ** Used as an fts3ExprIterate() context when loading phrase doclists to 41 ** Fts3Expr.aDoclist[]/nDoclist. 42 */ 43 typedef struct LoadDoclistCtx LoadDoclistCtx; 44 struct LoadDoclistCtx { 45 Fts3Cursor *pCsr; /* FTS3 Cursor */ 46 int nPhrase; /* Number of phrases seen so far */ 47 int nToken; /* Number of tokens seen so far */ 48 }; 49 50 /* 51 ** The following types are used as part of the implementation of the 52 ** fts3BestSnippet() routine. 53 */ 54 typedef struct SnippetIter SnippetIter; 55 typedef struct SnippetPhrase SnippetPhrase; 56 typedef struct SnippetFragment SnippetFragment; 57 58 struct SnippetIter { 59 Fts3Cursor *pCsr; /* Cursor snippet is being generated from */ 60 int iCol; /* Extract snippet from this column */ 61 int nSnippet; /* Requested snippet length (in tokens) */ 62 int nPhrase; /* Number of phrases in query */ 63 SnippetPhrase *aPhrase; /* Array of size nPhrase */ 64 int iCurrent; /* First token of current snippet */ 65 }; 66 67 struct SnippetPhrase { 68 int nToken; /* Number of tokens in phrase */ 69 char *pList; /* Pointer to start of phrase position list */ 70 int iHead; /* Next value in position list */ 71 char *pHead; /* Position list data following iHead */ 72 int iTail; /* Next value in trailing position list */ 73 char *pTail; /* Position list data following iTail */ 74 }; 75 76 struct SnippetFragment { 77 int iCol; /* Column snippet is extracted from */ 78 int iPos; /* Index of first token in snippet */ 79 u64 covered; /* Mask of query phrases covered */ 80 u64 hlmask; /* Mask of snippet terms to highlight */ 81 }; 82 83 /* 84 ** This type is used as an fts3ExprIterate() context object while 85 ** accumulating the data returned by the matchinfo() function. 86 */ 87 typedef struct MatchInfo MatchInfo; 88 struct MatchInfo { 89 Fts3Cursor *pCursor; /* FTS3 Cursor */ 90 int nCol; /* Number of columns in table */ 91 int nPhrase; /* Number of matchable phrases in query */ 92 sqlite3_int64 nDoc; /* Number of docs in database */ 93 char flag; 94 u32 *aMatchinfo; /* Pre-allocated buffer */ 95 }; 96 97 /* 98 ** An instance of this structure is used to manage a pair of buffers, each 99 ** (nElem * sizeof(u32)) bytes in size. See the MatchinfoBuffer code below 100 ** for details. 101 */ 102 struct MatchinfoBuffer { 103 u8 aRef[3]; 104 int nElem; 105 int bGlobal; /* Set if global data is loaded */ 106 char *zMatchinfo; 107 u32 aMatchinfo[1]; 108 }; 109 110 111 /* 112 ** The snippet() and offsets() functions both return text values. An instance 113 ** of the following structure is used to accumulate those values while the 114 ** functions are running. See fts3StringAppend() for details. 115 */ 116 typedef struct StrBuffer StrBuffer; 117 struct StrBuffer { 118 char *z; /* Pointer to buffer containing string */ 119 int n; /* Length of z in bytes (excl. nul-term) */ 120 int nAlloc; /* Allocated size of buffer z in bytes */ 121 }; 122 123 124 /************************************************************************* 125 ** Start of MatchinfoBuffer code. 126 */ 127 128 /* 129 ** Allocate a two-slot MatchinfoBuffer object. 130 */ 131 static MatchinfoBuffer *fts3MIBufferNew(int nElem, const char *zMatchinfo){ 132 MatchinfoBuffer *pRet; 133 int nByte = sizeof(u32) * (2*nElem + 1) + sizeof(MatchinfoBuffer); 134 int nStr = (int)strlen(zMatchinfo); 135 136 pRet = sqlite3_malloc(nByte + nStr+1); 137 if( pRet ){ 138 memset(pRet, 0, nByte); 139 pRet->aMatchinfo[0] = (u8*)(&pRet->aMatchinfo[1]) - (u8*)pRet; 140 pRet->aMatchinfo[1+nElem] = pRet->aMatchinfo[0] + sizeof(u32)*(nElem+1); 141 pRet->nElem = nElem; 142 pRet->zMatchinfo = ((char*)pRet) + nByte; 143 memcpy(pRet->zMatchinfo, zMatchinfo, nStr+1); 144 pRet->aRef[0] = 1; 145 } 146 147 return pRet; 148 } 149 150 static void fts3MIBufferFree(void *p){ 151 MatchinfoBuffer *pBuf = (MatchinfoBuffer*)((u8*)p - ((u32*)p)[-1]); 152 153 assert( (u32*)p==&pBuf->aMatchinfo[1] 154 || (u32*)p==&pBuf->aMatchinfo[pBuf->nElem+2] 155 ); 156 if( (u32*)p==&pBuf->aMatchinfo[1] ){ 157 pBuf->aRef[1] = 0; 158 }else{ 159 pBuf->aRef[2] = 0; 160 } 161 162 if( pBuf->aRef[0]==0 && pBuf->aRef[1]==0 && pBuf->aRef[2]==0 ){ 163 sqlite3_free(pBuf); 164 } 165 } 166 167 static void (*fts3MIBufferAlloc(MatchinfoBuffer *p, u32 **paOut))(void*){ 168 void (*xRet)(void*) = 0; 169 u32 *aOut = 0; 170 171 if( p->aRef[1]==0 ){ 172 p->aRef[1] = 1; 173 aOut = &p->aMatchinfo[1]; 174 xRet = fts3MIBufferFree; 175 } 176 else if( p->aRef[2]==0 ){ 177 p->aRef[2] = 1; 178 aOut = &p->aMatchinfo[p->nElem+2]; 179 xRet = fts3MIBufferFree; 180 }else{ 181 aOut = (u32*)sqlite3_malloc(p->nElem * sizeof(u32)); 182 if( aOut ){ 183 xRet = sqlite3_free; 184 if( p->bGlobal ) memcpy(aOut, &p->aMatchinfo[1], p->nElem*sizeof(u32)); 185 } 186 } 187 188 *paOut = aOut; 189 return xRet; 190 } 191 192 static void fts3MIBufferSetGlobal(MatchinfoBuffer *p){ 193 p->bGlobal = 1; 194 memcpy(&p->aMatchinfo[2+p->nElem], &p->aMatchinfo[1], p->nElem*sizeof(u32)); 195 } 196 197 /* 198 ** Free a MatchinfoBuffer object allocated using fts3MIBufferNew() 199 */ 200 void sqlite3Fts3MIBufferFree(MatchinfoBuffer *p){ 201 if( p ){ 202 assert( p->aRef[0]==1 ); 203 p->aRef[0] = 0; 204 if( p->aRef[0]==0 && p->aRef[1]==0 && p->aRef[2]==0 ){ 205 sqlite3_free(p); 206 } 207 } 208 } 209 210 /* 211 ** End of MatchinfoBuffer code. 212 *************************************************************************/ 213 214 215 /* 216 ** This function is used to help iterate through a position-list. A position 217 ** list is a list of unique integers, sorted from smallest to largest. Each 218 ** element of the list is represented by an FTS3 varint that takes the value 219 ** of the difference between the current element and the previous one plus 220 ** two. For example, to store the position-list: 221 ** 222 ** 4 9 113 223 ** 224 ** the three varints: 225 ** 226 ** 6 7 106 227 ** 228 ** are encoded. 229 ** 230 ** When this function is called, *pp points to the start of an element of 231 ** the list. *piPos contains the value of the previous entry in the list. 232 ** After it returns, *piPos contains the value of the next element of the 233 ** list and *pp is advanced to the following varint. 234 */ 235 static void fts3GetDeltaPosition(char **pp, int *piPos){ 236 int iVal; 237 *pp += fts3GetVarint32(*pp, &iVal); 238 *piPos += (iVal-2); 239 } 240 241 /* 242 ** Helper function for fts3ExprIterate() (see below). 243 */ 244 static int fts3ExprIterate2( 245 Fts3Expr *pExpr, /* Expression to iterate phrases of */ 246 int *piPhrase, /* Pointer to phrase counter */ 247 int (*x)(Fts3Expr*,int,void*), /* Callback function to invoke for phrases */ 248 void *pCtx /* Second argument to pass to callback */ 249 ){ 250 int rc; /* Return code */ 251 int eType = pExpr->eType; /* Type of expression node pExpr */ 252 253 if( eType!=FTSQUERY_PHRASE ){ 254 assert( pExpr->pLeft && pExpr->pRight ); 255 rc = fts3ExprIterate2(pExpr->pLeft, piPhrase, x, pCtx); 256 if( rc==SQLITE_OK && eType!=FTSQUERY_NOT ){ 257 rc = fts3ExprIterate2(pExpr->pRight, piPhrase, x, pCtx); 258 } 259 }else{ 260 rc = x(pExpr, *piPhrase, pCtx); 261 (*piPhrase)++; 262 } 263 return rc; 264 } 265 266 /* 267 ** Iterate through all phrase nodes in an FTS3 query, except those that 268 ** are part of a sub-tree that is the right-hand-side of a NOT operator. 269 ** For each phrase node found, the supplied callback function is invoked. 270 ** 271 ** If the callback function returns anything other than SQLITE_OK, 272 ** the iteration is abandoned and the error code returned immediately. 273 ** Otherwise, SQLITE_OK is returned after a callback has been made for 274 ** all eligible phrase nodes. 275 */ 276 static int fts3ExprIterate( 277 Fts3Expr *pExpr, /* Expression to iterate phrases of */ 278 int (*x)(Fts3Expr*,int,void*), /* Callback function to invoke for phrases */ 279 void *pCtx /* Second argument to pass to callback */ 280 ){ 281 int iPhrase = 0; /* Variable used as the phrase counter */ 282 return fts3ExprIterate2(pExpr, &iPhrase, x, pCtx); 283 } 284 285 286 /* 287 ** This is an fts3ExprIterate() callback used while loading the doclists 288 ** for each phrase into Fts3Expr.aDoclist[]/nDoclist. See also 289 ** fts3ExprLoadDoclists(). 290 */ 291 static int fts3ExprLoadDoclistsCb(Fts3Expr *pExpr, int iPhrase, void *ctx){ 292 int rc = SQLITE_OK; 293 Fts3Phrase *pPhrase = pExpr->pPhrase; 294 LoadDoclistCtx *p = (LoadDoclistCtx *)ctx; 295 296 UNUSED_PARAMETER(iPhrase); 297 298 p->nPhrase++; 299 p->nToken += pPhrase->nToken; 300 301 return rc; 302 } 303 304 /* 305 ** Load the doclists for each phrase in the query associated with FTS3 cursor 306 ** pCsr. 307 ** 308 ** If pnPhrase is not NULL, then *pnPhrase is set to the number of matchable 309 ** phrases in the expression (all phrases except those directly or 310 ** indirectly descended from the right-hand-side of a NOT operator). If 311 ** pnToken is not NULL, then it is set to the number of tokens in all 312 ** matchable phrases of the expression. 313 */ 314 static int fts3ExprLoadDoclists( 315 Fts3Cursor *pCsr, /* Fts3 cursor for current query */ 316 int *pnPhrase, /* OUT: Number of phrases in query */ 317 int *pnToken /* OUT: Number of tokens in query */ 318 ){ 319 int rc; /* Return Code */ 320 LoadDoclistCtx sCtx = {0,0,0}; /* Context for fts3ExprIterate() */ 321 sCtx.pCsr = pCsr; 322 rc = fts3ExprIterate(pCsr->pExpr, fts3ExprLoadDoclistsCb, (void *)&sCtx); 323 if( pnPhrase ) *pnPhrase = sCtx.nPhrase; 324 if( pnToken ) *pnToken = sCtx.nToken; 325 return rc; 326 } 327 328 static int fts3ExprPhraseCountCb(Fts3Expr *pExpr, int iPhrase, void *ctx){ 329 (*(int *)ctx)++; 330 pExpr->iPhrase = iPhrase; 331 return SQLITE_OK; 332 } 333 static int fts3ExprPhraseCount(Fts3Expr *pExpr){ 334 int nPhrase = 0; 335 (void)fts3ExprIterate(pExpr, fts3ExprPhraseCountCb, (void *)&nPhrase); 336 return nPhrase; 337 } 338 339 /* 340 ** Advance the position list iterator specified by the first two 341 ** arguments so that it points to the first element with a value greater 342 ** than or equal to parameter iNext. 343 */ 344 static void fts3SnippetAdvance(char **ppIter, int *piIter, int iNext){ 345 char *pIter = *ppIter; 346 if( pIter ){ 347 int iIter = *piIter; 348 349 while( iIter<iNext ){ 350 if( 0==(*pIter & 0xFE) ){ 351 iIter = -1; 352 pIter = 0; 353 break; 354 } 355 fts3GetDeltaPosition(&pIter, &iIter); 356 } 357 358 *piIter = iIter; 359 *ppIter = pIter; 360 } 361 } 362 363 /* 364 ** Advance the snippet iterator to the next candidate snippet. 365 */ 366 static int fts3SnippetNextCandidate(SnippetIter *pIter){ 367 int i; /* Loop counter */ 368 369 if( pIter->iCurrent<0 ){ 370 /* The SnippetIter object has just been initialized. The first snippet 371 ** candidate always starts at offset 0 (even if this candidate has a 372 ** score of 0.0). 373 */ 374 pIter->iCurrent = 0; 375 376 /* Advance the 'head' iterator of each phrase to the first offset that 377 ** is greater than or equal to (iNext+nSnippet). 378 */ 379 for(i=0; i<pIter->nPhrase; i++){ 380 SnippetPhrase *pPhrase = &pIter->aPhrase[i]; 381 fts3SnippetAdvance(&pPhrase->pHead, &pPhrase->iHead, pIter->nSnippet); 382 } 383 }else{ 384 int iStart; 385 int iEnd = 0x7FFFFFFF; 386 387 for(i=0; i<pIter->nPhrase; i++){ 388 SnippetPhrase *pPhrase = &pIter->aPhrase[i]; 389 if( pPhrase->pHead && pPhrase->iHead<iEnd ){ 390 iEnd = pPhrase->iHead; 391 } 392 } 393 if( iEnd==0x7FFFFFFF ){ 394 return 1; 395 } 396 397 pIter->iCurrent = iStart = iEnd - pIter->nSnippet + 1; 398 for(i=0; i<pIter->nPhrase; i++){ 399 SnippetPhrase *pPhrase = &pIter->aPhrase[i]; 400 fts3SnippetAdvance(&pPhrase->pHead, &pPhrase->iHead, iEnd+1); 401 fts3SnippetAdvance(&pPhrase->pTail, &pPhrase->iTail, iStart); 402 } 403 } 404 405 return 0; 406 } 407 408 /* 409 ** Retrieve information about the current candidate snippet of snippet 410 ** iterator pIter. 411 */ 412 static void fts3SnippetDetails( 413 SnippetIter *pIter, /* Snippet iterator */ 414 u64 mCovered, /* Bitmask of phrases already covered */ 415 int *piToken, /* OUT: First token of proposed snippet */ 416 int *piScore, /* OUT: "Score" for this snippet */ 417 u64 *pmCover, /* OUT: Bitmask of phrases covered */ 418 u64 *pmHighlight /* OUT: Bitmask of terms to highlight */ 419 ){ 420 int iStart = pIter->iCurrent; /* First token of snippet */ 421 int iScore = 0; /* Score of this snippet */ 422 int i; /* Loop counter */ 423 u64 mCover = 0; /* Mask of phrases covered by this snippet */ 424 u64 mHighlight = 0; /* Mask of tokens to highlight in snippet */ 425 426 for(i=0; i<pIter->nPhrase; i++){ 427 SnippetPhrase *pPhrase = &pIter->aPhrase[i]; 428 if( pPhrase->pTail ){ 429 char *pCsr = pPhrase->pTail; 430 int iCsr = pPhrase->iTail; 431 432 while( iCsr<(iStart+pIter->nSnippet) ){ 433 int j; 434 u64 mPhrase = (u64)1 << i; 435 u64 mPos = (u64)1 << (iCsr - iStart); 436 assert( iCsr>=iStart ); 437 if( (mCover|mCovered)&mPhrase ){ 438 iScore++; 439 }else{ 440 iScore += 1000; 441 } 442 mCover |= mPhrase; 443 444 for(j=0; j<pPhrase->nToken; j++){ 445 mHighlight |= (mPos>>j); 446 } 447 448 if( 0==(*pCsr & 0x0FE) ) break; 449 fts3GetDeltaPosition(&pCsr, &iCsr); 450 } 451 } 452 } 453 454 /* Set the output variables before returning. */ 455 *piToken = iStart; 456 *piScore = iScore; 457 *pmCover = mCover; 458 *pmHighlight = mHighlight; 459 } 460 461 /* 462 ** This function is an fts3ExprIterate() callback used by fts3BestSnippet(). 463 ** Each invocation populates an element of the SnippetIter.aPhrase[] array. 464 */ 465 static int fts3SnippetFindPositions(Fts3Expr *pExpr, int iPhrase, void *ctx){ 466 SnippetIter *p = (SnippetIter *)ctx; 467 SnippetPhrase *pPhrase = &p->aPhrase[iPhrase]; 468 char *pCsr; 469 int rc; 470 471 pPhrase->nToken = pExpr->pPhrase->nToken; 472 rc = sqlite3Fts3EvalPhrasePoslist(p->pCsr, pExpr, p->iCol, &pCsr); 473 assert( rc==SQLITE_OK || pCsr==0 ); 474 if( pCsr ){ 475 int iFirst = 0; 476 pPhrase->pList = pCsr; 477 fts3GetDeltaPosition(&pCsr, &iFirst); 478 assert( iFirst>=0 ); 479 pPhrase->pHead = pCsr; 480 pPhrase->pTail = pCsr; 481 pPhrase->iHead = iFirst; 482 pPhrase->iTail = iFirst; 483 }else{ 484 assert( rc!=SQLITE_OK || ( 485 pPhrase->pList==0 && pPhrase->pHead==0 && pPhrase->pTail==0 486 )); 487 } 488 489 return rc; 490 } 491 492 /* 493 ** Select the fragment of text consisting of nFragment contiguous tokens 494 ** from column iCol that represent the "best" snippet. The best snippet 495 ** is the snippet with the highest score, where scores are calculated 496 ** by adding: 497 ** 498 ** (a) +1 point for each occurrence of a matchable phrase in the snippet. 499 ** 500 ** (b) +1000 points for the first occurrence of each matchable phrase in 501 ** the snippet for which the corresponding mCovered bit is not set. 502 ** 503 ** The selected snippet parameters are stored in structure *pFragment before 504 ** returning. The score of the selected snippet is stored in *piScore 505 ** before returning. 506 */ 507 static int fts3BestSnippet( 508 int nSnippet, /* Desired snippet length */ 509 Fts3Cursor *pCsr, /* Cursor to create snippet for */ 510 int iCol, /* Index of column to create snippet from */ 511 u64 mCovered, /* Mask of phrases already covered */ 512 u64 *pmSeen, /* IN/OUT: Mask of phrases seen */ 513 SnippetFragment *pFragment, /* OUT: Best snippet found */ 514 int *piScore /* OUT: Score of snippet pFragment */ 515 ){ 516 int rc; /* Return Code */ 517 int nList; /* Number of phrases in expression */ 518 SnippetIter sIter; /* Iterates through snippet candidates */ 519 int nByte; /* Number of bytes of space to allocate */ 520 int iBestScore = -1; /* Best snippet score found so far */ 521 int i; /* Loop counter */ 522 523 memset(&sIter, 0, sizeof(sIter)); 524 525 /* Iterate through the phrases in the expression to count them. The same 526 ** callback makes sure the doclists are loaded for each phrase. 527 */ 528 rc = fts3ExprLoadDoclists(pCsr, &nList, 0); 529 if( rc!=SQLITE_OK ){ 530 return rc; 531 } 532 533 /* Now that it is known how many phrases there are, allocate and zero 534 ** the required space using malloc(). 535 */ 536 nByte = sizeof(SnippetPhrase) * nList; 537 sIter.aPhrase = (SnippetPhrase *)sqlite3_malloc(nByte); 538 if( !sIter.aPhrase ){ 539 return SQLITE_NOMEM; 540 } 541 memset(sIter.aPhrase, 0, nByte); 542 543 /* Initialize the contents of the SnippetIter object. Then iterate through 544 ** the set of phrases in the expression to populate the aPhrase[] array. 545 */ 546 sIter.pCsr = pCsr; 547 sIter.iCol = iCol; 548 sIter.nSnippet = nSnippet; 549 sIter.nPhrase = nList; 550 sIter.iCurrent = -1; 551 rc = fts3ExprIterate(pCsr->pExpr, fts3SnippetFindPositions, (void*)&sIter); 552 if( rc==SQLITE_OK ){ 553 554 /* Set the *pmSeen output variable. */ 555 for(i=0; i<nList; i++){ 556 if( sIter.aPhrase[i].pHead ){ 557 *pmSeen |= (u64)1 << i; 558 } 559 } 560 561 /* Loop through all candidate snippets. Store the best snippet in 562 ** *pFragment. Store its associated 'score' in iBestScore. 563 */ 564 pFragment->iCol = iCol; 565 while( !fts3SnippetNextCandidate(&sIter) ){ 566 int iPos; 567 int iScore; 568 u64 mCover; 569 u64 mHighlite; 570 fts3SnippetDetails(&sIter, mCovered, &iPos, &iScore, &mCover,&mHighlite); 571 assert( iScore>=0 ); 572 if( iScore>iBestScore ){ 573 pFragment->iPos = iPos; 574 pFragment->hlmask = mHighlite; 575 pFragment->covered = mCover; 576 iBestScore = iScore; 577 } 578 } 579 580 *piScore = iBestScore; 581 } 582 sqlite3_free(sIter.aPhrase); 583 return rc; 584 } 585 586 587 /* 588 ** Append a string to the string-buffer passed as the first argument. 589 ** 590 ** If nAppend is negative, then the length of the string zAppend is 591 ** determined using strlen(). 592 */ 593 static int fts3StringAppend( 594 StrBuffer *pStr, /* Buffer to append to */ 595 const char *zAppend, /* Pointer to data to append to buffer */ 596 int nAppend /* Size of zAppend in bytes (or -1) */ 597 ){ 598 if( nAppend<0 ){ 599 nAppend = (int)strlen(zAppend); 600 } 601 602 /* If there is insufficient space allocated at StrBuffer.z, use realloc() 603 ** to grow the buffer until so that it is big enough to accomadate the 604 ** appended data. 605 */ 606 if( pStr->n+nAppend+1>=pStr->nAlloc ){ 607 int nAlloc = pStr->nAlloc+nAppend+100; 608 char *zNew = sqlite3_realloc(pStr->z, nAlloc); 609 if( !zNew ){ 610 return SQLITE_NOMEM; 611 } 612 pStr->z = zNew; 613 pStr->nAlloc = nAlloc; 614 } 615 assert( pStr->z!=0 && (pStr->nAlloc >= pStr->n+nAppend+1) ); 616 617 /* Append the data to the string buffer. */ 618 memcpy(&pStr->z[pStr->n], zAppend, nAppend); 619 pStr->n += nAppend; 620 pStr->z[pStr->n] = '\0'; 621 622 return SQLITE_OK; 623 } 624 625 /* 626 ** The fts3BestSnippet() function often selects snippets that end with a 627 ** query term. That is, the final term of the snippet is always a term 628 ** that requires highlighting. For example, if 'X' is a highlighted term 629 ** and '.' is a non-highlighted term, BestSnippet() may select: 630 ** 631 ** ........X.....X 632 ** 633 ** This function "shifts" the beginning of the snippet forward in the 634 ** document so that there are approximately the same number of 635 ** non-highlighted terms to the right of the final highlighted term as there 636 ** are to the left of the first highlighted term. For example, to this: 637 ** 638 ** ....X.....X.... 639 ** 640 ** This is done as part of extracting the snippet text, not when selecting 641 ** the snippet. Snippet selection is done based on doclists only, so there 642 ** is no way for fts3BestSnippet() to know whether or not the document 643 ** actually contains terms that follow the final highlighted term. 644 */ 645 static int fts3SnippetShift( 646 Fts3Table *pTab, /* FTS3 table snippet comes from */ 647 int iLangid, /* Language id to use in tokenizing */ 648 int nSnippet, /* Number of tokens desired for snippet */ 649 const char *zDoc, /* Document text to extract snippet from */ 650 int nDoc, /* Size of buffer zDoc in bytes */ 651 int *piPos, /* IN/OUT: First token of snippet */ 652 u64 *pHlmask /* IN/OUT: Mask of tokens to highlight */ 653 ){ 654 u64 hlmask = *pHlmask; /* Local copy of initial highlight-mask */ 655 656 if( hlmask ){ 657 int nLeft; /* Tokens to the left of first highlight */ 658 int nRight; /* Tokens to the right of last highlight */ 659 int nDesired; /* Ideal number of tokens to shift forward */ 660 661 for(nLeft=0; !(hlmask & ((u64)1 << nLeft)); nLeft++); 662 for(nRight=0; !(hlmask & ((u64)1 << (nSnippet-1-nRight))); nRight++); 663 nDesired = (nLeft-nRight)/2; 664 665 /* Ideally, the start of the snippet should be pushed forward in the 666 ** document nDesired tokens. This block checks if there are actually 667 ** nDesired tokens to the right of the snippet. If so, *piPos and 668 ** *pHlMask are updated to shift the snippet nDesired tokens to the 669 ** right. Otherwise, the snippet is shifted by the number of tokens 670 ** available. 671 */ 672 if( nDesired>0 ){ 673 int nShift; /* Number of tokens to shift snippet by */ 674 int iCurrent = 0; /* Token counter */ 675 int rc; /* Return Code */ 676 sqlite3_tokenizer_module *pMod; 677 sqlite3_tokenizer_cursor *pC; 678 pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule; 679 680 /* Open a cursor on zDoc/nDoc. Check if there are (nSnippet+nDesired) 681 ** or more tokens in zDoc/nDoc. 682 */ 683 rc = sqlite3Fts3OpenTokenizer(pTab->pTokenizer, iLangid, zDoc, nDoc, &pC); 684 if( rc!=SQLITE_OK ){ 685 return rc; 686 } 687 while( rc==SQLITE_OK && iCurrent<(nSnippet+nDesired) ){ 688 const char *ZDUMMY; int DUMMY1 = 0, DUMMY2 = 0, DUMMY3 = 0; 689 rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &DUMMY2, &DUMMY3, &iCurrent); 690 } 691 pMod->xClose(pC); 692 if( rc!=SQLITE_OK && rc!=SQLITE_DONE ){ return rc; } 693 694 nShift = (rc==SQLITE_DONE)+iCurrent-nSnippet; 695 assert( nShift<=nDesired ); 696 if( nShift>0 ){ 697 *piPos += nShift; 698 *pHlmask = hlmask >> nShift; 699 } 700 } 701 } 702 return SQLITE_OK; 703 } 704 705 /* 706 ** Extract the snippet text for fragment pFragment from cursor pCsr and 707 ** append it to string buffer pOut. 708 */ 709 static int fts3SnippetText( 710 Fts3Cursor *pCsr, /* FTS3 Cursor */ 711 SnippetFragment *pFragment, /* Snippet to extract */ 712 int iFragment, /* Fragment number */ 713 int isLast, /* True for final fragment in snippet */ 714 int nSnippet, /* Number of tokens in extracted snippet */ 715 const char *zOpen, /* String inserted before highlighted term */ 716 const char *zClose, /* String inserted after highlighted term */ 717 const char *zEllipsis, /* String inserted between snippets */ 718 StrBuffer *pOut /* Write output here */ 719 ){ 720 Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab; 721 int rc; /* Return code */ 722 const char *zDoc; /* Document text to extract snippet from */ 723 int nDoc; /* Size of zDoc in bytes */ 724 int iCurrent = 0; /* Current token number of document */ 725 int iEnd = 0; /* Byte offset of end of current token */ 726 int isShiftDone = 0; /* True after snippet is shifted */ 727 int iPos = pFragment->iPos; /* First token of snippet */ 728 u64 hlmask = pFragment->hlmask; /* Highlight-mask for snippet */ 729 int iCol = pFragment->iCol+1; /* Query column to extract text from */ 730 sqlite3_tokenizer_module *pMod; /* Tokenizer module methods object */ 731 sqlite3_tokenizer_cursor *pC; /* Tokenizer cursor open on zDoc/nDoc */ 732 733 zDoc = (const char *)sqlite3_column_text(pCsr->pStmt, iCol); 734 if( zDoc==0 ){ 735 if( sqlite3_column_type(pCsr->pStmt, iCol)!=SQLITE_NULL ){ 736 return SQLITE_NOMEM; 737 } 738 return SQLITE_OK; 739 } 740 nDoc = sqlite3_column_bytes(pCsr->pStmt, iCol); 741 742 /* Open a token cursor on the document. */ 743 pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule; 744 rc = sqlite3Fts3OpenTokenizer(pTab->pTokenizer, pCsr->iLangid, zDoc,nDoc,&pC); 745 if( rc!=SQLITE_OK ){ 746 return rc; 747 } 748 749 while( rc==SQLITE_OK ){ 750 const char *ZDUMMY; /* Dummy argument used with tokenizer */ 751 int DUMMY1 = -1; /* Dummy argument used with tokenizer */ 752 int iBegin = 0; /* Offset in zDoc of start of token */ 753 int iFin = 0; /* Offset in zDoc of end of token */ 754 int isHighlight = 0; /* True for highlighted terms */ 755 756 /* Variable DUMMY1 is initialized to a negative value above. Elsewhere 757 ** in the FTS code the variable that the third argument to xNext points to 758 ** is initialized to zero before the first (*but not necessarily 759 ** subsequent*) call to xNext(). This is done for a particular application 760 ** that needs to know whether or not the tokenizer is being used for 761 ** snippet generation or for some other purpose. 762 ** 763 ** Extreme care is required when writing code to depend on this 764 ** initialization. It is not a documented part of the tokenizer interface. 765 ** If a tokenizer is used directly by any code outside of FTS, this 766 ** convention might not be respected. */ 767 rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &iBegin, &iFin, &iCurrent); 768 if( rc!=SQLITE_OK ){ 769 if( rc==SQLITE_DONE ){ 770 /* Special case - the last token of the snippet is also the last token 771 ** of the column. Append any punctuation that occurred between the end 772 ** of the previous token and the end of the document to the output. 773 ** Then break out of the loop. */ 774 rc = fts3StringAppend(pOut, &zDoc[iEnd], -1); 775 } 776 break; 777 } 778 if( iCurrent<iPos ){ continue; } 779 780 if( !isShiftDone ){ 781 int n = nDoc - iBegin; 782 rc = fts3SnippetShift( 783 pTab, pCsr->iLangid, nSnippet, &zDoc[iBegin], n, &iPos, &hlmask 784 ); 785 isShiftDone = 1; 786 787 /* Now that the shift has been done, check if the initial "..." are 788 ** required. They are required if (a) this is not the first fragment, 789 ** or (b) this fragment does not begin at position 0 of its column. 790 */ 791 if( rc==SQLITE_OK ){ 792 if( iPos>0 || iFragment>0 ){ 793 rc = fts3StringAppend(pOut, zEllipsis, -1); 794 }else if( iBegin ){ 795 rc = fts3StringAppend(pOut, zDoc, iBegin); 796 } 797 } 798 if( rc!=SQLITE_OK || iCurrent<iPos ) continue; 799 } 800 801 if( iCurrent>=(iPos+nSnippet) ){ 802 if( isLast ){ 803 rc = fts3StringAppend(pOut, zEllipsis, -1); 804 } 805 break; 806 } 807 808 /* Set isHighlight to true if this term should be highlighted. */ 809 isHighlight = (hlmask & ((u64)1 << (iCurrent-iPos)))!=0; 810 811 if( iCurrent>iPos ) rc = fts3StringAppend(pOut, &zDoc[iEnd], iBegin-iEnd); 812 if( rc==SQLITE_OK && isHighlight ) rc = fts3StringAppend(pOut, zOpen, -1); 813 if( rc==SQLITE_OK ) rc = fts3StringAppend(pOut, &zDoc[iBegin], iFin-iBegin); 814 if( rc==SQLITE_OK && isHighlight ) rc = fts3StringAppend(pOut, zClose, -1); 815 816 iEnd = iFin; 817 } 818 819 pMod->xClose(pC); 820 return rc; 821 } 822 823 824 /* 825 ** This function is used to count the entries in a column-list (a 826 ** delta-encoded list of term offsets within a single column of a single 827 ** row). When this function is called, *ppCollist should point to the 828 ** beginning of the first varint in the column-list (the varint that 829 ** contains the position of the first matching term in the column data). 830 ** Before returning, *ppCollist is set to point to the first byte after 831 ** the last varint in the column-list (either the 0x00 signifying the end 832 ** of the position-list, or the 0x01 that precedes the column number of 833 ** the next column in the position-list). 834 ** 835 ** The number of elements in the column-list is returned. 836 */ 837 static int fts3ColumnlistCount(char **ppCollist){ 838 char *pEnd = *ppCollist; 839 char c = 0; 840 int nEntry = 0; 841 842 /* A column-list is terminated by either a 0x01 or 0x00. */ 843 while( 0xFE & (*pEnd | c) ){ 844 c = *pEnd++ & 0x80; 845 if( !c ) nEntry++; 846 } 847 848 *ppCollist = pEnd; 849 return nEntry; 850 } 851 852 /* 853 ** This function gathers 'y' or 'b' data for a single phrase. 854 */ 855 static void fts3ExprLHits( 856 Fts3Expr *pExpr, /* Phrase expression node */ 857 MatchInfo *p /* Matchinfo context */ 858 ){ 859 Fts3Table *pTab = (Fts3Table *)p->pCursor->base.pVtab; 860 int iStart; 861 Fts3Phrase *pPhrase = pExpr->pPhrase; 862 char *pIter = pPhrase->doclist.pList; 863 int iCol = 0; 864 865 assert( p->flag==FTS3_MATCHINFO_LHITS_BM || p->flag==FTS3_MATCHINFO_LHITS ); 866 if( p->flag==FTS3_MATCHINFO_LHITS ){ 867 iStart = pExpr->iPhrase * p->nCol; 868 }else{ 869 iStart = pExpr->iPhrase * ((p->nCol + 31) / 32); 870 } 871 872 while( 1 ){ 873 int nHit = fts3ColumnlistCount(&pIter); 874 if( (pPhrase->iColumn>=pTab->nColumn || pPhrase->iColumn==iCol) ){ 875 if( p->flag==FTS3_MATCHINFO_LHITS ){ 876 p->aMatchinfo[iStart + iCol] = (u32)nHit; 877 }else if( nHit ){ 878 p->aMatchinfo[iStart + (iCol+1)/32] |= (1 << (iCol&0x1F)); 879 } 880 } 881 assert( *pIter==0x00 || *pIter==0x01 ); 882 if( *pIter!=0x01 ) break; 883 pIter++; 884 pIter += fts3GetVarint32(pIter, &iCol); 885 } 886 } 887 888 /* 889 ** Gather the results for matchinfo directives 'y' and 'b'. 890 */ 891 static void fts3ExprLHitGather( 892 Fts3Expr *pExpr, 893 MatchInfo *p 894 ){ 895 assert( (pExpr->pLeft==0)==(pExpr->pRight==0) ); 896 if( pExpr->bEof==0 && pExpr->iDocid==p->pCursor->iPrevId ){ 897 if( pExpr->pLeft ){ 898 fts3ExprLHitGather(pExpr->pLeft, p); 899 fts3ExprLHitGather(pExpr->pRight, p); 900 }else{ 901 fts3ExprLHits(pExpr, p); 902 } 903 } 904 } 905 906 /* 907 ** fts3ExprIterate() callback used to collect the "global" matchinfo stats 908 ** for a single query. 909 ** 910 ** fts3ExprIterate() callback to load the 'global' elements of a 911 ** FTS3_MATCHINFO_HITS matchinfo array. The global stats are those elements 912 ** of the matchinfo array that are constant for all rows returned by the 913 ** current query. 914 ** 915 ** Argument pCtx is actually a pointer to a struct of type MatchInfo. This 916 ** function populates Matchinfo.aMatchinfo[] as follows: 917 ** 918 ** for(iCol=0; iCol<nCol; iCol++){ 919 ** aMatchinfo[3*iPhrase*nCol + 3*iCol + 1] = X; 920 ** aMatchinfo[3*iPhrase*nCol + 3*iCol + 2] = Y; 921 ** } 922 ** 923 ** where X is the number of matches for phrase iPhrase is column iCol of all 924 ** rows of the table. Y is the number of rows for which column iCol contains 925 ** at least one instance of phrase iPhrase. 926 ** 927 ** If the phrase pExpr consists entirely of deferred tokens, then all X and 928 ** Y values are set to nDoc, where nDoc is the number of documents in the 929 ** file system. This is done because the full-text index doclist is required 930 ** to calculate these values properly, and the full-text index doclist is 931 ** not available for deferred tokens. 932 */ 933 static int fts3ExprGlobalHitsCb( 934 Fts3Expr *pExpr, /* Phrase expression node */ 935 int iPhrase, /* Phrase number (numbered from zero) */ 936 void *pCtx /* Pointer to MatchInfo structure */ 937 ){ 938 MatchInfo *p = (MatchInfo *)pCtx; 939 return sqlite3Fts3EvalPhraseStats( 940 p->pCursor, pExpr, &p->aMatchinfo[3*iPhrase*p->nCol] 941 ); 942 } 943 944 /* 945 ** fts3ExprIterate() callback used to collect the "local" part of the 946 ** FTS3_MATCHINFO_HITS array. The local stats are those elements of the 947 ** array that are different for each row returned by the query. 948 */ 949 static int fts3ExprLocalHitsCb( 950 Fts3Expr *pExpr, /* Phrase expression node */ 951 int iPhrase, /* Phrase number */ 952 void *pCtx /* Pointer to MatchInfo structure */ 953 ){ 954 int rc = SQLITE_OK; 955 MatchInfo *p = (MatchInfo *)pCtx; 956 int iStart = iPhrase * p->nCol * 3; 957 int i; 958 959 for(i=0; i<p->nCol && rc==SQLITE_OK; i++){ 960 char *pCsr; 961 rc = sqlite3Fts3EvalPhrasePoslist(p->pCursor, pExpr, i, &pCsr); 962 if( pCsr ){ 963 p->aMatchinfo[iStart+i*3] = fts3ColumnlistCount(&pCsr); 964 }else{ 965 p->aMatchinfo[iStart+i*3] = 0; 966 } 967 } 968 969 return rc; 970 } 971 972 static int fts3MatchinfoCheck( 973 Fts3Table *pTab, 974 char cArg, 975 char **pzErr 976 ){ 977 if( (cArg==FTS3_MATCHINFO_NPHRASE) 978 || (cArg==FTS3_MATCHINFO_NCOL) 979 || (cArg==FTS3_MATCHINFO_NDOC && pTab->bFts4) 980 || (cArg==FTS3_MATCHINFO_AVGLENGTH && pTab->bFts4) 981 || (cArg==FTS3_MATCHINFO_LENGTH && pTab->bHasDocsize) 982 || (cArg==FTS3_MATCHINFO_LCS) 983 || (cArg==FTS3_MATCHINFO_HITS) 984 || (cArg==FTS3_MATCHINFO_LHITS) 985 || (cArg==FTS3_MATCHINFO_LHITS_BM) 986 ){ 987 return SQLITE_OK; 988 } 989 sqlite3Fts3ErrMsg(pzErr, "unrecognized matchinfo request: %c", cArg); 990 return SQLITE_ERROR; 991 } 992 993 static int fts3MatchinfoSize(MatchInfo *pInfo, char cArg){ 994 int nVal; /* Number of integers output by cArg */ 995 996 switch( cArg ){ 997 case FTS3_MATCHINFO_NDOC: 998 case FTS3_MATCHINFO_NPHRASE: 999 case FTS3_MATCHINFO_NCOL: 1000 nVal = 1; 1001 break; 1002 1003 case FTS3_MATCHINFO_AVGLENGTH: 1004 case FTS3_MATCHINFO_LENGTH: 1005 case FTS3_MATCHINFO_LCS: 1006 nVal = pInfo->nCol; 1007 break; 1008 1009 case FTS3_MATCHINFO_LHITS: 1010 nVal = pInfo->nCol * pInfo->nPhrase; 1011 break; 1012 1013 case FTS3_MATCHINFO_LHITS_BM: 1014 nVal = pInfo->nPhrase * ((pInfo->nCol + 31) / 32); 1015 break; 1016 1017 default: 1018 assert( cArg==FTS3_MATCHINFO_HITS ); 1019 nVal = pInfo->nCol * pInfo->nPhrase * 3; 1020 break; 1021 } 1022 1023 return nVal; 1024 } 1025 1026 static int fts3MatchinfoSelectDoctotal( 1027 Fts3Table *pTab, 1028 sqlite3_stmt **ppStmt, 1029 sqlite3_int64 *pnDoc, 1030 const char **paLen 1031 ){ 1032 sqlite3_stmt *pStmt; 1033 const char *a; 1034 sqlite3_int64 nDoc; 1035 1036 if( !*ppStmt ){ 1037 int rc = sqlite3Fts3SelectDoctotal(pTab, ppStmt); 1038 if( rc!=SQLITE_OK ) return rc; 1039 } 1040 pStmt = *ppStmt; 1041 assert( sqlite3_data_count(pStmt)==1 ); 1042 1043 a = sqlite3_column_blob(pStmt, 0); 1044 a += sqlite3Fts3GetVarint(a, &nDoc); 1045 if( nDoc==0 ) return FTS_CORRUPT_VTAB; 1046 *pnDoc = (u32)nDoc; 1047 1048 if( paLen ) *paLen = a; 1049 return SQLITE_OK; 1050 } 1051 1052 /* 1053 ** An instance of the following structure is used to store state while 1054 ** iterating through a multi-column position-list corresponding to the 1055 ** hits for a single phrase on a single row in order to calculate the 1056 ** values for a matchinfo() FTS3_MATCHINFO_LCS request. 1057 */ 1058 typedef struct LcsIterator LcsIterator; 1059 struct LcsIterator { 1060 Fts3Expr *pExpr; /* Pointer to phrase expression */ 1061 int iPosOffset; /* Tokens count up to end of this phrase */ 1062 char *pRead; /* Cursor used to iterate through aDoclist */ 1063 int iPos; /* Current position */ 1064 }; 1065 1066 /* 1067 ** If LcsIterator.iCol is set to the following value, the iterator has 1068 ** finished iterating through all offsets for all columns. 1069 */ 1070 #define LCS_ITERATOR_FINISHED 0x7FFFFFFF; 1071 1072 static int fts3MatchinfoLcsCb( 1073 Fts3Expr *pExpr, /* Phrase expression node */ 1074 int iPhrase, /* Phrase number (numbered from zero) */ 1075 void *pCtx /* Pointer to MatchInfo structure */ 1076 ){ 1077 LcsIterator *aIter = (LcsIterator *)pCtx; 1078 aIter[iPhrase].pExpr = pExpr; 1079 return SQLITE_OK; 1080 } 1081 1082 /* 1083 ** Advance the iterator passed as an argument to the next position. Return 1084 ** 1 if the iterator is at EOF or if it now points to the start of the 1085 ** position list for the next column. 1086 */ 1087 static int fts3LcsIteratorAdvance(LcsIterator *pIter){ 1088 char *pRead = pIter->pRead; 1089 sqlite3_int64 iRead; 1090 int rc = 0; 1091 1092 pRead += sqlite3Fts3GetVarint(pRead, &iRead); 1093 if( iRead==0 || iRead==1 ){ 1094 pRead = 0; 1095 rc = 1; 1096 }else{ 1097 pIter->iPos += (int)(iRead-2); 1098 } 1099 1100 pIter->pRead = pRead; 1101 return rc; 1102 } 1103 1104 /* 1105 ** This function implements the FTS3_MATCHINFO_LCS matchinfo() flag. 1106 ** 1107 ** If the call is successful, the longest-common-substring lengths for each 1108 ** column are written into the first nCol elements of the pInfo->aMatchinfo[] 1109 ** array before returning. SQLITE_OK is returned in this case. 1110 ** 1111 ** Otherwise, if an error occurs, an SQLite error code is returned and the 1112 ** data written to the first nCol elements of pInfo->aMatchinfo[] is 1113 ** undefined. 1114 */ 1115 static int fts3MatchinfoLcs(Fts3Cursor *pCsr, MatchInfo *pInfo){ 1116 LcsIterator *aIter; 1117 int i; 1118 int iCol; 1119 int nToken = 0; 1120 1121 /* Allocate and populate the array of LcsIterator objects. The array 1122 ** contains one element for each matchable phrase in the query. 1123 **/ 1124 aIter = sqlite3_malloc(sizeof(LcsIterator) * pCsr->nPhrase); 1125 if( !aIter ) return SQLITE_NOMEM; 1126 memset(aIter, 0, sizeof(LcsIterator) * pCsr->nPhrase); 1127 (void)fts3ExprIterate(pCsr->pExpr, fts3MatchinfoLcsCb, (void*)aIter); 1128 1129 for(i=0; i<pInfo->nPhrase; i++){ 1130 LcsIterator *pIter = &aIter[i]; 1131 nToken -= pIter->pExpr->pPhrase->nToken; 1132 pIter->iPosOffset = nToken; 1133 } 1134 1135 for(iCol=0; iCol<pInfo->nCol; iCol++){ 1136 int nLcs = 0; /* LCS value for this column */ 1137 int nLive = 0; /* Number of iterators in aIter not at EOF */ 1138 1139 for(i=0; i<pInfo->nPhrase; i++){ 1140 int rc; 1141 LcsIterator *pIt = &aIter[i]; 1142 rc = sqlite3Fts3EvalPhrasePoslist(pCsr, pIt->pExpr, iCol, &pIt->pRead); 1143 if( rc!=SQLITE_OK ) return rc; 1144 if( pIt->pRead ){ 1145 pIt->iPos = pIt->iPosOffset; 1146 fts3LcsIteratorAdvance(&aIter[i]); 1147 nLive++; 1148 } 1149 } 1150 1151 while( nLive>0 ){ 1152 LcsIterator *pAdv = 0; /* The iterator to advance by one position */ 1153 int nThisLcs = 0; /* LCS for the current iterator positions */ 1154 1155 for(i=0; i<pInfo->nPhrase; i++){ 1156 LcsIterator *pIter = &aIter[i]; 1157 if( pIter->pRead==0 ){ 1158 /* This iterator is already at EOF for this column. */ 1159 nThisLcs = 0; 1160 }else{ 1161 if( pAdv==0 || pIter->iPos<pAdv->iPos ){ 1162 pAdv = pIter; 1163 } 1164 if( nThisLcs==0 || pIter->iPos==pIter[-1].iPos ){ 1165 nThisLcs++; 1166 }else{ 1167 nThisLcs = 1; 1168 } 1169 if( nThisLcs>nLcs ) nLcs = nThisLcs; 1170 } 1171 } 1172 if( fts3LcsIteratorAdvance(pAdv) ) nLive--; 1173 } 1174 1175 pInfo->aMatchinfo[iCol] = nLcs; 1176 } 1177 1178 sqlite3_free(aIter); 1179 return SQLITE_OK; 1180 } 1181 1182 /* 1183 ** Populate the buffer pInfo->aMatchinfo[] with an array of integers to 1184 ** be returned by the matchinfo() function. Argument zArg contains the 1185 ** format string passed as the second argument to matchinfo (or the 1186 ** default value "pcx" if no second argument was specified). The format 1187 ** string has already been validated and the pInfo->aMatchinfo[] array 1188 ** is guaranteed to be large enough for the output. 1189 ** 1190 ** If bGlobal is true, then populate all fields of the matchinfo() output. 1191 ** If it is false, then assume that those fields that do not change between 1192 ** rows (i.e. FTS3_MATCHINFO_NPHRASE, NCOL, NDOC, AVGLENGTH and part of HITS) 1193 ** have already been populated. 1194 ** 1195 ** Return SQLITE_OK if successful, or an SQLite error code if an error 1196 ** occurs. If a value other than SQLITE_OK is returned, the state the 1197 ** pInfo->aMatchinfo[] buffer is left in is undefined. 1198 */ 1199 static int fts3MatchinfoValues( 1200 Fts3Cursor *pCsr, /* FTS3 cursor object */ 1201 int bGlobal, /* True to grab the global stats */ 1202 MatchInfo *pInfo, /* Matchinfo context object */ 1203 const char *zArg /* Matchinfo format string */ 1204 ){ 1205 int rc = SQLITE_OK; 1206 int i; 1207 Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab; 1208 sqlite3_stmt *pSelect = 0; 1209 1210 for(i=0; rc==SQLITE_OK && zArg[i]; i++){ 1211 pInfo->flag = zArg[i]; 1212 switch( zArg[i] ){ 1213 case FTS3_MATCHINFO_NPHRASE: 1214 if( bGlobal ) pInfo->aMatchinfo[0] = pInfo->nPhrase; 1215 break; 1216 1217 case FTS3_MATCHINFO_NCOL: 1218 if( bGlobal ) pInfo->aMatchinfo[0] = pInfo->nCol; 1219 break; 1220 1221 case FTS3_MATCHINFO_NDOC: 1222 if( bGlobal ){ 1223 sqlite3_int64 nDoc = 0; 1224 rc = fts3MatchinfoSelectDoctotal(pTab, &pSelect, &nDoc, 0); 1225 pInfo->aMatchinfo[0] = (u32)nDoc; 1226 } 1227 break; 1228 1229 case FTS3_MATCHINFO_AVGLENGTH: 1230 if( bGlobal ){ 1231 sqlite3_int64 nDoc; /* Number of rows in table */ 1232 const char *a; /* Aggregate column length array */ 1233 1234 rc = fts3MatchinfoSelectDoctotal(pTab, &pSelect, &nDoc, &a); 1235 if( rc==SQLITE_OK ){ 1236 int iCol; 1237 for(iCol=0; iCol<pInfo->nCol; iCol++){ 1238 u32 iVal; 1239 sqlite3_int64 nToken; 1240 a += sqlite3Fts3GetVarint(a, &nToken); 1241 iVal = (u32)(((u32)(nToken&0xffffffff)+nDoc/2)/nDoc); 1242 pInfo->aMatchinfo[iCol] = iVal; 1243 } 1244 } 1245 } 1246 break; 1247 1248 case FTS3_MATCHINFO_LENGTH: { 1249 sqlite3_stmt *pSelectDocsize = 0; 1250 rc = sqlite3Fts3SelectDocsize(pTab, pCsr->iPrevId, &pSelectDocsize); 1251 if( rc==SQLITE_OK ){ 1252 int iCol; 1253 const char *a = sqlite3_column_blob(pSelectDocsize, 0); 1254 for(iCol=0; iCol<pInfo->nCol; iCol++){ 1255 sqlite3_int64 nToken; 1256 a += sqlite3Fts3GetVarint(a, &nToken); 1257 pInfo->aMatchinfo[iCol] = (u32)nToken; 1258 } 1259 } 1260 sqlite3_reset(pSelectDocsize); 1261 break; 1262 } 1263 1264 case FTS3_MATCHINFO_LCS: 1265 rc = fts3ExprLoadDoclists(pCsr, 0, 0); 1266 if( rc==SQLITE_OK ){ 1267 rc = fts3MatchinfoLcs(pCsr, pInfo); 1268 } 1269 break; 1270 1271 case FTS3_MATCHINFO_LHITS_BM: 1272 case FTS3_MATCHINFO_LHITS: { 1273 int nZero = fts3MatchinfoSize(pInfo, zArg[i]) * sizeof(u32); 1274 memset(pInfo->aMatchinfo, 0, nZero); 1275 fts3ExprLHitGather(pCsr->pExpr, pInfo); 1276 break; 1277 } 1278 1279 default: { 1280 Fts3Expr *pExpr; 1281 assert( zArg[i]==FTS3_MATCHINFO_HITS ); 1282 pExpr = pCsr->pExpr; 1283 rc = fts3ExprLoadDoclists(pCsr, 0, 0); 1284 if( rc!=SQLITE_OK ) break; 1285 if( bGlobal ){ 1286 if( pCsr->pDeferred ){ 1287 rc = fts3MatchinfoSelectDoctotal(pTab, &pSelect, &pInfo->nDoc, 0); 1288 if( rc!=SQLITE_OK ) break; 1289 } 1290 rc = fts3ExprIterate(pExpr, fts3ExprGlobalHitsCb,(void*)pInfo); 1291 sqlite3Fts3EvalTestDeferred(pCsr, &rc); 1292 if( rc!=SQLITE_OK ) break; 1293 } 1294 (void)fts3ExprIterate(pExpr, fts3ExprLocalHitsCb,(void*)pInfo); 1295 break; 1296 } 1297 } 1298 1299 pInfo->aMatchinfo += fts3MatchinfoSize(pInfo, zArg[i]); 1300 } 1301 1302 sqlite3_reset(pSelect); 1303 return rc; 1304 } 1305 1306 1307 /* 1308 ** Populate pCsr->aMatchinfo[] with data for the current row. The 1309 ** 'matchinfo' data is an array of 32-bit unsigned integers (C type u32). 1310 */ 1311 static void fts3GetMatchinfo( 1312 sqlite3_context *pCtx, /* Return results here */ 1313 Fts3Cursor *pCsr, /* FTS3 Cursor object */ 1314 const char *zArg /* Second argument to matchinfo() function */ 1315 ){ 1316 MatchInfo sInfo; 1317 Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab; 1318 int rc = SQLITE_OK; 1319 int bGlobal = 0; /* Collect 'global' stats as well as local */ 1320 1321 u32 *aOut = 0; 1322 void (*xDestroyOut)(void*) = 0; 1323 1324 memset(&sInfo, 0, sizeof(MatchInfo)); 1325 sInfo.pCursor = pCsr; 1326 sInfo.nCol = pTab->nColumn; 1327 1328 /* If there is cached matchinfo() data, but the format string for the 1329 ** cache does not match the format string for this request, discard 1330 ** the cached data. */ 1331 if( pCsr->pMIBuffer && strcmp(pCsr->pMIBuffer->zMatchinfo, zArg) ){ 1332 sqlite3Fts3MIBufferFree(pCsr->pMIBuffer); 1333 pCsr->pMIBuffer = 0; 1334 } 1335 1336 /* If Fts3Cursor.pMIBuffer is NULL, then this is the first time the 1337 ** matchinfo function has been called for this query. In this case 1338 ** allocate the array used to accumulate the matchinfo data and 1339 ** initialize those elements that are constant for every row. 1340 */ 1341 if( pCsr->pMIBuffer==0 ){ 1342 int nMatchinfo = 0; /* Number of u32 elements in match-info */ 1343 int i; /* Used to iterate through zArg */ 1344 1345 /* Determine the number of phrases in the query */ 1346 pCsr->nPhrase = fts3ExprPhraseCount(pCsr->pExpr); 1347 sInfo.nPhrase = pCsr->nPhrase; 1348 1349 /* Determine the number of integers in the buffer returned by this call. */ 1350 for(i=0; zArg[i]; i++){ 1351 char *zErr = 0; 1352 if( fts3MatchinfoCheck(pTab, zArg[i], &zErr) ){ 1353 sqlite3_result_error(pCtx, zErr, -1); 1354 sqlite3_free(zErr); 1355 return; 1356 } 1357 nMatchinfo += fts3MatchinfoSize(&sInfo, zArg[i]); 1358 } 1359 1360 /* Allocate space for Fts3Cursor.aMatchinfo[] and Fts3Cursor.zMatchinfo. */ 1361 pCsr->pMIBuffer = fts3MIBufferNew(nMatchinfo, zArg); 1362 if( !pCsr->pMIBuffer ) rc = SQLITE_NOMEM; 1363 1364 pCsr->isMatchinfoNeeded = 1; 1365 bGlobal = 1; 1366 } 1367 1368 if( rc==SQLITE_OK ){ 1369 xDestroyOut = fts3MIBufferAlloc(pCsr->pMIBuffer, &aOut); 1370 if( xDestroyOut==0 ){ 1371 rc = SQLITE_NOMEM; 1372 } 1373 } 1374 1375 if( rc==SQLITE_OK ){ 1376 sInfo.aMatchinfo = aOut; 1377 sInfo.nPhrase = pCsr->nPhrase; 1378 rc = fts3MatchinfoValues(pCsr, bGlobal, &sInfo, zArg); 1379 if( bGlobal ){ 1380 fts3MIBufferSetGlobal(pCsr->pMIBuffer); 1381 } 1382 } 1383 1384 if( rc!=SQLITE_OK ){ 1385 sqlite3_result_error_code(pCtx, rc); 1386 if( xDestroyOut ) xDestroyOut(aOut); 1387 }else{ 1388 int n = pCsr->pMIBuffer->nElem * sizeof(u32); 1389 sqlite3_result_blob(pCtx, aOut, n, xDestroyOut); 1390 } 1391 } 1392 1393 /* 1394 ** Implementation of snippet() function. 1395 */ 1396 void sqlite3Fts3Snippet( 1397 sqlite3_context *pCtx, /* SQLite function call context */ 1398 Fts3Cursor *pCsr, /* Cursor object */ 1399 const char *zStart, /* Snippet start text - "<b>" */ 1400 const char *zEnd, /* Snippet end text - "</b>" */ 1401 const char *zEllipsis, /* Snippet ellipsis text - "<b>...</b>" */ 1402 int iCol, /* Extract snippet from this column */ 1403 int nToken /* Approximate number of tokens in snippet */ 1404 ){ 1405 Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab; 1406 int rc = SQLITE_OK; 1407 int i; 1408 StrBuffer res = {0, 0, 0}; 1409 1410 /* The returned text includes up to four fragments of text extracted from 1411 ** the data in the current row. The first iteration of the for(...) loop 1412 ** below attempts to locate a single fragment of text nToken tokens in 1413 ** size that contains at least one instance of all phrases in the query 1414 ** expression that appear in the current row. If such a fragment of text 1415 ** cannot be found, the second iteration of the loop attempts to locate 1416 ** a pair of fragments, and so on. 1417 */ 1418 int nSnippet = 0; /* Number of fragments in this snippet */ 1419 SnippetFragment aSnippet[4]; /* Maximum of 4 fragments per snippet */ 1420 int nFToken = -1; /* Number of tokens in each fragment */ 1421 1422 if( !pCsr->pExpr ){ 1423 sqlite3_result_text(pCtx, "", 0, SQLITE_STATIC); 1424 return; 1425 } 1426 1427 for(nSnippet=1; 1; nSnippet++){ 1428 1429 int iSnip; /* Loop counter 0..nSnippet-1 */ 1430 u64 mCovered = 0; /* Bitmask of phrases covered by snippet */ 1431 u64 mSeen = 0; /* Bitmask of phrases seen by BestSnippet() */ 1432 1433 if( nToken>=0 ){ 1434 nFToken = (nToken+nSnippet-1) / nSnippet; 1435 }else{ 1436 nFToken = -1 * nToken; 1437 } 1438 1439 for(iSnip=0; iSnip<nSnippet; iSnip++){ 1440 int iBestScore = -1; /* Best score of columns checked so far */ 1441 int iRead; /* Used to iterate through columns */ 1442 SnippetFragment *pFragment = &aSnippet[iSnip]; 1443 1444 memset(pFragment, 0, sizeof(*pFragment)); 1445 1446 /* Loop through all columns of the table being considered for snippets. 1447 ** If the iCol argument to this function was negative, this means all 1448 ** columns of the FTS3 table. Otherwise, only column iCol is considered. 1449 */ 1450 for(iRead=0; iRead<pTab->nColumn; iRead++){ 1451 SnippetFragment sF = {0, 0, 0, 0}; 1452 int iS = 0; 1453 if( iCol>=0 && iRead!=iCol ) continue; 1454 1455 /* Find the best snippet of nFToken tokens in column iRead. */ 1456 rc = fts3BestSnippet(nFToken, pCsr, iRead, mCovered, &mSeen, &sF, &iS); 1457 if( rc!=SQLITE_OK ){ 1458 goto snippet_out; 1459 } 1460 if( iS>iBestScore ){ 1461 *pFragment = sF; 1462 iBestScore = iS; 1463 } 1464 } 1465 1466 mCovered |= pFragment->covered; 1467 } 1468 1469 /* If all query phrases seen by fts3BestSnippet() are present in at least 1470 ** one of the nSnippet snippet fragments, break out of the loop. 1471 */ 1472 assert( (mCovered&mSeen)==mCovered ); 1473 if( mSeen==mCovered || nSnippet==SizeofArray(aSnippet) ) break; 1474 } 1475 1476 assert( nFToken>0 ); 1477 1478 for(i=0; i<nSnippet && rc==SQLITE_OK; i++){ 1479 rc = fts3SnippetText(pCsr, &aSnippet[i], 1480 i, (i==nSnippet-1), nFToken, zStart, zEnd, zEllipsis, &res 1481 ); 1482 } 1483 1484 snippet_out: 1485 sqlite3Fts3SegmentsClose(pTab); 1486 if( rc!=SQLITE_OK ){ 1487 sqlite3_result_error_code(pCtx, rc); 1488 sqlite3_free(res.z); 1489 }else{ 1490 sqlite3_result_text(pCtx, res.z, -1, sqlite3_free); 1491 } 1492 } 1493 1494 1495 typedef struct TermOffset TermOffset; 1496 typedef struct TermOffsetCtx TermOffsetCtx; 1497 1498 struct TermOffset { 1499 char *pList; /* Position-list */ 1500 int iPos; /* Position just read from pList */ 1501 int iOff; /* Offset of this term from read positions */ 1502 }; 1503 1504 struct TermOffsetCtx { 1505 Fts3Cursor *pCsr; 1506 int iCol; /* Column of table to populate aTerm for */ 1507 int iTerm; 1508 sqlite3_int64 iDocid; 1509 TermOffset *aTerm; 1510 }; 1511 1512 /* 1513 ** This function is an fts3ExprIterate() callback used by sqlite3Fts3Offsets(). 1514 */ 1515 static int fts3ExprTermOffsetInit(Fts3Expr *pExpr, int iPhrase, void *ctx){ 1516 TermOffsetCtx *p = (TermOffsetCtx *)ctx; 1517 int nTerm; /* Number of tokens in phrase */ 1518 int iTerm; /* For looping through nTerm phrase terms */ 1519 char *pList; /* Pointer to position list for phrase */ 1520 int iPos = 0; /* First position in position-list */ 1521 int rc; 1522 1523 UNUSED_PARAMETER(iPhrase); 1524 rc = sqlite3Fts3EvalPhrasePoslist(p->pCsr, pExpr, p->iCol, &pList); 1525 nTerm = pExpr->pPhrase->nToken; 1526 if( pList ){ 1527 fts3GetDeltaPosition(&pList, &iPos); 1528 assert( iPos>=0 ); 1529 } 1530 1531 for(iTerm=0; iTerm<nTerm; iTerm++){ 1532 TermOffset *pT = &p->aTerm[p->iTerm++]; 1533 pT->iOff = nTerm-iTerm-1; 1534 pT->pList = pList; 1535 pT->iPos = iPos; 1536 } 1537 1538 return rc; 1539 } 1540 1541 /* 1542 ** Implementation of offsets() function. 1543 */ 1544 void sqlite3Fts3Offsets( 1545 sqlite3_context *pCtx, /* SQLite function call context */ 1546 Fts3Cursor *pCsr /* Cursor object */ 1547 ){ 1548 Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab; 1549 sqlite3_tokenizer_module const *pMod = pTab->pTokenizer->pModule; 1550 int rc; /* Return Code */ 1551 int nToken; /* Number of tokens in query */ 1552 int iCol; /* Column currently being processed */ 1553 StrBuffer res = {0, 0, 0}; /* Result string */ 1554 TermOffsetCtx sCtx; /* Context for fts3ExprTermOffsetInit() */ 1555 1556 if( !pCsr->pExpr ){ 1557 sqlite3_result_text(pCtx, "", 0, SQLITE_STATIC); 1558 return; 1559 } 1560 1561 memset(&sCtx, 0, sizeof(sCtx)); 1562 assert( pCsr->isRequireSeek==0 ); 1563 1564 /* Count the number of terms in the query */ 1565 rc = fts3ExprLoadDoclists(pCsr, 0, &nToken); 1566 if( rc!=SQLITE_OK ) goto offsets_out; 1567 1568 /* Allocate the array of TermOffset iterators. */ 1569 sCtx.aTerm = (TermOffset *)sqlite3_malloc(sizeof(TermOffset)*nToken); 1570 if( 0==sCtx.aTerm ){ 1571 rc = SQLITE_NOMEM; 1572 goto offsets_out; 1573 } 1574 sCtx.iDocid = pCsr->iPrevId; 1575 sCtx.pCsr = pCsr; 1576 1577 /* Loop through the table columns, appending offset information to 1578 ** string-buffer res for each column. 1579 */ 1580 for(iCol=0; iCol<pTab->nColumn; iCol++){ 1581 sqlite3_tokenizer_cursor *pC; /* Tokenizer cursor */ 1582 const char *ZDUMMY; /* Dummy argument used with xNext() */ 1583 int NDUMMY = 0; /* Dummy argument used with xNext() */ 1584 int iStart = 0; 1585 int iEnd = 0; 1586 int iCurrent = 0; 1587 const char *zDoc; 1588 int nDoc; 1589 1590 /* Initialize the contents of sCtx.aTerm[] for column iCol. There is 1591 ** no way that this operation can fail, so the return code from 1592 ** fts3ExprIterate() can be discarded. 1593 */ 1594 sCtx.iCol = iCol; 1595 sCtx.iTerm = 0; 1596 (void)fts3ExprIterate(pCsr->pExpr, fts3ExprTermOffsetInit, (void*)&sCtx); 1597 1598 /* Retreive the text stored in column iCol. If an SQL NULL is stored 1599 ** in column iCol, jump immediately to the next iteration of the loop. 1600 ** If an OOM occurs while retrieving the data (this can happen if SQLite 1601 ** needs to transform the data from utf-16 to utf-8), return SQLITE_NOMEM 1602 ** to the caller. 1603 */ 1604 zDoc = (const char *)sqlite3_column_text(pCsr->pStmt, iCol+1); 1605 nDoc = sqlite3_column_bytes(pCsr->pStmt, iCol+1); 1606 if( zDoc==0 ){ 1607 if( sqlite3_column_type(pCsr->pStmt, iCol+1)==SQLITE_NULL ){ 1608 continue; 1609 } 1610 rc = SQLITE_NOMEM; 1611 goto offsets_out; 1612 } 1613 1614 /* Initialize a tokenizer iterator to iterate through column iCol. */ 1615 rc = sqlite3Fts3OpenTokenizer(pTab->pTokenizer, pCsr->iLangid, 1616 zDoc, nDoc, &pC 1617 ); 1618 if( rc!=SQLITE_OK ) goto offsets_out; 1619 1620 rc = pMod->xNext(pC, &ZDUMMY, &NDUMMY, &iStart, &iEnd, &iCurrent); 1621 while( rc==SQLITE_OK ){ 1622 int i; /* Used to loop through terms */ 1623 int iMinPos = 0x7FFFFFFF; /* Position of next token */ 1624 TermOffset *pTerm = 0; /* TermOffset associated with next token */ 1625 1626 for(i=0; i<nToken; i++){ 1627 TermOffset *pT = &sCtx.aTerm[i]; 1628 if( pT->pList && (pT->iPos-pT->iOff)<iMinPos ){ 1629 iMinPos = pT->iPos-pT->iOff; 1630 pTerm = pT; 1631 } 1632 } 1633 1634 if( !pTerm ){ 1635 /* All offsets for this column have been gathered. */ 1636 rc = SQLITE_DONE; 1637 }else{ 1638 assert( iCurrent<=iMinPos ); 1639 if( 0==(0xFE&*pTerm->pList) ){ 1640 pTerm->pList = 0; 1641 }else{ 1642 fts3GetDeltaPosition(&pTerm->pList, &pTerm->iPos); 1643 } 1644 while( rc==SQLITE_OK && iCurrent<iMinPos ){ 1645 rc = pMod->xNext(pC, &ZDUMMY, &NDUMMY, &iStart, &iEnd, &iCurrent); 1646 } 1647 if( rc==SQLITE_OK ){ 1648 char aBuffer[64]; 1649 sqlite3_snprintf(sizeof(aBuffer), aBuffer, 1650 "%d %d %d %d ", iCol, pTerm-sCtx.aTerm, iStart, iEnd-iStart 1651 ); 1652 rc = fts3StringAppend(&res, aBuffer, -1); 1653 }else if( rc==SQLITE_DONE && pTab->zContentTbl==0 ){ 1654 rc = FTS_CORRUPT_VTAB; 1655 } 1656 } 1657 } 1658 if( rc==SQLITE_DONE ){ 1659 rc = SQLITE_OK; 1660 } 1661 1662 pMod->xClose(pC); 1663 if( rc!=SQLITE_OK ) goto offsets_out; 1664 } 1665 1666 offsets_out: 1667 sqlite3_free(sCtx.aTerm); 1668 assert( rc!=SQLITE_DONE ); 1669 sqlite3Fts3SegmentsClose(pTab); 1670 if( rc!=SQLITE_OK ){ 1671 sqlite3_result_error_code(pCtx, rc); 1672 sqlite3_free(res.z); 1673 }else{ 1674 sqlite3_result_text(pCtx, res.z, res.n-1, sqlite3_free); 1675 } 1676 return; 1677 } 1678 1679 /* 1680 ** Implementation of matchinfo() function. 1681 */ 1682 void sqlite3Fts3Matchinfo( 1683 sqlite3_context *pContext, /* Function call context */ 1684 Fts3Cursor *pCsr, /* FTS3 table cursor */ 1685 const char *zArg /* Second arg to matchinfo() function */ 1686 ){ 1687 Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab; 1688 const char *zFormat; 1689 1690 if( zArg ){ 1691 zFormat = zArg; 1692 }else{ 1693 zFormat = FTS3_MATCHINFO_DEFAULT; 1694 } 1695 1696 if( !pCsr->pExpr ){ 1697 sqlite3_result_blob(pContext, "", 0, SQLITE_STATIC); 1698 return; 1699 }else{ 1700 /* Retrieve matchinfo() data. */ 1701 fts3GetMatchinfo(pContext, pCsr, zFormat); 1702 sqlite3Fts3SegmentsClose(pTab); 1703 } 1704 } 1705 1706 #endif