modernc.org/cc@v1.0.1/v2/testdata/_sqlite/ext/fts5/fts5_aux.c (about) 1 /* 2 ** 2014 May 31 3 ** 4 ** The author disclaims copyright to this source code. In place of 5 ** a legal notice, here is a blessing: 6 ** 7 ** May you do good and not evil. 8 ** May you find forgiveness for yourself and forgive others. 9 ** May you share freely, never taking more than you give. 10 ** 11 ****************************************************************************** 12 */ 13 14 15 #include "fts5Int.h" 16 #include <math.h> /* amalgamator: keep */ 17 18 /* 19 ** Object used to iterate through all "coalesced phrase instances" in 20 ** a single column of the current row. If the phrase instances in the 21 ** column being considered do not overlap, this object simply iterates 22 ** through them. Or, if they do overlap (share one or more tokens in 23 ** common), each set of overlapping instances is treated as a single 24 ** match. See documentation for the highlight() auxiliary function for 25 ** details. 26 ** 27 ** Usage is: 28 ** 29 ** for(rc = fts5CInstIterNext(pApi, pFts, iCol, &iter); 30 ** (rc==SQLITE_OK && 0==fts5CInstIterEof(&iter); 31 ** rc = fts5CInstIterNext(&iter) 32 ** ){ 33 ** printf("instance starts at %d, ends at %d\n", iter.iStart, iter.iEnd); 34 ** } 35 ** 36 */ 37 typedef struct CInstIter CInstIter; 38 struct CInstIter { 39 const Fts5ExtensionApi *pApi; /* API offered by current FTS version */ 40 Fts5Context *pFts; /* First arg to pass to pApi functions */ 41 int iCol; /* Column to search */ 42 int iInst; /* Next phrase instance index */ 43 int nInst; /* Total number of phrase instances */ 44 45 /* Output variables */ 46 int iStart; /* First token in coalesced phrase instance */ 47 int iEnd; /* Last token in coalesced phrase instance */ 48 }; 49 50 /* 51 ** Advance the iterator to the next coalesced phrase instance. Return 52 ** an SQLite error code if an error occurs, or SQLITE_OK otherwise. 53 */ 54 static int fts5CInstIterNext(CInstIter *pIter){ 55 int rc = SQLITE_OK; 56 pIter->iStart = -1; 57 pIter->iEnd = -1; 58 59 while( rc==SQLITE_OK && pIter->iInst<pIter->nInst ){ 60 int ip; int ic; int io; 61 rc = pIter->pApi->xInst(pIter->pFts, pIter->iInst, &ip, &ic, &io); 62 if( rc==SQLITE_OK ){ 63 if( ic==pIter->iCol ){ 64 int iEnd = io - 1 + pIter->pApi->xPhraseSize(pIter->pFts, ip); 65 if( pIter->iStart<0 ){ 66 pIter->iStart = io; 67 pIter->iEnd = iEnd; 68 }else if( io<=pIter->iEnd ){ 69 if( iEnd>pIter->iEnd ) pIter->iEnd = iEnd; 70 }else{ 71 break; 72 } 73 } 74 pIter->iInst++; 75 } 76 } 77 78 return rc; 79 } 80 81 /* 82 ** Initialize the iterator object indicated by the final parameter to 83 ** iterate through coalesced phrase instances in column iCol. 84 */ 85 static int fts5CInstIterInit( 86 const Fts5ExtensionApi *pApi, 87 Fts5Context *pFts, 88 int iCol, 89 CInstIter *pIter 90 ){ 91 int rc; 92 93 memset(pIter, 0, sizeof(CInstIter)); 94 pIter->pApi = pApi; 95 pIter->pFts = pFts; 96 pIter->iCol = iCol; 97 rc = pApi->xInstCount(pFts, &pIter->nInst); 98 99 if( rc==SQLITE_OK ){ 100 rc = fts5CInstIterNext(pIter); 101 } 102 103 return rc; 104 } 105 106 107 108 /************************************************************************* 109 ** Start of highlight() implementation. 110 */ 111 typedef struct HighlightContext HighlightContext; 112 struct HighlightContext { 113 CInstIter iter; /* Coalesced Instance Iterator */ 114 int iPos; /* Current token offset in zIn[] */ 115 int iRangeStart; /* First token to include */ 116 int iRangeEnd; /* If non-zero, last token to include */ 117 const char *zOpen; /* Opening highlight */ 118 const char *zClose; /* Closing highlight */ 119 const char *zIn; /* Input text */ 120 int nIn; /* Size of input text in bytes */ 121 int iOff; /* Current offset within zIn[] */ 122 char *zOut; /* Output value */ 123 }; 124 125 /* 126 ** Append text to the HighlightContext output string - p->zOut. Argument 127 ** z points to a buffer containing n bytes of text to append. If n is 128 ** negative, everything up until the first '\0' is appended to the output. 129 ** 130 ** If *pRc is set to any value other than SQLITE_OK when this function is 131 ** called, it is a no-op. If an error (i.e. an OOM condition) is encountered, 132 ** *pRc is set to an error code before returning. 133 */ 134 static void fts5HighlightAppend( 135 int *pRc, 136 HighlightContext *p, 137 const char *z, int n 138 ){ 139 if( *pRc==SQLITE_OK ){ 140 if( n<0 ) n = (int)strlen(z); 141 p->zOut = sqlite3_mprintf("%z%.*s", p->zOut, n, z); 142 if( p->zOut==0 ) *pRc = SQLITE_NOMEM; 143 } 144 } 145 146 /* 147 ** Tokenizer callback used by implementation of highlight() function. 148 */ 149 static int fts5HighlightCb( 150 void *pContext, /* Pointer to HighlightContext object */ 151 int tflags, /* Mask of FTS5_TOKEN_* flags */ 152 const char *pToken, /* Buffer containing token */ 153 int nToken, /* Size of token in bytes */ 154 int iStartOff, /* Start offset of token */ 155 int iEndOff /* End offset of token */ 156 ){ 157 HighlightContext *p = (HighlightContext*)pContext; 158 int rc = SQLITE_OK; 159 int iPos; 160 161 UNUSED_PARAM2(pToken, nToken); 162 163 if( tflags & FTS5_TOKEN_COLOCATED ) return SQLITE_OK; 164 iPos = p->iPos++; 165 166 if( p->iRangeEnd>0 ){ 167 if( iPos<p->iRangeStart || iPos>p->iRangeEnd ) return SQLITE_OK; 168 if( p->iRangeStart && iPos==p->iRangeStart ) p->iOff = iStartOff; 169 } 170 171 if( iPos==p->iter.iStart ){ 172 fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iStartOff - p->iOff); 173 fts5HighlightAppend(&rc, p, p->zOpen, -1); 174 p->iOff = iStartOff; 175 } 176 177 if( iPos==p->iter.iEnd ){ 178 if( p->iRangeEnd && p->iter.iStart<p->iRangeStart ){ 179 fts5HighlightAppend(&rc, p, p->zOpen, -1); 180 } 181 fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff); 182 fts5HighlightAppend(&rc, p, p->zClose, -1); 183 p->iOff = iEndOff; 184 if( rc==SQLITE_OK ){ 185 rc = fts5CInstIterNext(&p->iter); 186 } 187 } 188 189 if( p->iRangeEnd>0 && iPos==p->iRangeEnd ){ 190 fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff); 191 p->iOff = iEndOff; 192 if( iPos>=p->iter.iStart && iPos<p->iter.iEnd ){ 193 fts5HighlightAppend(&rc, p, p->zClose, -1); 194 } 195 } 196 197 return rc; 198 } 199 200 /* 201 ** Implementation of highlight() function. 202 */ 203 static void fts5HighlightFunction( 204 const Fts5ExtensionApi *pApi, /* API offered by current FTS version */ 205 Fts5Context *pFts, /* First arg to pass to pApi functions */ 206 sqlite3_context *pCtx, /* Context for returning result/error */ 207 int nVal, /* Number of values in apVal[] array */ 208 sqlite3_value **apVal /* Array of trailing arguments */ 209 ){ 210 HighlightContext ctx; 211 int rc; 212 int iCol; 213 214 if( nVal!=3 ){ 215 const char *zErr = "wrong number of arguments to function highlight()"; 216 sqlite3_result_error(pCtx, zErr, -1); 217 return; 218 } 219 220 iCol = sqlite3_value_int(apVal[0]); 221 memset(&ctx, 0, sizeof(HighlightContext)); 222 ctx.zOpen = (const char*)sqlite3_value_text(apVal[1]); 223 ctx.zClose = (const char*)sqlite3_value_text(apVal[2]); 224 rc = pApi->xColumnText(pFts, iCol, &ctx.zIn, &ctx.nIn); 225 226 if( ctx.zIn ){ 227 if( rc==SQLITE_OK ){ 228 rc = fts5CInstIterInit(pApi, pFts, iCol, &ctx.iter); 229 } 230 231 if( rc==SQLITE_OK ){ 232 rc = pApi->xTokenize(pFts, ctx.zIn, ctx.nIn, (void*)&ctx,fts5HighlightCb); 233 } 234 fts5HighlightAppend(&rc, &ctx, &ctx.zIn[ctx.iOff], ctx.nIn - ctx.iOff); 235 236 if( rc==SQLITE_OK ){ 237 sqlite3_result_text(pCtx, (const char*)ctx.zOut, -1, SQLITE_TRANSIENT); 238 } 239 sqlite3_free(ctx.zOut); 240 } 241 if( rc!=SQLITE_OK ){ 242 sqlite3_result_error_code(pCtx, rc); 243 } 244 } 245 /* 246 ** End of highlight() implementation. 247 **************************************************************************/ 248 249 /* 250 ** Context object passed to the fts5SentenceFinderCb() function. 251 */ 252 typedef struct Fts5SFinder Fts5SFinder; 253 struct Fts5SFinder { 254 int iPos; /* Current token position */ 255 int nFirstAlloc; /* Allocated size of aFirst[] */ 256 int nFirst; /* Number of entries in aFirst[] */ 257 int *aFirst; /* Array of first token in each sentence */ 258 const char *zDoc; /* Document being tokenized */ 259 }; 260 261 /* 262 ** Add an entry to the Fts5SFinder.aFirst[] array. Grow the array if 263 ** necessary. Return SQLITE_OK if successful, or SQLITE_NOMEM if an 264 ** error occurs. 265 */ 266 static int fts5SentenceFinderAdd(Fts5SFinder *p, int iAdd){ 267 if( p->nFirstAlloc==p->nFirst ){ 268 int nNew = p->nFirstAlloc ? p->nFirstAlloc*2 : 64; 269 int *aNew; 270 271 aNew = (int*)sqlite3_realloc(p->aFirst, nNew*sizeof(int)); 272 if( aNew==0 ) return SQLITE_NOMEM; 273 p->aFirst = aNew; 274 p->nFirstAlloc = nNew; 275 } 276 p->aFirst[p->nFirst++] = iAdd; 277 return SQLITE_OK; 278 } 279 280 /* 281 ** This function is an xTokenize() callback used by the auxiliary snippet() 282 ** function. Its job is to identify tokens that are the first in a sentence. 283 ** For each such token, an entry is added to the SFinder.aFirst[] array. 284 */ 285 static int fts5SentenceFinderCb( 286 void *pContext, /* Pointer to HighlightContext object */ 287 int tflags, /* Mask of FTS5_TOKEN_* flags */ 288 const char *pToken, /* Buffer containing token */ 289 int nToken, /* Size of token in bytes */ 290 int iStartOff, /* Start offset of token */ 291 int iEndOff /* End offset of token */ 292 ){ 293 int rc = SQLITE_OK; 294 295 UNUSED_PARAM2(pToken, nToken); 296 UNUSED_PARAM(iEndOff); 297 298 if( (tflags & FTS5_TOKEN_COLOCATED)==0 ){ 299 Fts5SFinder *p = (Fts5SFinder*)pContext; 300 if( p->iPos>0 ){ 301 int i; 302 char c = 0; 303 for(i=iStartOff-1; i>=0; i--){ 304 c = p->zDoc[i]; 305 if( c!=' ' && c!='\t' && c!='\n' && c!='\r' ) break; 306 } 307 if( i!=iStartOff-1 && (c=='.' || c==':') ){ 308 rc = fts5SentenceFinderAdd(p, p->iPos); 309 } 310 }else{ 311 rc = fts5SentenceFinderAdd(p, 0); 312 } 313 p->iPos++; 314 } 315 return rc; 316 } 317 318 static int fts5SnippetScore( 319 const Fts5ExtensionApi *pApi, /* API offered by current FTS version */ 320 Fts5Context *pFts, /* First arg to pass to pApi functions */ 321 int nDocsize, /* Size of column in tokens */ 322 unsigned char *aSeen, /* Array with one element per query phrase */ 323 int iCol, /* Column to score */ 324 int iPos, /* Starting offset to score */ 325 int nToken, /* Max tokens per snippet */ 326 int *pnScore, /* OUT: Score */ 327 int *piPos /* OUT: Adjusted offset */ 328 ){ 329 int rc; 330 int i; 331 int ip = 0; 332 int ic = 0; 333 int iOff = 0; 334 int iFirst = -1; 335 int nInst; 336 int nScore = 0; 337 int iLast = 0; 338 339 rc = pApi->xInstCount(pFts, &nInst); 340 for(i=0; i<nInst && rc==SQLITE_OK; i++){ 341 rc = pApi->xInst(pFts, i, &ip, &ic, &iOff); 342 if( rc==SQLITE_OK && ic==iCol && iOff>=iPos && iOff<(iPos+nToken) ){ 343 nScore += (aSeen[ip] ? 1 : 1000); 344 aSeen[ip] = 1; 345 if( iFirst<0 ) iFirst = iOff; 346 iLast = iOff + pApi->xPhraseSize(pFts, ip); 347 } 348 } 349 350 *pnScore = nScore; 351 if( piPos ){ 352 int iAdj = iFirst - (nToken - (iLast-iFirst)) / 2; 353 if( (iAdj+nToken)>nDocsize ) iAdj = nDocsize - nToken; 354 if( iAdj<0 ) iAdj = 0; 355 *piPos = iAdj; 356 } 357 358 return rc; 359 } 360 361 /* 362 ** Implementation of snippet() function. 363 */ 364 static void fts5SnippetFunction( 365 const Fts5ExtensionApi *pApi, /* API offered by current FTS version */ 366 Fts5Context *pFts, /* First arg to pass to pApi functions */ 367 sqlite3_context *pCtx, /* Context for returning result/error */ 368 int nVal, /* Number of values in apVal[] array */ 369 sqlite3_value **apVal /* Array of trailing arguments */ 370 ){ 371 HighlightContext ctx; 372 int rc = SQLITE_OK; /* Return code */ 373 int iCol; /* 1st argument to snippet() */ 374 const char *zEllips; /* 4th argument to snippet() */ 375 int nToken; /* 5th argument to snippet() */ 376 int nInst = 0; /* Number of instance matches this row */ 377 int i; /* Used to iterate through instances */ 378 int nPhrase; /* Number of phrases in query */ 379 unsigned char *aSeen; /* Array of "seen instance" flags */ 380 int iBestCol; /* Column containing best snippet */ 381 int iBestStart = 0; /* First token of best snippet */ 382 int nBestScore = 0; /* Score of best snippet */ 383 int nColSize = 0; /* Total size of iBestCol in tokens */ 384 Fts5SFinder sFinder; /* Used to find the beginnings of sentences */ 385 int nCol; 386 387 if( nVal!=5 ){ 388 const char *zErr = "wrong number of arguments to function snippet()"; 389 sqlite3_result_error(pCtx, zErr, -1); 390 return; 391 } 392 393 nCol = pApi->xColumnCount(pFts); 394 memset(&ctx, 0, sizeof(HighlightContext)); 395 iCol = sqlite3_value_int(apVal[0]); 396 ctx.zOpen = (const char*)sqlite3_value_text(apVal[1]); 397 ctx.zClose = (const char*)sqlite3_value_text(apVal[2]); 398 zEllips = (const char*)sqlite3_value_text(apVal[3]); 399 nToken = sqlite3_value_int(apVal[4]); 400 401 iBestCol = (iCol>=0 ? iCol : 0); 402 nPhrase = pApi->xPhraseCount(pFts); 403 aSeen = sqlite3_malloc(nPhrase); 404 if( aSeen==0 ){ 405 rc = SQLITE_NOMEM; 406 } 407 if( rc==SQLITE_OK ){ 408 rc = pApi->xInstCount(pFts, &nInst); 409 } 410 411 memset(&sFinder, 0, sizeof(Fts5SFinder)); 412 for(i=0; i<nCol; i++){ 413 if( iCol<0 || iCol==i ){ 414 int nDoc; 415 int nDocsize; 416 int ii; 417 sFinder.iPos = 0; 418 sFinder.nFirst = 0; 419 rc = pApi->xColumnText(pFts, i, &sFinder.zDoc, &nDoc); 420 if( rc!=SQLITE_OK ) break; 421 rc = pApi->xTokenize(pFts, 422 sFinder.zDoc, nDoc, (void*)&sFinder,fts5SentenceFinderCb 423 ); 424 if( rc!=SQLITE_OK ) break; 425 rc = pApi->xColumnSize(pFts, i, &nDocsize); 426 if( rc!=SQLITE_OK ) break; 427 428 for(ii=0; rc==SQLITE_OK && ii<nInst; ii++){ 429 int ip, ic, io; 430 int iAdj; 431 int nScore; 432 int jj; 433 434 rc = pApi->xInst(pFts, ii, &ip, &ic, &io); 435 if( ic!=i || rc!=SQLITE_OK ) continue; 436 memset(aSeen, 0, nPhrase); 437 rc = fts5SnippetScore(pApi, pFts, nDocsize, aSeen, i, 438 io, nToken, &nScore, &iAdj 439 ); 440 if( rc==SQLITE_OK && nScore>nBestScore ){ 441 nBestScore = nScore; 442 iBestCol = i; 443 iBestStart = iAdj; 444 nColSize = nDocsize; 445 } 446 447 if( rc==SQLITE_OK && sFinder.nFirst && nDocsize>nToken ){ 448 for(jj=0; jj<(sFinder.nFirst-1); jj++){ 449 if( sFinder.aFirst[jj+1]>io ) break; 450 } 451 452 if( sFinder.aFirst[jj]<io ){ 453 memset(aSeen, 0, nPhrase); 454 rc = fts5SnippetScore(pApi, pFts, nDocsize, aSeen, i, 455 sFinder.aFirst[jj], nToken, &nScore, 0 456 ); 457 458 nScore += (sFinder.aFirst[jj]==0 ? 120 : 100); 459 if( rc==SQLITE_OK && nScore>nBestScore ){ 460 nBestScore = nScore; 461 iBestCol = i; 462 iBestStart = sFinder.aFirst[jj]; 463 nColSize = nDocsize; 464 } 465 } 466 } 467 } 468 } 469 } 470 471 if( rc==SQLITE_OK ){ 472 rc = pApi->xColumnText(pFts, iBestCol, &ctx.zIn, &ctx.nIn); 473 } 474 if( rc==SQLITE_OK && nColSize==0 ){ 475 rc = pApi->xColumnSize(pFts, iBestCol, &nColSize); 476 } 477 if( ctx.zIn ){ 478 if( rc==SQLITE_OK ){ 479 rc = fts5CInstIterInit(pApi, pFts, iBestCol, &ctx.iter); 480 } 481 482 ctx.iRangeStart = iBestStart; 483 ctx.iRangeEnd = iBestStart + nToken - 1; 484 485 if( iBestStart>0 ){ 486 fts5HighlightAppend(&rc, &ctx, zEllips, -1); 487 } 488 489 /* Advance iterator ctx.iter so that it points to the first coalesced 490 ** phrase instance at or following position iBestStart. */ 491 while( ctx.iter.iStart>=0 && ctx.iter.iStart<iBestStart && rc==SQLITE_OK ){ 492 rc = fts5CInstIterNext(&ctx.iter); 493 } 494 495 if( rc==SQLITE_OK ){ 496 rc = pApi->xTokenize(pFts, ctx.zIn, ctx.nIn, (void*)&ctx,fts5HighlightCb); 497 } 498 if( ctx.iRangeEnd>=(nColSize-1) ){ 499 fts5HighlightAppend(&rc, &ctx, &ctx.zIn[ctx.iOff], ctx.nIn - ctx.iOff); 500 }else{ 501 fts5HighlightAppend(&rc, &ctx, zEllips, -1); 502 } 503 } 504 if( rc==SQLITE_OK ){ 505 sqlite3_result_text(pCtx, (const char*)ctx.zOut, -1, SQLITE_TRANSIENT); 506 }else{ 507 sqlite3_result_error_code(pCtx, rc); 508 } 509 sqlite3_free(ctx.zOut); 510 sqlite3_free(aSeen); 511 sqlite3_free(sFinder.aFirst); 512 } 513 514 /************************************************************************/ 515 516 /* 517 ** The first time the bm25() function is called for a query, an instance 518 ** of the following structure is allocated and populated. 519 */ 520 typedef struct Fts5Bm25Data Fts5Bm25Data; 521 struct Fts5Bm25Data { 522 int nPhrase; /* Number of phrases in query */ 523 double avgdl; /* Average number of tokens in each row */ 524 double *aIDF; /* IDF for each phrase */ 525 double *aFreq; /* Array used to calculate phrase freq. */ 526 }; 527 528 /* 529 ** Callback used by fts5Bm25GetData() to count the number of rows in the 530 ** table matched by each individual phrase within the query. 531 */ 532 static int fts5CountCb( 533 const Fts5ExtensionApi *pApi, 534 Fts5Context *pFts, 535 void *pUserData /* Pointer to sqlite3_int64 variable */ 536 ){ 537 sqlite3_int64 *pn = (sqlite3_int64*)pUserData; 538 UNUSED_PARAM2(pApi, pFts); 539 (*pn)++; 540 return SQLITE_OK; 541 } 542 543 /* 544 ** Set *ppData to point to the Fts5Bm25Data object for the current query. 545 ** If the object has not already been allocated, allocate and populate it 546 ** now. 547 */ 548 static int fts5Bm25GetData( 549 const Fts5ExtensionApi *pApi, 550 Fts5Context *pFts, 551 Fts5Bm25Data **ppData /* OUT: bm25-data object for this query */ 552 ){ 553 int rc = SQLITE_OK; /* Return code */ 554 Fts5Bm25Data *p; /* Object to return */ 555 556 p = pApi->xGetAuxdata(pFts, 0); 557 if( p==0 ){ 558 int nPhrase; /* Number of phrases in query */ 559 sqlite3_int64 nRow = 0; /* Number of rows in table */ 560 sqlite3_int64 nToken = 0; /* Number of tokens in table */ 561 int nByte; /* Bytes of space to allocate */ 562 int i; 563 564 /* Allocate the Fts5Bm25Data object */ 565 nPhrase = pApi->xPhraseCount(pFts); 566 nByte = sizeof(Fts5Bm25Data) + nPhrase*2*sizeof(double); 567 p = (Fts5Bm25Data*)sqlite3_malloc(nByte); 568 if( p==0 ){ 569 rc = SQLITE_NOMEM; 570 }else{ 571 memset(p, 0, nByte); 572 p->nPhrase = nPhrase; 573 p->aIDF = (double*)&p[1]; 574 p->aFreq = &p->aIDF[nPhrase]; 575 } 576 577 /* Calculate the average document length for this FTS5 table */ 578 if( rc==SQLITE_OK ) rc = pApi->xRowCount(pFts, &nRow); 579 if( rc==SQLITE_OK ) rc = pApi->xColumnTotalSize(pFts, -1, &nToken); 580 if( rc==SQLITE_OK ) p->avgdl = (double)nToken / (double)nRow; 581 582 /* Calculate an IDF for each phrase in the query */ 583 for(i=0; rc==SQLITE_OK && i<nPhrase; i++){ 584 sqlite3_int64 nHit = 0; 585 rc = pApi->xQueryPhrase(pFts, i, (void*)&nHit, fts5CountCb); 586 if( rc==SQLITE_OK ){ 587 /* Calculate the IDF (Inverse Document Frequency) for phrase i. 588 ** This is done using the standard BM25 formula as found on wikipedia: 589 ** 590 ** IDF = log( (N - nHit + 0.5) / (nHit + 0.5) ) 591 ** 592 ** where "N" is the total number of documents in the set and nHit 593 ** is the number that contain at least one instance of the phrase 594 ** under consideration. 595 ** 596 ** The problem with this is that if (N < 2*nHit), the IDF is 597 ** negative. Which is undesirable. So the mimimum allowable IDF is 598 ** (1e-6) - roughly the same as a term that appears in just over 599 ** half of set of 5,000,000 documents. */ 600 double idf = log( (nRow - nHit + 0.5) / (nHit + 0.5) ); 601 if( idf<=0.0 ) idf = 1e-6; 602 p->aIDF[i] = idf; 603 } 604 } 605 606 if( rc!=SQLITE_OK ){ 607 sqlite3_free(p); 608 }else{ 609 rc = pApi->xSetAuxdata(pFts, p, sqlite3_free); 610 } 611 if( rc!=SQLITE_OK ) p = 0; 612 } 613 *ppData = p; 614 return rc; 615 } 616 617 /* 618 ** Implementation of bm25() function. 619 */ 620 static void fts5Bm25Function( 621 const Fts5ExtensionApi *pApi, /* API offered by current FTS version */ 622 Fts5Context *pFts, /* First arg to pass to pApi functions */ 623 sqlite3_context *pCtx, /* Context for returning result/error */ 624 int nVal, /* Number of values in apVal[] array */ 625 sqlite3_value **apVal /* Array of trailing arguments */ 626 ){ 627 const double k1 = 1.2; /* Constant "k1" from BM25 formula */ 628 const double b = 0.75; /* Constant "b" from BM25 formula */ 629 int rc = SQLITE_OK; /* Error code */ 630 double score = 0.0; /* SQL function return value */ 631 Fts5Bm25Data *pData; /* Values allocated/calculated once only */ 632 int i; /* Iterator variable */ 633 int nInst = 0; /* Value returned by xInstCount() */ 634 double D = 0.0; /* Total number of tokens in row */ 635 double *aFreq = 0; /* Array of phrase freq. for current row */ 636 637 /* Calculate the phrase frequency (symbol "f(qi,D)" in the documentation) 638 ** for each phrase in the query for the current row. */ 639 rc = fts5Bm25GetData(pApi, pFts, &pData); 640 if( rc==SQLITE_OK ){ 641 aFreq = pData->aFreq; 642 memset(aFreq, 0, sizeof(double) * pData->nPhrase); 643 rc = pApi->xInstCount(pFts, &nInst); 644 } 645 for(i=0; rc==SQLITE_OK && i<nInst; i++){ 646 int ip; int ic; int io; 647 rc = pApi->xInst(pFts, i, &ip, &ic, &io); 648 if( rc==SQLITE_OK ){ 649 double w = (nVal > ic) ? sqlite3_value_double(apVal[ic]) : 1.0; 650 aFreq[ip] += w; 651 } 652 } 653 654 /* Figure out the total size of the current row in tokens. */ 655 if( rc==SQLITE_OK ){ 656 int nTok; 657 rc = pApi->xColumnSize(pFts, -1, &nTok); 658 D = (double)nTok; 659 } 660 661 /* Determine the BM25 score for the current row. */ 662 for(i=0; rc==SQLITE_OK && i<pData->nPhrase; i++){ 663 score += pData->aIDF[i] * ( 664 ( aFreq[i] * (k1 + 1.0) ) / 665 ( aFreq[i] + k1 * (1 - b + b * D / pData->avgdl) ) 666 ); 667 } 668 669 /* If no error has occurred, return the calculated score. Otherwise, 670 ** throw an SQL exception. */ 671 if( rc==SQLITE_OK ){ 672 sqlite3_result_double(pCtx, -1.0 * score); 673 }else{ 674 sqlite3_result_error_code(pCtx, rc); 675 } 676 } 677 678 int sqlite3Fts5AuxInit(fts5_api *pApi){ 679 struct Builtin { 680 const char *zFunc; /* Function name (nul-terminated) */ 681 void *pUserData; /* User-data pointer */ 682 fts5_extension_function xFunc;/* Callback function */ 683 void (*xDestroy)(void*); /* Destructor function */ 684 } aBuiltin [] = { 685 { "snippet", 0, fts5SnippetFunction, 0 }, 686 { "highlight", 0, fts5HighlightFunction, 0 }, 687 { "bm25", 0, fts5Bm25Function, 0 }, 688 }; 689 int rc = SQLITE_OK; /* Return code */ 690 int i; /* To iterate through builtin functions */ 691 692 for(i=0; rc==SQLITE_OK && i<ArraySize(aBuiltin); i++){ 693 rc = pApi->xCreateFunction(pApi, 694 aBuiltin[i].zFunc, 695 aBuiltin[i].pUserData, 696 aBuiltin[i].xFunc, 697 aBuiltin[i].xDestroy 698 ); 699 } 700 701 return rc; 702 } 703 704