modernc.org/cc@v1.0.1/v2/testdata/_sqlite/ext/fts5/fts5_aux.c (about)

     1  /*
     2  ** 2014 May 31
     3  **
     4  ** The author disclaims copyright to this source code.  In place of
     5  ** a legal notice, here is a blessing:
     6  **
     7  **    May you do good and not evil.
     8  **    May you find forgiveness for yourself and forgive others.
     9  **    May you share freely, never taking more than you give.
    10  **
    11  ******************************************************************************
    12  */
    13  
    14  
    15  #include "fts5Int.h"
    16  #include <math.h>                 /* amalgamator: keep */
    17  
    18  /*
    19  ** Object used to iterate through all "coalesced phrase instances" in 
    20  ** a single column of the current row. If the phrase instances in the
    21  ** column being considered do not overlap, this object simply iterates
    22  ** through them. Or, if they do overlap (share one or more tokens in
    23  ** common), each set of overlapping instances is treated as a single
    24  ** match. See documentation for the highlight() auxiliary function for
    25  ** details.
    26  **
    27  ** Usage is:
    28  **
    29  **   for(rc = fts5CInstIterNext(pApi, pFts, iCol, &iter);
    30  **      (rc==SQLITE_OK && 0==fts5CInstIterEof(&iter);
    31  **      rc = fts5CInstIterNext(&iter)
    32  **   ){
    33  **     printf("instance starts at %d, ends at %d\n", iter.iStart, iter.iEnd);
    34  **   }
    35  **
    36  */
    37  typedef struct CInstIter CInstIter;
    38  struct CInstIter {
    39    const Fts5ExtensionApi *pApi;   /* API offered by current FTS version */
    40    Fts5Context *pFts;              /* First arg to pass to pApi functions */
    41    int iCol;                       /* Column to search */
    42    int iInst;                      /* Next phrase instance index */
    43    int nInst;                      /* Total number of phrase instances */
    44  
    45    /* Output variables */
    46    int iStart;                     /* First token in coalesced phrase instance */
    47    int iEnd;                       /* Last token in coalesced phrase instance */
    48  };
    49  
    50  /*
    51  ** Advance the iterator to the next coalesced phrase instance. Return
    52  ** an SQLite error code if an error occurs, or SQLITE_OK otherwise.
    53  */
    54  static int fts5CInstIterNext(CInstIter *pIter){
    55    int rc = SQLITE_OK;
    56    pIter->iStart = -1;
    57    pIter->iEnd = -1;
    58  
    59    while( rc==SQLITE_OK && pIter->iInst<pIter->nInst ){
    60      int ip; int ic; int io;
    61      rc = pIter->pApi->xInst(pIter->pFts, pIter->iInst, &ip, &ic, &io);
    62      if( rc==SQLITE_OK ){
    63        if( ic==pIter->iCol ){
    64          int iEnd = io - 1 + pIter->pApi->xPhraseSize(pIter->pFts, ip);
    65          if( pIter->iStart<0 ){
    66            pIter->iStart = io;
    67            pIter->iEnd = iEnd;
    68          }else if( io<=pIter->iEnd ){
    69            if( iEnd>pIter->iEnd ) pIter->iEnd = iEnd;
    70          }else{
    71            break;
    72          }
    73        }
    74        pIter->iInst++;
    75      }
    76    }
    77  
    78    return rc;
    79  }
    80  
    81  /*
    82  ** Initialize the iterator object indicated by the final parameter to 
    83  ** iterate through coalesced phrase instances in column iCol.
    84  */
    85  static int fts5CInstIterInit(
    86    const Fts5ExtensionApi *pApi,
    87    Fts5Context *pFts,
    88    int iCol,
    89    CInstIter *pIter
    90  ){
    91    int rc;
    92  
    93    memset(pIter, 0, sizeof(CInstIter));
    94    pIter->pApi = pApi;
    95    pIter->pFts = pFts;
    96    pIter->iCol = iCol;
    97    rc = pApi->xInstCount(pFts, &pIter->nInst);
    98  
    99    if( rc==SQLITE_OK ){
   100      rc = fts5CInstIterNext(pIter);
   101    }
   102  
   103    return rc;
   104  }
   105  
   106  
   107  
   108  /*************************************************************************
   109  ** Start of highlight() implementation.
   110  */
   111  typedef struct HighlightContext HighlightContext;
   112  struct HighlightContext {
   113    CInstIter iter;                 /* Coalesced Instance Iterator */
   114    int iPos;                       /* Current token offset in zIn[] */
   115    int iRangeStart;                /* First token to include */
   116    int iRangeEnd;                  /* If non-zero, last token to include */
   117    const char *zOpen;              /* Opening highlight */
   118    const char *zClose;             /* Closing highlight */
   119    const char *zIn;                /* Input text */
   120    int nIn;                        /* Size of input text in bytes */
   121    int iOff;                       /* Current offset within zIn[] */
   122    char *zOut;                     /* Output value */
   123  };
   124  
   125  /*
   126  ** Append text to the HighlightContext output string - p->zOut. Argument
   127  ** z points to a buffer containing n bytes of text to append. If n is 
   128  ** negative, everything up until the first '\0' is appended to the output.
   129  **
   130  ** If *pRc is set to any value other than SQLITE_OK when this function is 
   131  ** called, it is a no-op. If an error (i.e. an OOM condition) is encountered, 
   132  ** *pRc is set to an error code before returning. 
   133  */
   134  static void fts5HighlightAppend(
   135    int *pRc, 
   136    HighlightContext *p, 
   137    const char *z, int n
   138  ){
   139    if( *pRc==SQLITE_OK ){
   140      if( n<0 ) n = (int)strlen(z);
   141      p->zOut = sqlite3_mprintf("%z%.*s", p->zOut, n, z);
   142      if( p->zOut==0 ) *pRc = SQLITE_NOMEM;
   143    }
   144  }
   145  
   146  /*
   147  ** Tokenizer callback used by implementation of highlight() function.
   148  */
   149  static int fts5HighlightCb(
   150    void *pContext,                 /* Pointer to HighlightContext object */
   151    int tflags,                     /* Mask of FTS5_TOKEN_* flags */
   152    const char *pToken,             /* Buffer containing token */
   153    int nToken,                     /* Size of token in bytes */
   154    int iStartOff,                  /* Start offset of token */
   155    int iEndOff                     /* End offset of token */
   156  ){
   157    HighlightContext *p = (HighlightContext*)pContext;
   158    int rc = SQLITE_OK;
   159    int iPos;
   160  
   161    UNUSED_PARAM2(pToken, nToken);
   162  
   163    if( tflags & FTS5_TOKEN_COLOCATED ) return SQLITE_OK;
   164    iPos = p->iPos++;
   165  
   166    if( p->iRangeEnd>0 ){
   167      if( iPos<p->iRangeStart || iPos>p->iRangeEnd ) return SQLITE_OK;
   168      if( p->iRangeStart && iPos==p->iRangeStart ) p->iOff = iStartOff;
   169    }
   170  
   171    if( iPos==p->iter.iStart ){
   172      fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iStartOff - p->iOff);
   173      fts5HighlightAppend(&rc, p, p->zOpen, -1);
   174      p->iOff = iStartOff;
   175    }
   176  
   177    if( iPos==p->iter.iEnd ){
   178      if( p->iRangeEnd && p->iter.iStart<p->iRangeStart ){
   179        fts5HighlightAppend(&rc, p, p->zOpen, -1);
   180      }
   181      fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff);
   182      fts5HighlightAppend(&rc, p, p->zClose, -1);
   183      p->iOff = iEndOff;
   184      if( rc==SQLITE_OK ){
   185        rc = fts5CInstIterNext(&p->iter);
   186      }
   187    }
   188  
   189    if( p->iRangeEnd>0 && iPos==p->iRangeEnd ){
   190      fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff);
   191      p->iOff = iEndOff;
   192      if( iPos>=p->iter.iStart && iPos<p->iter.iEnd ){
   193        fts5HighlightAppend(&rc, p, p->zClose, -1);
   194      }
   195    }
   196  
   197    return rc;
   198  }
   199  
   200  /*
   201  ** Implementation of highlight() function.
   202  */
   203  static void fts5HighlightFunction(
   204    const Fts5ExtensionApi *pApi,   /* API offered by current FTS version */
   205    Fts5Context *pFts,              /* First arg to pass to pApi functions */
   206    sqlite3_context *pCtx,          /* Context for returning result/error */
   207    int nVal,                       /* Number of values in apVal[] array */
   208    sqlite3_value **apVal           /* Array of trailing arguments */
   209  ){
   210    HighlightContext ctx;
   211    int rc;
   212    int iCol;
   213  
   214    if( nVal!=3 ){
   215      const char *zErr = "wrong number of arguments to function highlight()";
   216      sqlite3_result_error(pCtx, zErr, -1);
   217      return;
   218    }
   219  
   220    iCol = sqlite3_value_int(apVal[0]);
   221    memset(&ctx, 0, sizeof(HighlightContext));
   222    ctx.zOpen = (const char*)sqlite3_value_text(apVal[1]);
   223    ctx.zClose = (const char*)sqlite3_value_text(apVal[2]);
   224    rc = pApi->xColumnText(pFts, iCol, &ctx.zIn, &ctx.nIn);
   225  
   226    if( ctx.zIn ){
   227      if( rc==SQLITE_OK ){
   228        rc = fts5CInstIterInit(pApi, pFts, iCol, &ctx.iter);
   229      }
   230  
   231      if( rc==SQLITE_OK ){
   232        rc = pApi->xTokenize(pFts, ctx.zIn, ctx.nIn, (void*)&ctx,fts5HighlightCb);
   233      }
   234      fts5HighlightAppend(&rc, &ctx, &ctx.zIn[ctx.iOff], ctx.nIn - ctx.iOff);
   235  
   236      if( rc==SQLITE_OK ){
   237        sqlite3_result_text(pCtx, (const char*)ctx.zOut, -1, SQLITE_TRANSIENT);
   238      }
   239      sqlite3_free(ctx.zOut);
   240    }
   241    if( rc!=SQLITE_OK ){
   242      sqlite3_result_error_code(pCtx, rc);
   243    }
   244  }
   245  /*
   246  ** End of highlight() implementation.
   247  **************************************************************************/
   248  
   249  /*
   250  ** Context object passed to the fts5SentenceFinderCb() function.
   251  */
   252  typedef struct Fts5SFinder Fts5SFinder;
   253  struct Fts5SFinder {
   254    int iPos;                       /* Current token position */
   255    int nFirstAlloc;                /* Allocated size of aFirst[] */
   256    int nFirst;                     /* Number of entries in aFirst[] */
   257    int *aFirst;                    /* Array of first token in each sentence */
   258    const char *zDoc;               /* Document being tokenized */
   259  };
   260  
   261  /*
   262  ** Add an entry to the Fts5SFinder.aFirst[] array. Grow the array if
   263  ** necessary. Return SQLITE_OK if successful, or SQLITE_NOMEM if an
   264  ** error occurs.
   265  */
   266  static int fts5SentenceFinderAdd(Fts5SFinder *p, int iAdd){
   267    if( p->nFirstAlloc==p->nFirst ){
   268      int nNew = p->nFirstAlloc ? p->nFirstAlloc*2 : 64;
   269      int *aNew;
   270  
   271      aNew = (int*)sqlite3_realloc(p->aFirst, nNew*sizeof(int));
   272      if( aNew==0 ) return SQLITE_NOMEM;
   273      p->aFirst = aNew;
   274      p->nFirstAlloc = nNew;
   275    }
   276    p->aFirst[p->nFirst++] = iAdd;
   277    return SQLITE_OK;
   278  }
   279  
   280  /*
   281  ** This function is an xTokenize() callback used by the auxiliary snippet()
   282  ** function. Its job is to identify tokens that are the first in a sentence.
   283  ** For each such token, an entry is added to the SFinder.aFirst[] array.
   284  */
   285  static int fts5SentenceFinderCb(
   286    void *pContext,                 /* Pointer to HighlightContext object */
   287    int tflags,                     /* Mask of FTS5_TOKEN_* flags */
   288    const char *pToken,             /* Buffer containing token */
   289    int nToken,                     /* Size of token in bytes */
   290    int iStartOff,                  /* Start offset of token */
   291    int iEndOff                     /* End offset of token */
   292  ){
   293    int rc = SQLITE_OK;
   294  
   295    UNUSED_PARAM2(pToken, nToken);
   296    UNUSED_PARAM(iEndOff);
   297  
   298    if( (tflags & FTS5_TOKEN_COLOCATED)==0 ){
   299      Fts5SFinder *p = (Fts5SFinder*)pContext;
   300      if( p->iPos>0 ){
   301        int i;
   302        char c = 0;
   303        for(i=iStartOff-1; i>=0; i--){
   304          c = p->zDoc[i];
   305          if( c!=' ' && c!='\t' && c!='\n' && c!='\r' ) break;
   306        }
   307        if( i!=iStartOff-1 && (c=='.' || c==':') ){
   308          rc = fts5SentenceFinderAdd(p, p->iPos);
   309        }
   310      }else{
   311        rc = fts5SentenceFinderAdd(p, 0);
   312      }
   313      p->iPos++;
   314    }
   315    return rc;
   316  }
   317  
   318  static int fts5SnippetScore(
   319    const Fts5ExtensionApi *pApi,   /* API offered by current FTS version */
   320    Fts5Context *pFts,              /* First arg to pass to pApi functions */
   321    int nDocsize,                   /* Size of column in tokens */
   322    unsigned char *aSeen,           /* Array with one element per query phrase */
   323    int iCol,                       /* Column to score */
   324    int iPos,                       /* Starting offset to score */
   325    int nToken,                     /* Max tokens per snippet */
   326    int *pnScore,                   /* OUT: Score */
   327    int *piPos                      /* OUT: Adjusted offset */
   328  ){
   329    int rc;
   330    int i;
   331    int ip = 0;
   332    int ic = 0;
   333    int iOff = 0;
   334    int iFirst = -1;
   335    int nInst;
   336    int nScore = 0;
   337    int iLast = 0;
   338  
   339    rc = pApi->xInstCount(pFts, &nInst);
   340    for(i=0; i<nInst && rc==SQLITE_OK; i++){
   341      rc = pApi->xInst(pFts, i, &ip, &ic, &iOff);
   342      if( rc==SQLITE_OK && ic==iCol && iOff>=iPos && iOff<(iPos+nToken) ){
   343        nScore += (aSeen[ip] ? 1 : 1000);
   344        aSeen[ip] = 1;
   345        if( iFirst<0 ) iFirst = iOff;
   346        iLast = iOff + pApi->xPhraseSize(pFts, ip);
   347      }
   348    }
   349  
   350    *pnScore = nScore;
   351    if( piPos ){
   352      int iAdj = iFirst - (nToken - (iLast-iFirst)) / 2;
   353      if( (iAdj+nToken)>nDocsize ) iAdj = nDocsize - nToken;
   354      if( iAdj<0 ) iAdj = 0;
   355      *piPos = iAdj;
   356    }
   357  
   358    return rc;
   359  }
   360  
   361  /*
   362  ** Implementation of snippet() function.
   363  */
   364  static void fts5SnippetFunction(
   365    const Fts5ExtensionApi *pApi,   /* API offered by current FTS version */
   366    Fts5Context *pFts,              /* First arg to pass to pApi functions */
   367    sqlite3_context *pCtx,          /* Context for returning result/error */
   368    int nVal,                       /* Number of values in apVal[] array */
   369    sqlite3_value **apVal           /* Array of trailing arguments */
   370  ){
   371    HighlightContext ctx;
   372    int rc = SQLITE_OK;             /* Return code */
   373    int iCol;                       /* 1st argument to snippet() */
   374    const char *zEllips;            /* 4th argument to snippet() */
   375    int nToken;                     /* 5th argument to snippet() */
   376    int nInst = 0;                  /* Number of instance matches this row */
   377    int i;                          /* Used to iterate through instances */
   378    int nPhrase;                    /* Number of phrases in query */
   379    unsigned char *aSeen;           /* Array of "seen instance" flags */
   380    int iBestCol;                   /* Column containing best snippet */
   381    int iBestStart = 0;             /* First token of best snippet */
   382    int nBestScore = 0;             /* Score of best snippet */
   383    int nColSize = 0;               /* Total size of iBestCol in tokens */
   384    Fts5SFinder sFinder;            /* Used to find the beginnings of sentences */
   385    int nCol;
   386  
   387    if( nVal!=5 ){
   388      const char *zErr = "wrong number of arguments to function snippet()";
   389      sqlite3_result_error(pCtx, zErr, -1);
   390      return;
   391    }
   392  
   393    nCol = pApi->xColumnCount(pFts);
   394    memset(&ctx, 0, sizeof(HighlightContext));
   395    iCol = sqlite3_value_int(apVal[0]);
   396    ctx.zOpen = (const char*)sqlite3_value_text(apVal[1]);
   397    ctx.zClose = (const char*)sqlite3_value_text(apVal[2]);
   398    zEllips = (const char*)sqlite3_value_text(apVal[3]);
   399    nToken = sqlite3_value_int(apVal[4]);
   400  
   401    iBestCol = (iCol>=0 ? iCol : 0);
   402    nPhrase = pApi->xPhraseCount(pFts);
   403    aSeen = sqlite3_malloc(nPhrase);
   404    if( aSeen==0 ){
   405      rc = SQLITE_NOMEM;
   406    }
   407    if( rc==SQLITE_OK ){
   408      rc = pApi->xInstCount(pFts, &nInst);
   409    }
   410  
   411    memset(&sFinder, 0, sizeof(Fts5SFinder));
   412    for(i=0; i<nCol; i++){
   413      if( iCol<0 || iCol==i ){
   414        int nDoc;
   415        int nDocsize;
   416        int ii;
   417        sFinder.iPos = 0;
   418        sFinder.nFirst = 0;
   419        rc = pApi->xColumnText(pFts, i, &sFinder.zDoc, &nDoc);
   420        if( rc!=SQLITE_OK ) break;
   421        rc = pApi->xTokenize(pFts, 
   422            sFinder.zDoc, nDoc, (void*)&sFinder,fts5SentenceFinderCb
   423        );
   424        if( rc!=SQLITE_OK ) break;
   425        rc = pApi->xColumnSize(pFts, i, &nDocsize);
   426        if( rc!=SQLITE_OK ) break;
   427  
   428        for(ii=0; rc==SQLITE_OK && ii<nInst; ii++){
   429          int ip, ic, io;
   430          int iAdj;
   431          int nScore;
   432          int jj;
   433  
   434          rc = pApi->xInst(pFts, ii, &ip, &ic, &io);
   435          if( ic!=i || rc!=SQLITE_OK ) continue;
   436          memset(aSeen, 0, nPhrase);
   437          rc = fts5SnippetScore(pApi, pFts, nDocsize, aSeen, i,
   438              io, nToken, &nScore, &iAdj
   439          );
   440          if( rc==SQLITE_OK && nScore>nBestScore ){
   441            nBestScore = nScore;
   442            iBestCol = i;
   443            iBestStart = iAdj;
   444            nColSize = nDocsize;
   445          }
   446  
   447          if( rc==SQLITE_OK && sFinder.nFirst && nDocsize>nToken ){
   448            for(jj=0; jj<(sFinder.nFirst-1); jj++){
   449              if( sFinder.aFirst[jj+1]>io ) break;
   450            }
   451  
   452            if( sFinder.aFirst[jj]<io ){
   453              memset(aSeen, 0, nPhrase);
   454              rc = fts5SnippetScore(pApi, pFts, nDocsize, aSeen, i, 
   455                sFinder.aFirst[jj], nToken, &nScore, 0
   456              );
   457  
   458              nScore += (sFinder.aFirst[jj]==0 ? 120 : 100);
   459              if( rc==SQLITE_OK && nScore>nBestScore ){
   460                nBestScore = nScore;
   461                iBestCol = i;
   462                iBestStart = sFinder.aFirst[jj];
   463                nColSize = nDocsize;
   464              }
   465            }
   466          }
   467        }
   468      }
   469    }
   470  
   471    if( rc==SQLITE_OK ){
   472      rc = pApi->xColumnText(pFts, iBestCol, &ctx.zIn, &ctx.nIn);
   473    }
   474    if( rc==SQLITE_OK && nColSize==0 ){
   475      rc = pApi->xColumnSize(pFts, iBestCol, &nColSize);
   476    }
   477    if( ctx.zIn ){
   478      if( rc==SQLITE_OK ){
   479        rc = fts5CInstIterInit(pApi, pFts, iBestCol, &ctx.iter);
   480      }
   481  
   482      ctx.iRangeStart = iBestStart;
   483      ctx.iRangeEnd = iBestStart + nToken - 1;
   484  
   485      if( iBestStart>0 ){
   486        fts5HighlightAppend(&rc, &ctx, zEllips, -1);
   487      }
   488  
   489      /* Advance iterator ctx.iter so that it points to the first coalesced
   490      ** phrase instance at or following position iBestStart. */
   491      while( ctx.iter.iStart>=0 && ctx.iter.iStart<iBestStart && rc==SQLITE_OK ){
   492        rc = fts5CInstIterNext(&ctx.iter);
   493      }
   494  
   495      if( rc==SQLITE_OK ){
   496        rc = pApi->xTokenize(pFts, ctx.zIn, ctx.nIn, (void*)&ctx,fts5HighlightCb);
   497      }
   498      if( ctx.iRangeEnd>=(nColSize-1) ){
   499        fts5HighlightAppend(&rc, &ctx, &ctx.zIn[ctx.iOff], ctx.nIn - ctx.iOff);
   500      }else{
   501        fts5HighlightAppend(&rc, &ctx, zEllips, -1);
   502      }
   503    }
   504    if( rc==SQLITE_OK ){
   505      sqlite3_result_text(pCtx, (const char*)ctx.zOut, -1, SQLITE_TRANSIENT);
   506    }else{
   507      sqlite3_result_error_code(pCtx, rc);
   508    }
   509    sqlite3_free(ctx.zOut);
   510    sqlite3_free(aSeen);
   511    sqlite3_free(sFinder.aFirst);
   512  }
   513  
   514  /************************************************************************/
   515  
   516  /*
   517  ** The first time the bm25() function is called for a query, an instance
   518  ** of the following structure is allocated and populated.
   519  */
   520  typedef struct Fts5Bm25Data Fts5Bm25Data;
   521  struct Fts5Bm25Data {
   522    int nPhrase;                    /* Number of phrases in query */
   523    double avgdl;                   /* Average number of tokens in each row */
   524    double *aIDF;                   /* IDF for each phrase */
   525    double *aFreq;                  /* Array used to calculate phrase freq. */
   526  };
   527  
   528  /*
   529  ** Callback used by fts5Bm25GetData() to count the number of rows in the
   530  ** table matched by each individual phrase within the query.
   531  */
   532  static int fts5CountCb(
   533    const Fts5ExtensionApi *pApi, 
   534    Fts5Context *pFts,
   535    void *pUserData                 /* Pointer to sqlite3_int64 variable */
   536  ){
   537    sqlite3_int64 *pn = (sqlite3_int64*)pUserData;
   538    UNUSED_PARAM2(pApi, pFts);
   539    (*pn)++;
   540    return SQLITE_OK;
   541  }
   542  
   543  /*
   544  ** Set *ppData to point to the Fts5Bm25Data object for the current query. 
   545  ** If the object has not already been allocated, allocate and populate it
   546  ** now.
   547  */
   548  static int fts5Bm25GetData(
   549    const Fts5ExtensionApi *pApi, 
   550    Fts5Context *pFts,
   551    Fts5Bm25Data **ppData           /* OUT: bm25-data object for this query */
   552  ){
   553    int rc = SQLITE_OK;             /* Return code */
   554    Fts5Bm25Data *p;                /* Object to return */
   555  
   556    p = pApi->xGetAuxdata(pFts, 0);
   557    if( p==0 ){
   558      int nPhrase;                  /* Number of phrases in query */
   559      sqlite3_int64 nRow = 0;       /* Number of rows in table */
   560      sqlite3_int64 nToken = 0;     /* Number of tokens in table */
   561      int nByte;                    /* Bytes of space to allocate */
   562      int i;
   563  
   564      /* Allocate the Fts5Bm25Data object */
   565      nPhrase = pApi->xPhraseCount(pFts);
   566      nByte = sizeof(Fts5Bm25Data) + nPhrase*2*sizeof(double);
   567      p = (Fts5Bm25Data*)sqlite3_malloc(nByte);
   568      if( p==0 ){
   569        rc = SQLITE_NOMEM;
   570      }else{
   571        memset(p, 0, nByte);
   572        p->nPhrase = nPhrase;
   573        p->aIDF = (double*)&p[1];
   574        p->aFreq = &p->aIDF[nPhrase];
   575      }
   576  
   577      /* Calculate the average document length for this FTS5 table */
   578      if( rc==SQLITE_OK ) rc = pApi->xRowCount(pFts, &nRow);
   579      if( rc==SQLITE_OK ) rc = pApi->xColumnTotalSize(pFts, -1, &nToken);
   580      if( rc==SQLITE_OK ) p->avgdl = (double)nToken  / (double)nRow;
   581  
   582      /* Calculate an IDF for each phrase in the query */
   583      for(i=0; rc==SQLITE_OK && i<nPhrase; i++){
   584        sqlite3_int64 nHit = 0;
   585        rc = pApi->xQueryPhrase(pFts, i, (void*)&nHit, fts5CountCb);
   586        if( rc==SQLITE_OK ){
   587          /* Calculate the IDF (Inverse Document Frequency) for phrase i.
   588          ** This is done using the standard BM25 formula as found on wikipedia:
   589          **
   590          **   IDF = log( (N - nHit + 0.5) / (nHit + 0.5) )
   591          **
   592          ** where "N" is the total number of documents in the set and nHit
   593          ** is the number that contain at least one instance of the phrase
   594          ** under consideration.
   595          **
   596          ** The problem with this is that if (N < 2*nHit), the IDF is 
   597          ** negative. Which is undesirable. So the mimimum allowable IDF is
   598          ** (1e-6) - roughly the same as a term that appears in just over
   599          ** half of set of 5,000,000 documents.  */
   600          double idf = log( (nRow - nHit + 0.5) / (nHit + 0.5) );
   601          if( idf<=0.0 ) idf = 1e-6;
   602          p->aIDF[i] = idf;
   603        }
   604      }
   605  
   606      if( rc!=SQLITE_OK ){
   607        sqlite3_free(p);
   608      }else{
   609        rc = pApi->xSetAuxdata(pFts, p, sqlite3_free);
   610      }
   611      if( rc!=SQLITE_OK ) p = 0;
   612    }
   613    *ppData = p;
   614    return rc;
   615  }
   616  
   617  /*
   618  ** Implementation of bm25() function.
   619  */
   620  static void fts5Bm25Function(
   621    const Fts5ExtensionApi *pApi,   /* API offered by current FTS version */
   622    Fts5Context *pFts,              /* First arg to pass to pApi functions */
   623    sqlite3_context *pCtx,          /* Context for returning result/error */
   624    int nVal,                       /* Number of values in apVal[] array */
   625    sqlite3_value **apVal           /* Array of trailing arguments */
   626  ){
   627    const double k1 = 1.2;          /* Constant "k1" from BM25 formula */
   628    const double b = 0.75;          /* Constant "b" from BM25 formula */
   629    int rc = SQLITE_OK;             /* Error code */
   630    double score = 0.0;             /* SQL function return value */
   631    Fts5Bm25Data *pData;            /* Values allocated/calculated once only */
   632    int i;                          /* Iterator variable */
   633    int nInst = 0;                  /* Value returned by xInstCount() */
   634    double D = 0.0;                 /* Total number of tokens in row */
   635    double *aFreq = 0;              /* Array of phrase freq. for current row */
   636  
   637    /* Calculate the phrase frequency (symbol "f(qi,D)" in the documentation)
   638    ** for each phrase in the query for the current row. */
   639    rc = fts5Bm25GetData(pApi, pFts, &pData);
   640    if( rc==SQLITE_OK ){
   641      aFreq = pData->aFreq;
   642      memset(aFreq, 0, sizeof(double) * pData->nPhrase);
   643      rc = pApi->xInstCount(pFts, &nInst);
   644    }
   645    for(i=0; rc==SQLITE_OK && i<nInst; i++){
   646      int ip; int ic; int io;
   647      rc = pApi->xInst(pFts, i, &ip, &ic, &io);
   648      if( rc==SQLITE_OK ){
   649        double w = (nVal > ic) ? sqlite3_value_double(apVal[ic]) : 1.0;
   650        aFreq[ip] += w;
   651      }
   652    }
   653  
   654    /* Figure out the total size of the current row in tokens. */
   655    if( rc==SQLITE_OK ){
   656      int nTok;
   657      rc = pApi->xColumnSize(pFts, -1, &nTok);
   658      D = (double)nTok;
   659    }
   660  
   661    /* Determine the BM25 score for the current row. */
   662    for(i=0; rc==SQLITE_OK && i<pData->nPhrase; i++){
   663      score += pData->aIDF[i] * (
   664        ( aFreq[i] * (k1 + 1.0) ) / 
   665        ( aFreq[i] + k1 * (1 - b + b * D / pData->avgdl) )
   666      );
   667    }
   668    
   669    /* If no error has occurred, return the calculated score. Otherwise,
   670    ** throw an SQL exception.  */
   671    if( rc==SQLITE_OK ){
   672      sqlite3_result_double(pCtx, -1.0 * score);
   673    }else{
   674      sqlite3_result_error_code(pCtx, rc);
   675    }
   676  }
   677  
   678  int sqlite3Fts5AuxInit(fts5_api *pApi){
   679    struct Builtin {
   680      const char *zFunc;            /* Function name (nul-terminated) */
   681      void *pUserData;              /* User-data pointer */
   682      fts5_extension_function xFunc;/* Callback function */
   683      void (*xDestroy)(void*);      /* Destructor function */
   684    } aBuiltin [] = {
   685      { "snippet",   0, fts5SnippetFunction, 0 },
   686      { "highlight", 0, fts5HighlightFunction, 0 },
   687      { "bm25",      0, fts5Bm25Function,    0 },
   688    };
   689    int rc = SQLITE_OK;             /* Return code */
   690    int i;                          /* To iterate through builtin functions */
   691  
   692    for(i=0; rc==SQLITE_OK && i<ArraySize(aBuiltin); i++){
   693      rc = pApi->xCreateFunction(pApi,
   694          aBuiltin[i].zFunc,
   695          aBuiltin[i].pUserData,
   696          aBuiltin[i].xFunc,
   697          aBuiltin[i].xDestroy
   698      );
   699    }
   700  
   701    return rc;
   702  }
   703  
   704