modernc.org/cc@v1.0.1/v2/testdata/_sqlite/ext/fts3/fts3_test.c (about)

     1  /*
     2  ** 2011 Jun 13
     3  **
     4  ** The author disclaims copyright to this source code.  In place of
     5  ** a legal notice, here is a blessing:
     6  **
     7  **    May you do good and not evil.
     8  **    May you find forgiveness for yourself and forgive others.
     9  **    May you share freely, never taking more than you give.
    10  **
    11  ******************************************************************************
    12  **
    13  ** This file is not part of the production FTS code. It is only used for
    14  ** testing. It contains a Tcl command that can be used to test if a document
    15  ** matches an FTS NEAR expression.
    16  **
    17  ** As of March 2012, it also contains a version 1 tokenizer used for testing
    18  ** that the sqlite3_tokenizer_module.xLanguage() method is invoked correctly.
    19  */
    20  
    21  #if defined(INCLUDE_SQLITE_TCL_H)
    22  #  include "sqlite_tcl.h"
    23  #else
    24  #  include "tcl.h"
    25  #  ifndef SQLITE_TCLAPI
    26  #    define SQLITE_TCLAPI
    27  #  endif
    28  #endif
    29  #include <string.h>
    30  #include <assert.h>
    31  
    32  #if defined(SQLITE_TEST)
    33  #if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)
    34  
    35  /* Required so that the "ifdef SQLITE_ENABLE_FTS3" below works */
    36  #include "fts3Int.h"
    37  
    38  #define NM_MAX_TOKEN 12
    39  
typedef struct NearPhrase NearPhrase;
typedef struct NearDocument NearDocument;
typedef struct NearToken NearToken;

/* A tokenized document: an ordered array of tokens. */
struct NearDocument {
  int nToken;                     /* Number of entries in aToken[] */
  NearToken *aToken;              /* Token array */
};

/* A single token. The text is not copied - z points into memory owned
** by the Tcl objects passed to fts3_near_match.  */
struct NearToken {
  int n;                          /* Length of token in bytes */
  const char *z;                  /* Pointer to token string */
};

/* One phrase of a NEAR expression (at most NM_MAX_TOKEN tokens). */
struct NearPhrase {
  int nNear;                      /* Preceding NEAR value */
  int nToken;                     /* Number of tokens in this phrase */
  NearToken aToken[NM_MAX_TOKEN]; /* Array of tokens in this phrase */
};
    59  
    60  static int nm_phrase_match(
    61    NearPhrase *p,
    62    NearToken *aToken
    63  ){
    64    int ii;
    65  
    66    for(ii=0; ii<p->nToken; ii++){
    67      NearToken *pToken = &p->aToken[ii];
    68      if( pToken->n>0 && pToken->z[pToken->n-1]=='*' ){
    69        if( aToken[ii].n<(pToken->n-1) ) return 0;
    70        if( memcmp(aToken[ii].z, pToken->z, pToken->n-1) ) return 0;
    71      }else{
    72        if( aToken[ii].n!=pToken->n ) return 0;
    73        if( memcmp(aToken[ii].z, pToken->z, pToken->n) ) return 0;
    74      }
    75    }
    76  
    77    return 1;
    78  }
    79  
/*
** Phrase aPhrase[iPhrase] has been found at token offset iPos of pDoc.
** Attempt to extend the chain of NEAR matches to the adjacent phrase in
** direction iDir (+1 = towards the end of aPhrase[], -1 = towards the
** start), recursing until the end of the array is reached. Return 1 if
** the entire chain in that direction can be satisfied, or 0 if not.
*/
static int nm_near_chain(
  int iDir,                       /* Direction to iterate through aPhrase[] */
  NearDocument *pDoc,             /* Document to match against */
  int iPos,                       /* Position at which iPhrase was found */
  int nPhrase,                    /* Size of phrase array */
  NearPhrase *aPhrase,            /* Phrase array */
  int iPhrase                     /* Index of phrase found */
){
  int iStart;                     /* First candidate offset for phrase p */
  int iStop;                      /* Last candidate offset for phrase p */
  int ii;
  int nNear;                      /* NEAR distance linking the two phrases */
  int iPhrase2;                   /* Index of the adjacent phrase */
  NearPhrase *p;                  /* The adjacent phrase */
  NearPhrase *pPrev;              /* The phrase found at iPos */

  assert( iDir==1 || iDir==-1 );

  if( iDir==1 ){
    /* No phrase to the right - this end of the chain is satisfied. */
    if( (iPhrase+1)==nPhrase ) return 1;
    nNear = aPhrase[iPhrase+1].nNear;
  }else{
    /* No phrase to the left - this end of the chain is satisfied. */
    if( iPhrase==0 ) return 1;
    nNear = aPhrase[iPhrase].nNear;
  }
  pPrev = &aPhrase[iPhrase];
  iPhrase2 = iPhrase+iDir;
  p = &aPhrase[iPhrase2];

  /* Candidate start offsets for p: within nNear tokens of the occurrence
  ** of pPrev at iPos, measured between the facing ends of the phrases.  */
  iStart = iPos - nNear - p->nToken;
  iStop = iPos + nNear + pPrev->nToken;

  /* Clamp the candidate range to the bounds of the document. */
  if( iStart<0 ) iStart = 0;
  if( iStop > pDoc->nToken - p->nToken ) iStop = pDoc->nToken - p->nToken;

  for(ii=iStart; ii<=iStop; ii++){
    if( nm_phrase_match(p, &pDoc->aToken[ii]) ){
      if( nm_near_chain(iDir, pDoc, ii, nPhrase, aPhrase, iPhrase2) ) return 1;
    }
  }

  return 0;
}
   123  
   124  static int nm_match_count(
   125    NearDocument *pDoc,             /* Document to match against */
   126    int nPhrase,                    /* Size of phrase array */
   127    NearPhrase *aPhrase,            /* Phrase array */
   128    int iPhrase                     /* Index of phrase to count matches for */
   129  ){
   130    int nOcc = 0;
   131    int ii;
   132    NearPhrase *p = &aPhrase[iPhrase];
   133  
   134    for(ii=0; ii<(pDoc->nToken + 1 - p->nToken); ii++){
   135      if( nm_phrase_match(p, &pDoc->aToken[ii]) ){
   136        /* Test forward NEAR chain (i>iPhrase) */
   137        if( 0==nm_near_chain(1, pDoc, ii, nPhrase, aPhrase, iPhrase) ) continue;
   138  
   139        /* Test reverse NEAR chain (i<iPhrase) */
   140        if( 0==nm_near_chain(-1, pDoc, ii, nPhrase, aPhrase, iPhrase) ) continue;
   141  
   142        /* This is a real match. Increment the counter. */
   143        nOcc++;
   144      }
   145    } 
   146  
   147    return nOcc;
   148  }
   149  
   150  /*
   151  ** Tclcmd: fts3_near_match DOCUMENT EXPR ?OPTIONS?
   152  */
/*
** Implementation of the [fts3_near_match DOCUMENT EXPR ?OPTIONS?] command.
**
** DOCUMENT is a Tcl list of tokens. EXPR is a Tcl list of alternating
** phrases (themselves token lists) and NEAR distances, e.g.
** {{a b} 5 {c}}. The command result is a boolean - true if the document
** matches the NEAR expression. If the -phrasecountvar OPTION is passed,
** the named Tcl variable is set to a list containing the number of
** matches for each phrase.
*/
static int SQLITE_TCLAPI fts3_near_match_cmd(
  ClientData clientData,
  Tcl_Interp *interp,
  int objc,
  Tcl_Obj *CONST objv[]
){
  int nTotal = 0;                 /* Total matches over all phrases */
  int rc;                         /* Tcl return code */
  int ii;                         /* Iterator variable */
  int nPhrase;                    /* Number of phrases in EXPR */
  NearPhrase *aPhrase = 0;        /* Parsed phrase array */
  NearDocument doc = {0, 0};      /* Parsed DOCUMENT argument */
  Tcl_Obj **apDocToken;           /* Elements of the DOCUMENT list */
  Tcl_Obj *pRet;                  /* List of per-phrase match counts */
  Tcl_Obj *pPhrasecount = 0;      /* Variable name from -phrasecountvar */
  
  Tcl_Obj **apExprToken;          /* Elements of the EXPR list */
  int nExprToken;                 /* Number of elements in EXPR */

  UNUSED_PARAMETER(clientData);

  /* Must have 3 or more arguments. */
  if( objc<3 || (objc%2)==0 ){
    Tcl_WrongNumArgs(interp, 1, objv, "DOCUMENT EXPR ?OPTION VALUE?...");
    rc = TCL_ERROR;
    goto near_match_out;
  }

  /* Process any trailing OPTION/VALUE argument pairs. */
  for(ii=3; ii<objc; ii+=2){
    enum NM_enum { NM_PHRASECOUNTS };
    struct TestnmSubcmd {
      char *zName;
      enum NM_enum eOpt;
    } aOpt[] = {
      { "-phrasecountvar", NM_PHRASECOUNTS },
      { 0, 0 }
    };
    int iOpt;
    if( Tcl_GetIndexFromObjStruct(
        interp, objv[ii], aOpt, sizeof(aOpt[0]), "option", 0, &iOpt) 
    ){
      /* No cleanup required - doc.aToken and aPhrase are still NULL. */
      return TCL_ERROR;
    }

    switch( aOpt[iOpt].eOpt ){
      case NM_PHRASECOUNTS:
        pPhrasecount = objv[ii+1];
        break;
    }
  }

  /* Parse the DOCUMENT argument into doc. The NearToken.z pointers point
  ** into the Tcl objects - no string data is copied.  */
  rc = Tcl_ListObjGetElements(interp, objv[1], &doc.nToken, &apDocToken);
  if( rc!=TCL_OK ) goto near_match_out;
  doc.aToken = (NearToken *)ckalloc(doc.nToken*sizeof(NearToken));
  for(ii=0; ii<doc.nToken; ii++){
    doc.aToken[ii].z = Tcl_GetStringFromObj(apDocToken[ii], &doc.aToken[ii].n);
  }

  /* Parse the EXPR argument: elements at even indices are phrases, those
  ** at odd indices are the NEAR distances separating them.  */
  rc = Tcl_ListObjGetElements(interp, objv[2], &nExprToken, &apExprToken);
  if( rc!=TCL_OK ) goto near_match_out;

  nPhrase = (nExprToken + 1) / 2;
  aPhrase = (NearPhrase *)ckalloc(nPhrase * sizeof(NearPhrase));
  memset(aPhrase, 0, nPhrase * sizeof(NearPhrase));
  for(ii=0; ii<nPhrase; ii++){
    Tcl_Obj *pPhrase = apExprToken[ii*2];
    Tcl_Obj **apToken;
    int nToken;
    int jj;

    rc = Tcl_ListObjGetElements(interp, pPhrase, &nToken, &apToken);
    if( rc!=TCL_OK ) goto near_match_out;
    if( nToken>NM_MAX_TOKEN ){
      Tcl_AppendResult(interp, "Too many tokens in phrase", 0);
      rc = TCL_ERROR;
      goto near_match_out;
    }
    for(jj=0; jj<nToken; jj++){
      NearToken *pT = &aPhrase[ii].aToken[jj];
      pT->z = Tcl_GetStringFromObj(apToken[jj], &pT->n);
    }
    aPhrase[ii].nToken = nToken;
  }
  for(ii=1; ii<nPhrase; ii++){
    Tcl_Obj *pNear = apExprToken[2*ii-1];
    int nNear;
    rc = Tcl_GetIntFromObj(interp, pNear, &nNear);
    if( rc!=TCL_OK ) goto near_match_out;
    aPhrase[ii].nNear = nNear;
  }

  /* Count the matches for each phrase and sum them. The document matches
  ** the expression iff the total is non-zero.  */
  pRet = Tcl_NewObj();
  Tcl_IncrRefCount(pRet);
  for(ii=0; ii<nPhrase; ii++){
    int nOcc = nm_match_count(&doc, nPhrase, aPhrase, ii);
    Tcl_ListObjAppendElement(interp, pRet, Tcl_NewIntObj(nOcc));
    nTotal += nOcc;
  }
  if( pPhrasecount ){
    Tcl_ObjSetVar2(interp, pPhrasecount, 0, pRet, 0);
  }
  Tcl_DecrRefCount(pRet);
  Tcl_SetObjResult(interp, Tcl_NewBooleanObj(nTotal>0));

 near_match_out: 
  ckfree((char *)aPhrase);
  ckfree((char *)doc.aToken);
  return rc;
}
   262  
   263  /*
   264  **   Tclcmd: fts3_configure_incr_load ?CHUNKSIZE THRESHOLD?
   265  **
   266  ** Normally, FTS uses hard-coded values to determine the minimum doclist
   267  ** size eligible for incremental loading, and the size of the chunks loaded
   268  ** when a doclist is incrementally loaded. This command allows the built-in
   269  ** values to be overridden for testing purposes.
   270  **
   271  ** If present, the first argument is the chunksize in bytes to load doclists
   272  ** in. The second argument is the minimum doclist size in bytes to use
   273  ** incremental loading with.
   274  **
   275  ** Whether or not the arguments are present, this command returns a list of
   276  ** two integers - the initial chunksize and threshold when the command is
   277  ** invoked. This can be used to restore the default behavior after running
   278  ** tests. For example:
   279  **
   280  **    # Override incr-load settings for testing:
   281  **    set cfg [fts3_configure_incr_load $new_chunksize $new_threshold]
   282  **
   283  **    .... run tests ....
   284  **
   285  **    # Restore initial incr-load settings:
   286  **    eval fts3_configure_incr_load $cfg
   287  */
   288  static int SQLITE_TCLAPI fts3_configure_incr_load_cmd(
   289    ClientData clientData,
   290    Tcl_Interp *interp,
   291    int objc,
   292    Tcl_Obj *CONST objv[]
   293  ){
   294  #ifdef SQLITE_ENABLE_FTS3
   295    extern int test_fts3_node_chunksize;
   296    extern int test_fts3_node_chunk_threshold;
   297    Tcl_Obj *pRet;
   298  
   299    if( objc!=1 && objc!=3 ){
   300      Tcl_WrongNumArgs(interp, 1, objv, "?CHUNKSIZE THRESHOLD?");
   301      return TCL_ERROR;
   302    }
   303  
   304    pRet = Tcl_NewObj();
   305    Tcl_IncrRefCount(pRet);
   306    Tcl_ListObjAppendElement(
   307        interp, pRet, Tcl_NewIntObj(test_fts3_node_chunksize));
   308    Tcl_ListObjAppendElement(
   309        interp, pRet, Tcl_NewIntObj(test_fts3_node_chunk_threshold));
   310  
   311    if( objc==3 ){
   312      int iArg1;
   313      int iArg2;
   314      if( Tcl_GetIntFromObj(interp, objv[1], &iArg1)
   315       || Tcl_GetIntFromObj(interp, objv[2], &iArg2)
   316      ){
   317        Tcl_DecrRefCount(pRet);
   318        return TCL_ERROR;
   319      }
   320      test_fts3_node_chunksize = iArg1;
   321      test_fts3_node_chunk_threshold = iArg2;
   322    }
   323  
   324    Tcl_SetObjResult(interp, pRet);
   325    Tcl_DecrRefCount(pRet);
   326  #endif
   327    UNUSED_PARAMETER(clientData);
   328    return TCL_OK;
   329  }
   330  
   331  #ifdef SQLITE_ENABLE_FTS3
   332  /**************************************************************************
   333  ** Beginning of test tokenizer code.
   334  **
   335  ** For language 0, this tokenizer is similar to the default 'simple' 
   336  ** tokenizer. For other languages L, the following:
   337  **
   338  **   * Odd numbered languages are case-sensitive. Even numbered 
   339  **     languages are not.
   340  **
   341  **   * Language ids 100 or greater are considered an error.
   342  **
   343  ** The implementation assumes that the input contains only ASCII characters
   344  ** (i.e. those that may be encoded in UTF-8 using a single byte).
   345  */
/* The test tokenizer itself carries no state beyond the required base. */
typedef struct test_tokenizer {
  sqlite3_tokenizer base;
} test_tokenizer;

/* A cursor open on a single input string. */
typedef struct test_tokenizer_cursor {
  sqlite3_tokenizer_cursor base;
  const char *aInput;          /* Input being tokenized */
  int nInput;                  /* Size of the input in bytes */
  int iInput;                  /* Current offset in aInput */
  int iToken;                  /* Index of next token to be returned */
  char *aBuffer;               /* Buffer containing current token */
  int nBuffer;                 /* Number of bytes allocated at aBuffer */
  int iLangid;                 /* Configured language id */
} test_tokenizer_cursor;
   360  
   361  static int testTokenizerCreate(
   362    int argc, const char * const *argv,
   363    sqlite3_tokenizer **ppTokenizer
   364  ){
   365    test_tokenizer *pNew;
   366    UNUSED_PARAMETER(argc);
   367    UNUSED_PARAMETER(argv);
   368  
   369    pNew = sqlite3_malloc(sizeof(test_tokenizer));
   370    if( !pNew ) return SQLITE_NOMEM;
   371    memset(pNew, 0, sizeof(test_tokenizer));
   372  
   373    *ppTokenizer = (sqlite3_tokenizer *)pNew;
   374    return SQLITE_OK;
   375  }
   376  
   377  static int testTokenizerDestroy(sqlite3_tokenizer *pTokenizer){
   378    test_tokenizer *p = (test_tokenizer *)pTokenizer;
   379    sqlite3_free(p);
   380    return SQLITE_OK;
   381  }
   382  
   383  static int testTokenizerOpen(
   384    sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
   385    const char *pInput, int nBytes,        /* String to be tokenized */
   386    sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
   387  ){
   388    int rc = SQLITE_OK;                    /* Return code */
   389    test_tokenizer_cursor *pCsr;           /* New cursor object */
   390  
   391    UNUSED_PARAMETER(pTokenizer);
   392  
   393    pCsr = (test_tokenizer_cursor *)sqlite3_malloc(sizeof(test_tokenizer_cursor));
   394    if( pCsr==0 ){
   395      rc = SQLITE_NOMEM;
   396    }else{
   397      memset(pCsr, 0, sizeof(test_tokenizer_cursor));
   398      pCsr->aInput = pInput;
   399      if( nBytes<0 ){
   400        pCsr->nInput = (int)strlen(pInput);
   401      }else{
   402        pCsr->nInput = nBytes;
   403      }
   404    }
   405  
   406    *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
   407    return rc;
   408  }
   409  
   410  static int testTokenizerClose(sqlite3_tokenizer_cursor *pCursor){
   411    test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor;
   412    sqlite3_free(pCsr->aBuffer);
   413    sqlite3_free(pCsr);
   414    return SQLITE_OK;
   415  }
   416  
   417  static int testIsTokenChar(char c){
   418    return (c>='a' && c<='z') || (c>='A' && c<='Z');
   419  }
   420  static int testTolower(char c){
   421    char ret = c;
   422    if( ret>='A' && ret<='Z') ret = ret - ('A'-'a');
   423    return ret;
   424  }
   425  
   426  static int testTokenizerNext(
   427    sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by testTokenizerOpen */
   428    const char **ppToken,               /* OUT: *ppToken is the token text */
   429    int *pnBytes,                       /* OUT: Number of bytes in token */
   430    int *piStartOffset,                 /* OUT: Starting offset of token */
   431    int *piEndOffset,                   /* OUT: Ending offset of token */
   432    int *piPosition                     /* OUT: Position integer of token */
   433  ){
   434    test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor;
   435    int rc = SQLITE_OK;
   436    const char *p;
   437    const char *pEnd;
   438  
   439    p = &pCsr->aInput[pCsr->iInput];
   440    pEnd = &pCsr->aInput[pCsr->nInput];
   441  
   442    /* Skip past any white-space */
   443    assert( p<=pEnd );
   444    while( p<pEnd && testIsTokenChar(*p)==0 ) p++;
   445  
   446    if( p==pEnd ){
   447      rc = SQLITE_DONE;
   448    }else{
   449      /* Advance to the end of the token */
   450      const char *pToken = p;
   451      int nToken;
   452      while( p<pEnd && testIsTokenChar(*p) ) p++;
   453      nToken = (int)(p-pToken);
   454  
   455      /* Copy the token into the buffer */
   456      if( nToken>pCsr->nBuffer ){
   457        sqlite3_free(pCsr->aBuffer);
   458        pCsr->aBuffer = sqlite3_malloc(nToken);
   459      }
   460      if( pCsr->aBuffer==0 ){
   461        rc = SQLITE_NOMEM;
   462      }else{
   463        int i;
   464  
   465        if( pCsr->iLangid & 0x00000001 ){
   466          for(i=0; i<nToken; i++) pCsr->aBuffer[i] = pToken[i];
   467        }else{
   468          for(i=0; i<nToken; i++) pCsr->aBuffer[i] = (char)testTolower(pToken[i]);
   469        }
   470        pCsr->iToken++;
   471        pCsr->iInput = (int)(p - pCsr->aInput);
   472  
   473        *ppToken = pCsr->aBuffer;
   474        *pnBytes = nToken;
   475        *piStartOffset = (int)(pToken - pCsr->aInput);
   476        *piEndOffset = (int)(p - pCsr->aInput);
   477        *piPosition = pCsr->iToken;
   478      }
   479    }
   480  
   481    return rc;
   482  }
   483  
   484  static int testTokenizerLanguage(
   485    sqlite3_tokenizer_cursor *pCursor,
   486    int iLangid
   487  ){
   488    int rc = SQLITE_OK;
   489    test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor;
   490    pCsr->iLangid = iLangid;
   491    if( pCsr->iLangid>=100 ){
   492      rc = SQLITE_ERROR;
   493    }
   494    return rc;
   495  }
   496  #endif
   497  
   498  static int SQLITE_TCLAPI fts3_test_tokenizer_cmd(
   499    ClientData clientData,
   500    Tcl_Interp *interp,
   501    int objc,
   502    Tcl_Obj *CONST objv[]
   503  ){
   504  #ifdef SQLITE_ENABLE_FTS3
   505    static const sqlite3_tokenizer_module testTokenizerModule = {
   506      1,
   507      testTokenizerCreate,
   508      testTokenizerDestroy,
   509      testTokenizerOpen,
   510      testTokenizerClose,
   511      testTokenizerNext,
   512      testTokenizerLanguage
   513    };
   514    const sqlite3_tokenizer_module *pPtr = &testTokenizerModule;
   515    if( objc!=1 ){
   516      Tcl_WrongNumArgs(interp, 1, objv, "");
   517      return TCL_ERROR;
   518    }
   519    Tcl_SetObjResult(interp, Tcl_NewByteArrayObj(
   520      (const unsigned char *)&pPtr, sizeof(sqlite3_tokenizer_module *)
   521    ));
   522  #endif
   523    UNUSED_PARAMETER(clientData);
   524    return TCL_OK;
   525  }
   526  
   527  static int SQLITE_TCLAPI fts3_test_varint_cmd(
   528    ClientData clientData,
   529    Tcl_Interp *interp,
   530    int objc,
   531    Tcl_Obj *CONST objv[]
   532  ){
   533  #ifdef SQLITE_ENABLE_FTS3
   534    char aBuf[24];
   535    int rc;
   536    Tcl_WideInt w;
   537    sqlite3_int64 w2;
   538    int nByte, nByte2;
   539  
   540    if( objc!=2 ){
   541      Tcl_WrongNumArgs(interp, 1, objv, "INTEGER");
   542      return TCL_ERROR;
   543    }
   544  
   545    rc = Tcl_GetWideIntFromObj(interp, objv[1], &w);
   546    if( rc!=TCL_OK ) return rc;
   547  
   548    nByte = sqlite3Fts3PutVarint(aBuf, w);
   549    nByte2 = sqlite3Fts3GetVarint(aBuf, &w2);
   550    if( w!=w2 || nByte!=nByte2 ){
   551      char *zErr = sqlite3_mprintf("error testing %lld", w);
   552      Tcl_ResetResult(interp);
   553      Tcl_AppendResult(interp, zErr, 0);
   554      return TCL_ERROR;
   555    }
   556  
   557    if( w<=2147483647 && w>=0 ){
   558      int i;
   559      nByte2 = fts3GetVarint32(aBuf, &i);
   560      if( (int)w!=i || nByte!=nByte2 ){
   561        char *zErr = sqlite3_mprintf("error testing %lld (32-bit)", w);
   562        Tcl_ResetResult(interp);
   563        Tcl_AppendResult(interp, zErr, 0);
   564        return TCL_ERROR;
   565      }
   566    }
   567  
   568  #endif
   569    UNUSED_PARAMETER(clientData);
   570    return TCL_OK;
   571  }
   572  
   573  /* 
   574  ** End of tokenizer code.
   575  **************************************************************************/ 
   576  
   577  int Sqlitetestfts3_Init(Tcl_Interp *interp){
   578    Tcl_CreateObjCommand(interp, "fts3_near_match", fts3_near_match_cmd, 0, 0);
   579    Tcl_CreateObjCommand(interp, 
   580        "fts3_configure_incr_load", fts3_configure_incr_load_cmd, 0, 0
   581    );
   582    Tcl_CreateObjCommand(
   583        interp, "fts3_test_tokenizer", fts3_test_tokenizer_cmd, 0, 0
   584    );
   585  
   586    Tcl_CreateObjCommand(
   587        interp, "fts3_test_varint", fts3_test_varint_cmd, 0, 0
   588    );
   589    return TCL_OK;
   590  }
   591  #endif                  /* SQLITE_ENABLE_FTS3 || SQLITE_ENABLE_FTS4 */
   592  #endif                  /* ifdef SQLITE_TEST */