modernc.org/cc@v1.0.1/v2/testdata/_sqlite/ext/fts1/fulltext.c (about)

     1  /* The author disclaims copyright to this source code.
     2   *
     3   * This is an SQLite module implementing full-text search.
     4   */
     5  
     6  #include <assert.h>
     7  #if !defined(__APPLE__)
     8  #include <malloc.h>
     9  #else
    10  #include <stdlib.h>
    11  #endif
    12  #include <stdio.h>
    13  #include <string.h>
    14  #include <ctype.h>
    15  
    16  #include "fulltext.h"
    17  #include "ft_hash.h"
    18  #include "tokenizer.h"
    19  #include "sqlite3.h"
    20  #include "sqlite3ext.h"
    21  SQLITE_EXTENSION_INIT1
    22  
    23  /* utility functions */
    24  
    25  /* We encode variable-length integers in little-endian order using seven bits
    26   * per byte as follows:
    27  **
    28  ** KEY:
    29  **         A = 0xxxxxxx    7 bits of data and one flag bit
    30  **         B = 1xxxxxxx    7 bits of data and one flag bit
    31  **
    32  **  7 bits - A
    33  ** 14 bits - BA
    34  ** 21 bits - BBA
    35  ** and so on.
    36  */
    37  
    38  /* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */
    39  #define VARINT_MAX 10
    40  
    41  /* Write a 64-bit variable-length integer to memory starting at p[0].
    42   * The length of data written will be between 1 and VARINT_MAX bytes.
    43   * The number of bytes written is returned. */
    44  static int putVarint(char *p, sqlite_int64 v){
    45    unsigned char *q = (unsigned char *) p;
    46    sqlite_uint64 vu = v;
    47    do{
    48      *q++ = (unsigned char) ((vu & 0x7f) | 0x80);
    49      vu >>= 7;
    50    }while( vu!=0 );
    51    q[-1] &= 0x7f;  /* turn off high bit in final byte */
    52    assert( q - (unsigned char *)p <= VARINT_MAX );
    53    return (int) (q - (unsigned char *)p);
    54  }
    55  
    56  /* Read a 64-bit variable-length integer from memory starting at p[0].
    57   * Return the number of bytes read, or 0 on error.
    58   * The value is stored in *v. */
    59  static int getVarint(const char *p, sqlite_int64 *v){
    60    const unsigned char *q = (const unsigned char *) p;
    61    sqlite_uint64 x = 0, y = 1;
    62    while( (*q & 0x80) == 0x80 ){
    63      x += y * (*q++ & 0x7f);
    64      y <<= 7;
    65      if( q - (unsigned char *)p >= VARINT_MAX ){  /* bad data */
    66        assert( 0 );
    67        return 0;
    68      }
    69    }
    70    x += y * (*q++);
    71    *v = (sqlite_int64) x;
    72    return (int) (q - (unsigned char *)p);
    73  }
    74  
    75  static int getVarint32(const char *p, int *pi){
    76   sqlite_int64 i;
    77   int ret = getVarint(p, &i);
    78   *pi = (int) i;
    79   assert( *pi==i );
    80   return ret;
    81  }
    82  
    83  /*** Document lists ***
    84   *
    85   * A document list holds a sorted list of varint-encoded document IDs.
    86   *
    87   * A doclist with type DL_POSITIONS_OFFSETS is stored like this:
    88   *
    89   * array {
    90   *   varint docid;
    91   *   array {
    92   *     varint position;     (delta from previous position plus 1, or 0 for end)
    93   *     varint startOffset;  (delta from previous startOffset)
    94   *     varint endOffset;    (delta from startOffset)
    95   *   }
    96   * }
    97   *
    98   * Here, array { X } means zero or more occurrences of X, adjacent in memory.
    99   *
   100   * A doclist with type DL_POSITIONS is like the above, but holds only docids
   101   * and positions without offset information.
   102   *
   103   * A doclist with type DL_DOCIDS is like the above, but holds only docids
   104   * without positions or offset information.
   105   *
   106   * On disk, every document list has positions and offsets, so we don't bother
   107   * to serialize a doclist's type.
   108   * 
   109   * We don't yet delta-encode document IDs; doing so will probably be a
   110   * modest win.
   111   *
   112   * NOTE(shess) I've thought of a slightly (1%) better offset encoding.
   113   * After the first offset, estimate the next offset by using the
   114   * current token position and the previous token position and offset,
   115   * offset to handle some variance.  So the estimate would be
   116   * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded
   117   * as normal.  Offsets more than 64 chars from the estimate are
   118   * encoded as the delta to the previous start offset + 128.  An
   119   * additional tiny increment can be gained by using the end offset of
   120   * the previous token to make the estimate a tiny bit more precise.
   121  */
   122  
   123  typedef enum DocListType {
   124    DL_DOCIDS,              /* docids only */
   125    DL_POSITIONS,           /* docids + positions */
   126    DL_POSITIONS_OFFSETS    /* docids + positions + offsets */
   127  } DocListType;
   128  
   129  typedef struct DocList {
   130    char *pData;
   131    int nData;
   132    DocListType iType;
   133    int iLastPos;       /* the last position written */
   134    int iLastOffset;    /* the last start offset written */
   135  } DocList;
   136  
   137  /* Initialize a new DocList to hold the given data. */
   138  static void docListInit(DocList *d, DocListType iType,
   139                          const char *pData, int nData){
   140    d->nData = nData;
   141    if( nData>0 ){
   142      d->pData = malloc(nData);
   143      memcpy(d->pData, pData, nData);
   144    } else {
   145      d->pData = NULL;
   146    }
   147    d->iType = iType;
   148    d->iLastPos = 0;
   149    d->iLastOffset = 0;
   150  }
   151  
   152  /* Create a new dynamically-allocated DocList. */
   153  static DocList *docListNew(DocListType iType){
   154    DocList *d = (DocList *) malloc(sizeof(DocList));
   155    docListInit(d, iType, 0, 0);
   156    return d;
   157  }
   158  
   159  static void docListDestroy(DocList *d){
   160    free(d->pData);
   161  #ifndef NDEBUG
   162    memset(d, 0x55, sizeof(*d));
   163  #endif
   164  }
   165  
   166  static void docListDelete(DocList *d){
   167    docListDestroy(d);
   168    free(d);
   169  }
   170  
   171  static char *docListEnd(DocList *d){
   172    return d->pData + d->nData;
   173  }
   174  
   175  /* Append a varint to a DocList's data. */
   176  static void appendVarint(DocList *d, sqlite_int64 i){
   177    char c[VARINT_MAX];
   178    int n = putVarint(c, i);
   179    d->pData = realloc(d->pData, d->nData + n);
   180    memcpy(d->pData + d->nData, c, n);
   181    d->nData += n;
   182  }
   183  
   184  static void docListAddDocid(DocList *d, sqlite_int64 iDocid){
   185    appendVarint(d, iDocid);
   186    d->iLastPos = 0;
   187  }
   188  
   189  /* Add a position to the last position list in a doclist. */
   190  static void docListAddPos(DocList *d, int iPos){
   191    assert( d->iType>=DL_POSITIONS );
   192    appendVarint(d, iPos-d->iLastPos+1);
   193    d->iLastPos = iPos;
   194  }
   195  
   196  static void docListAddPosOffset(DocList *d, int iPos,
   197                                  int iStartOffset, int iEndOffset){
   198    assert( d->iType==DL_POSITIONS_OFFSETS );
   199    docListAddPos(d, iPos);
   200    appendVarint(d, iStartOffset-d->iLastOffset);
   201    d->iLastOffset = iStartOffset;
   202    appendVarint(d, iEndOffset-iStartOffset);
   203  }
   204  
   205  /* Terminate the last position list in the given doclist. */
   206  static void docListAddEndPos(DocList *d){
   207    appendVarint(d, 0);
   208  }
   209  
   210  typedef struct DocListReader {
   211    DocList *pDoclist;
   212    char *p;
   213    int iLastPos;    /* the last position read */
   214  } DocListReader;
   215  
   216  static void readerInit(DocListReader *r, DocList *pDoclist){
   217    r->pDoclist = pDoclist;
   218    if( pDoclist!=NULL ){
   219      r->p = pDoclist->pData;
   220    }
   221    r->iLastPos = 0;
   222  }
   223  
   224  static int readerAtEnd(DocListReader *pReader){
   225    return pReader->p >= docListEnd(pReader->pDoclist);
   226  }
   227  
   228  /* Peek at the next docid without advancing the read pointer. */
   229  static sqlite_int64 peekDocid(DocListReader *pReader){
   230    sqlite_int64 ret;
   231    assert( !readerAtEnd(pReader) );
   232    getVarint(pReader->p, &ret);
   233    return ret;
   234  }
   235  
   236  /* Read the next docid. */
   237  static sqlite_int64 readDocid(DocListReader *pReader){
   238    sqlite_int64 ret;
   239    assert( !readerAtEnd(pReader) );
   240    pReader->p += getVarint(pReader->p, &ret);
   241    pReader->iLastPos = 0;
   242    return ret;
   243  }
   244  
   245  /* Read the next position from a position list.
   246   * Returns the position, or -1 at the end of the list. */
   247  static int readPosition(DocListReader *pReader){
   248    int i;
   249    int iType = pReader->pDoclist->iType;
   250    assert( iType>=DL_POSITIONS );
   251    assert( !readerAtEnd(pReader) );
   252  
   253    pReader->p += getVarint32(pReader->p, &i);
   254    if( i==0 ){
   255      pReader->iLastPos = -1;
   256      return -1;
   257    }
   258    pReader->iLastPos += ((int) i)-1;
   259    if( iType>=DL_POSITIONS_OFFSETS ){
   260      /* Skip over offsets, ignoring them for now. */
   261      int iStart, iEnd;
   262      pReader->p += getVarint32(pReader->p, &iStart);
   263      pReader->p += getVarint32(pReader->p, &iEnd);
   264    }
   265    return pReader->iLastPos;
   266  }
   267  
   268  /* Skip past the end of a position list. */
   269  static void skipPositionList(DocListReader *pReader){
   270    while( readPosition(pReader)!=-1 )
   271      ;
   272  }
   273  
   274  /* Skip over a docid, including its position list if the doclist has
   275   * positions. */
   276  static void skipDocument(DocListReader *pReader){
   277    readDocid(pReader);
   278    if( pReader->pDoclist->iType >= DL_POSITIONS ){
   279      skipPositionList(pReader);
   280    }
   281  }
   282  
   283  static sqlite_int64 firstDocid(DocList *d){
   284    DocListReader r;
   285    readerInit(&r, d);
   286    return readDocid(&r);
   287  }
   288  
   289  /* Doclist multi-tool.  Pass pUpdate==NULL to delete the indicated docid;
   290   * otherwise pUpdate, which must contain only the single docid [iDocid], is
   291   * inserted (if not present) or updated (if already present). */
   292  static int docListUpdate(DocList *d, sqlite_int64 iDocid, DocList *pUpdate){
   293    int modified = 0;
   294    DocListReader reader;
   295    char *p;
   296  
   297    if( pUpdate!=NULL ){
   298      assert( d->iType==pUpdate->iType);
   299      assert( iDocid==firstDocid(pUpdate) );
   300    }
   301  
   302    readerInit(&reader, d);
   303    while( !readerAtEnd(&reader) && peekDocid(&reader)<iDocid ){
   304      skipDocument(&reader);
   305    }
   306  
   307    p = reader.p;
   308    /* Delete if there is a matching element. */
   309    if( !readerAtEnd(&reader) && iDocid==peekDocid(&reader) ){
   310      skipDocument(&reader);
   311      memmove(p, reader.p, docListEnd(d) - reader.p);
   312      d->nData -= (reader.p - p);
   313      modified = 1;
   314    }
   315  
   316    /* Insert if indicated. */
   317    if( pUpdate!=NULL ){
   318      int iDoclist = p-d->pData;
   319      docListAddEndPos(pUpdate);
   320  
   321      d->pData = realloc(d->pData, d->nData+pUpdate->nData);
   322      p = d->pData + iDoclist;
   323  
   324      memmove(p+pUpdate->nData, p, docListEnd(d) - p);
   325      memcpy(p, pUpdate->pData, pUpdate->nData);
   326      d->nData += pUpdate->nData;
   327      modified = 1;
   328    }
   329  
   330    return modified;
   331  }
   332  
   333  /* Split the second half of doclist d into a separate doclist d2.  Returns 1
   334   * if successful, or 0 if d contains a single document and hence can't be
   335   * split. */
   336  static int docListSplit(DocList *d, DocList *d2){
   337    const char *pSplitPoint = d->pData + d->nData / 2;
   338    DocListReader reader;
   339  
   340    readerInit(&reader, d);
   341    while( reader.p<pSplitPoint ){
   342      skipDocument(&reader);
   343    }
   344    if( readerAtEnd(&reader) ) return 0;
   345    docListInit(d2, d->iType, reader.p, docListEnd(d) - reader.p);
   346    d->nData = reader.p - d->pData;
   347    d->pData = realloc(d->pData, d->nData);
   348    return 1;
   349  }
   350  
   351  /* A DocListMerge computes the AND of an in-memory DocList [in] and a chunked
   352   * on-disk doclist, resulting in another in-memory DocList [out].  [in]
   353   * and [out] may or may not store position information according to the
   354   * caller's wishes.  The on-disk doclist always comes with positions.
   355   *
   356   * The caller must read each chunk of the on-disk doclist in succession and
   357   * pass it to mergeBlock().
   358   *
   359   * If [in] has positions, then the merge output contains only documents with
   360   * matching positions in the two input doclists.  If [in] does not have
   361   * positions, then the merge output contains all documents common to the two
   362   * input doclists.
   363   *
   364   * If [in] is NULL, then the on-disk doclist is copied to [out] directly.
   365   *
   366   * A merge is performed using an integer [iOffset] provided by the caller.
   367   * [iOffset] is subtracted from each position in the on-disk doclist for the
   368   * purpose of position comparison; this is helpful in implementing phrase
   369   * searches.
   370   *
   371   * A DocListMerge is not yet able to propagate offsets through query
   372   * processing; we should add that capability soon.
   373  */
   374  typedef struct DocListMerge {
   375    DocListReader in;
   376    DocList *pOut;
   377    int iOffset;
   378  } DocListMerge;
   379  
   380  static void mergeInit(DocListMerge *m,
   381                        DocList *pIn, int iOffset, DocList *pOut){
   382    readerInit(&m->in, pIn);
   383    m->pOut = pOut;
   384    m->iOffset = iOffset;
   385  
   386    /* can't handle offsets yet */
   387    assert( pIn==NULL || pIn->iType <= DL_POSITIONS );
   388    assert( pOut->iType <= DL_POSITIONS );
   389  }
   390  
   391  /* A helper function for mergeBlock(), below.  Merge the position lists
   392   * pointed to by m->in and pBlockReader.
   393   * If the merge matches, write [iDocid] to m->pOut; if m->pOut
   394   * has positions then write all matching positions as well. */
   395  static void mergePosList(DocListMerge *m, sqlite_int64 iDocid,
   396                    DocListReader *pBlockReader){
   397    int block_pos = readPosition(pBlockReader);
   398    int in_pos = readPosition(&m->in);
   399    int match = 0;
   400    while( block_pos!=-1 || in_pos!=-1 ){
   401      if( block_pos-m->iOffset==in_pos ){
   402        if( !match ){
   403          docListAddDocid(m->pOut, iDocid);
   404          match = 1;
   405        }
   406        if( m->pOut->iType >= DL_POSITIONS ){
   407          docListAddPos(m->pOut, in_pos);
   408        }
   409        block_pos = readPosition(pBlockReader);
   410        in_pos = readPosition(&m->in);
   411      } else if( in_pos==-1 || (block_pos!=-1 && block_pos-m->iOffset<in_pos) ){
   412        block_pos = readPosition(pBlockReader);
   413      } else {
   414        in_pos = readPosition(&m->in);
   415      }
   416    }
   417    if( m->pOut->iType >= DL_POSITIONS && match ){
   418      docListAddEndPos(m->pOut);
   419    }
   420  }
   421  
   422  /* Merge one block of an on-disk doclist into a DocListMerge. */
   423  static void mergeBlock(DocListMerge *m, DocList *pBlock){
   424    DocListReader blockReader;
   425    assert( pBlock->iType >= DL_POSITIONS );
   426    readerInit(&blockReader, pBlock);
   427    while( !readerAtEnd(&blockReader) ){
   428      sqlite_int64 iDocid = readDocid(&blockReader);
   429      if( m->in.pDoclist!=NULL ){
   430        while( 1 ){
   431          if( readerAtEnd(&m->in) ) return;  /* nothing more to merge */
   432          if( peekDocid(&m->in)>=iDocid ) break;
   433          skipDocument(&m->in);
   434        }
   435        if( peekDocid(&m->in)>iDocid ){  /* [pIn] has no match with iDocid */
   436          skipPositionList(&blockReader);  /* skip this docid in the block */
   437          continue;
   438        }
   439        readDocid(&m->in);
   440      }
   441      /* We have a document match. */
   442      if( m->in.pDoclist==NULL || m->in.pDoclist->iType < DL_POSITIONS ){
   443        /* We don't need to do a poslist merge. */
   444        docListAddDocid(m->pOut, iDocid);
   445        if( m->pOut->iType >= DL_POSITIONS ){
   446          /* Copy all positions to the output doclist. */
   447          while( 1 ){
   448            int pos = readPosition(&blockReader);
   449            if( pos==-1 ) break;
   450            docListAddPos(m->pOut, pos);
   451          }
   452          docListAddEndPos(m->pOut);
   453        } else skipPositionList(&blockReader);
   454        continue;
   455      }
   456      mergePosList(m, iDocid, &blockReader);
   457    }
   458  }
   459  
   460  static char *string_dup_n(const char *s, int n){
   461    char *str = malloc(n + 1);
   462    memcpy(str, s, n);
   463    str[n] = '\0';
   464    return str;
   465  }
   466  
   467  /* Duplicate a string; the caller must free() the returned string.
   468   * (We don't use strdup() since it's not part of the standard C library and
   469   * may not be available everywhere.) */
   470  static char *string_dup(const char *s){
   471    return string_dup_n(s, strlen(s));
   472  }
   473  
   474  /* Format a string, replacing each occurrence of the % character with
   475   * zName.  This may be more convenient than sqlite_mprintf()
   476   * when one string is used repeatedly in a format string.
   477   * The caller must free() the returned string. */
   478  static char *string_format(const char *zFormat, const char *zName){
   479    const char *p;
   480    size_t len = 0;
   481    size_t nName = strlen(zName);
   482    char *result;
   483    char *r;
   484  
   485    /* first compute length needed */
   486    for(p = zFormat ; *p ; ++p){
   487      len += (*p=='%' ? nName : 1);
   488    }
   489    len += 1;  /* for null terminator */
   490  
   491    r = result = malloc(len);
   492    for(p = zFormat; *p; ++p){
   493      if( *p=='%' ){
   494        memcpy(r, zName, nName);
   495        r += nName;
   496      } else {
   497        *r++ = *p;
   498      }
   499    }
   500    *r++ = '\0';
   501    assert( r == result + len );
   502    return result;
   503  }
   504  
   505  static int sql_exec(sqlite3 *db, const char *zName, const char *zFormat){
   506    char *zCommand = string_format(zFormat, zName);
   507    int rc = sqlite3_exec(db, zCommand, NULL, 0, NULL);
   508    free(zCommand);
   509    return rc;
   510  }
   511  
   512  static int sql_prepare(sqlite3 *db, const char *zName, sqlite3_stmt **ppStmt,
   513                  const char *zFormat){
   514    char *zCommand = string_format(zFormat, zName);
   515    int rc = sqlite3_prepare(db, zCommand, -1, ppStmt, NULL);
   516    free(zCommand);
   517    return rc;
   518  }
   519  
   520  /* end utility functions */
   521  
   522  #define QUERY_GENERIC 0
   523  #define QUERY_FULLTEXT 1
   524  
   525  #define CHUNK_MAX 1024
   526  
   527  typedef enum fulltext_statement {
   528    CONTENT_INSERT_STMT,
   529    CONTENT_SELECT_STMT,
   530    CONTENT_DELETE_STMT,
   531  
   532    TERM_SELECT_STMT,
   533    TERM_CHUNK_SELECT_STMT,
   534    TERM_INSERT_STMT,
   535    TERM_UPDATE_STMT,
   536    TERM_DELETE_STMT,
   537  
   538    MAX_STMT                     /* Always at end! */
   539  } fulltext_statement;
   540  
   541  /* These must exactly match the enum above. */
   542  /* TODO(adam): Is there some risk that a statement (in particular,
   543  ** pTermSelectStmt) will be used in two cursors at once, e.g.  if a
   544  ** query joins a virtual table to itself?  If so perhaps we should
   545  ** move some of these to the cursor object.
   546  */
   547  static const char *fulltext_zStatement[MAX_STMT] = {
   548    /* CONTENT_INSERT */ "insert into %_content (rowid, content) values (?, ?)",
   549    /* CONTENT_SELECT */ "select content from %_content where rowid = ?",
   550    /* CONTENT_DELETE */ "delete from %_content where rowid = ?",
   551  
   552    /* TERM_SELECT */
   553    "select rowid, doclist from %_term where term = ? and first = ?",
   554    /* TERM_CHUNK_SELECT */
   555    "select max(first) from %_term where term = ? and first <= ?",
   556    /* TERM_INSERT */
   557    "insert into %_term (term, first, doclist) values (?, ?, ?)",
   558    /* TERM_UPDATE */ "update %_term set doclist = ? where rowid = ?",
   559    /* TERM_DELETE */ "delete from %_term where rowid = ?",
   560  };
   561  
   562  typedef struct fulltext_vtab {
   563    sqlite3_vtab base;
   564    sqlite3 *db;
   565    const char *zName;               /* virtual table name */
   566    sqlite3_tokenizer *pTokenizer;   /* tokenizer for inserts and queries */
   567  
   568    /* Precompiled statements which we keep as long as the table is
   569    ** open.
   570    */
   571    sqlite3_stmt *pFulltextStatements[MAX_STMT];
   572  } fulltext_vtab;
   573  
   574  typedef struct fulltext_cursor {
   575    sqlite3_vtab_cursor base;
   576    int iCursorType;  /* QUERY_GENERIC or QUERY_FULLTEXT */
   577  
   578    sqlite3_stmt *pStmt;
   579  
   580    int eof;
   581  
   582    /* The following is used only when iCursorType == QUERY_FULLTEXT. */
   583    DocListReader result;
   584  } fulltext_cursor;
   585  
   586  static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){
   587    return (fulltext_vtab *) c->base.pVtab;
   588  }
   589  
   590  static sqlite3_module fulltextModule;   /* forward declaration */
   591  
   592  /* Puts a freshly-prepared statement determined by iStmt in *ppStmt.
   593  ** If the indicated statement has never been prepared, it is prepared
   594  ** and cached, otherwise the cached version is reset.
   595  */
   596  static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt,
   597                               sqlite3_stmt **ppStmt){
   598    assert( iStmt<MAX_STMT );
   599    if( v->pFulltextStatements[iStmt]==NULL ){
   600      int rc = sql_prepare(v->db, v->zName, &v->pFulltextStatements[iStmt],
   601                           fulltext_zStatement[iStmt]);
   602      if( rc!=SQLITE_OK ) return rc;
   603    } else {
   604      int rc = sqlite3_reset(v->pFulltextStatements[iStmt]);
   605      if( rc!=SQLITE_OK ) return rc;
   606    }
   607  
   608    *ppStmt = v->pFulltextStatements[iStmt];
   609    return SQLITE_OK;
   610  }
   611  
   612  /* Step the indicated statement, handling errors SQLITE_BUSY (by
   613  ** retrying) and SQLITE_SCHEMA (by re-preparing and transferring
   614  ** bindings to the new statement).
   615  ** TODO(adam): We should extend this function so that it can work with
   616  ** statements declared locally, not only globally cached statements.
   617  */
   618  static int sql_step_statement(fulltext_vtab *v, fulltext_statement iStmt,
   619                                sqlite3_stmt **ppStmt){
   620    int rc;
   621    sqlite3_stmt *s = *ppStmt;
   622    assert( iStmt<MAX_STMT );
   623    assert( s==v->pFulltextStatements[iStmt] );
   624  
   625    while( (rc=sqlite3_step(s))!=SQLITE_DONE && rc!=SQLITE_ROW ){
   626      sqlite3_stmt *pNewStmt;
   627  
   628      if( rc==SQLITE_BUSY ) continue;
   629      if( rc!=SQLITE_ERROR ) return rc;
   630  
   631      rc = sqlite3_reset(s);
   632      if( rc!=SQLITE_SCHEMA ) return SQLITE_ERROR;
   633  
   634      v->pFulltextStatements[iStmt] = NULL;   /* Still in s */
   635      rc = sql_get_statement(v, iStmt, &pNewStmt);
   636      if( rc!=SQLITE_OK ) goto err;
   637      *ppStmt = pNewStmt;
   638  
   639      rc = sqlite3_transfer_bindings(s, pNewStmt);
   640      if( rc!=SQLITE_OK ) goto err;
   641  
   642      rc = sqlite3_finalize(s);
   643      if( rc!=SQLITE_OK ) return rc;
   644      s = pNewStmt;
   645    }
   646    return rc;
   647  
   648   err:
   649    sqlite3_finalize(s);
   650    return rc;
   651  }
   652  
   653  /* Like sql_step_statement(), but convert SQLITE_DONE to SQLITE_OK.
   654  ** Useful for statements like UPDATE, where we expect no results.
   655  */
   656  static int sql_single_step_statement(fulltext_vtab *v,
   657                                       fulltext_statement iStmt,
   658                                       sqlite3_stmt **ppStmt){
   659    int rc = sql_step_statement(v, iStmt, ppStmt);
   660    return (rc==SQLITE_DONE) ? SQLITE_OK : rc;
   661  }
   662  
   663  /* insert into %_content (rowid, content) values ([rowid], [zContent]) */
   664  static int content_insert(fulltext_vtab *v, sqlite3_value *rowid,
   665                            const char *zContent, int nContent){
   666    sqlite3_stmt *s;
   667    int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s);
   668    if( rc!=SQLITE_OK ) return rc;
   669  
   670    rc = sqlite3_bind_value(s, 1, rowid);
   671    if( rc!=SQLITE_OK ) return rc;
   672  
   673    rc = sqlite3_bind_text(s, 2, zContent, nContent, SQLITE_STATIC);
   674    if( rc!=SQLITE_OK ) return rc;
   675  
   676    return sql_single_step_statement(v, CONTENT_INSERT_STMT, &s);
   677  }
   678  
   679  /* select content from %_content where rowid = [iRow]
   680   * The caller must delete the returned string. */
   681  static int content_select(fulltext_vtab *v, sqlite_int64 iRow,
   682                            char **pzContent){
   683    sqlite3_stmt *s;
   684    int rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s);
   685    if( rc!=SQLITE_OK ) return rc;
   686  
   687    rc = sqlite3_bind_int64(s, 1, iRow);
   688    if( rc!=SQLITE_OK ) return rc;
   689  
   690    rc = sql_step_statement(v, CONTENT_SELECT_STMT, &s);
   691    if( rc!=SQLITE_ROW ) return rc;
   692  
   693    *pzContent = string_dup((const char *)sqlite3_column_text(s, 0));
   694  
   695    /* We expect only one row.  We must execute another sqlite3_step()
   696     * to complete the iteration; otherwise the table will remain locked. */
   697    rc = sqlite3_step(s);
   698    if( rc==SQLITE_DONE ) return SQLITE_OK;
   699  
   700    free(*pzContent);
   701    return rc;
   702  }
   703  
   704  /* delete from %_content where rowid = [iRow ] */
   705  static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){
   706    sqlite3_stmt *s;
   707    int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s);
   708    if( rc!=SQLITE_OK ) return rc;
   709  
   710    rc = sqlite3_bind_int64(s, 1, iRow);
   711    if( rc!=SQLITE_OK ) return rc;
   712  
   713    return sql_single_step_statement(v, CONTENT_DELETE_STMT, &s);
   714  }
   715  
   716  /* select rowid, doclist from %_term where term = [zTerm] and first = [iFirst]
   717   * If found, returns SQLITE_OK; the caller must free the returned doclist.
   718   * If no rows found, returns SQLITE_ERROR. */
   719  static int term_select(fulltext_vtab *v, const char *zTerm, int nTerm,
   720                         sqlite_int64 iFirst,
   721                         sqlite_int64 *rowid,
   722                         DocList *out){
   723    sqlite3_stmt *s;
   724    int rc = sql_get_statement(v, TERM_SELECT_STMT, &s);
   725    if( rc!=SQLITE_OK ) return rc;
   726  
   727    rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_TRANSIENT);
   728    if( rc!=SQLITE_OK ) return rc;
   729  
   730    rc = sqlite3_bind_int64(s, 2, iFirst);
   731    if( rc!=SQLITE_OK ) return rc;
   732  
   733    rc = sql_step_statement(v, TERM_SELECT_STMT, &s);
   734    if( rc!=SQLITE_ROW ) return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
   735  
   736    *rowid = sqlite3_column_int64(s, 0);
   737    docListInit(out, DL_POSITIONS_OFFSETS,
   738                sqlite3_column_blob(s, 1), sqlite3_column_bytes(s, 1));
   739  
   740    /* We expect only one row.  We must execute another sqlite3_step()
   741     * to complete the iteration; otherwise the table will remain locked. */
   742    rc = sqlite3_step(s);
   743    return rc==SQLITE_DONE ? SQLITE_OK : rc;
   744  }
   745  
   746  /* select max(first) from %_term where term = [zTerm] and first <= [iFirst]
   747   * If found, returns SQLITE_ROW and result in *piResult; if the query returns
   748   * NULL (meaning no row found) returns SQLITE_DONE.
   749   */
   750  static int term_chunk_select(fulltext_vtab *v, const char *zTerm, int nTerm,
   751                             sqlite_int64 iFirst, sqlite_int64 *piResult){
   752    sqlite3_stmt *s;
   753    int rc = sql_get_statement(v, TERM_CHUNK_SELECT_STMT, &s);
   754    if( rc!=SQLITE_OK ) return rc;
   755  
   756    rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_STATIC);
   757    if( rc!=SQLITE_OK ) return rc;
   758  
   759    rc = sqlite3_bind_int64(s, 2, iFirst);
   760    if( rc!=SQLITE_OK ) return rc;
   761  
   762    rc = sql_step_statement(v, TERM_CHUNK_SELECT_STMT, &s);
   763    if( rc!=SQLITE_ROW ) return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
   764  
   765    switch( sqlite3_column_type(s, 0) ){
   766      case SQLITE_NULL:
   767        rc = SQLITE_DONE;
   768        break;
   769      case SQLITE_INTEGER:
   770       *piResult = sqlite3_column_int64(s, 0);
   771       break;
   772      default:
   773        return SQLITE_ERROR;
   774    }
   775    /* We expect only one row.  We must execute another sqlite3_step()
   776     * to complete the iteration; otherwise the table will remain locked. */
   777    if( sqlite3_step(s) != SQLITE_DONE ) return SQLITE_ERROR;
   778    return rc;
   779  }
   780  
   781  /* insert into %_term (term, first, doclist)
   782                 values ([zTerm], [iFirst], [doclist]) */
   783  static int term_insert(fulltext_vtab *v, const char *zTerm, int nTerm,
   784                         sqlite_int64 iFirst, DocList *doclist){
   785    sqlite3_stmt *s;
   786    int rc = sql_get_statement(v, TERM_INSERT_STMT, &s);
   787    if( rc!=SQLITE_OK ) return rc;
   788  
   789    rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_STATIC);
   790    if( rc!=SQLITE_OK ) return rc;
   791  
   792    rc = sqlite3_bind_int64(s, 2, iFirst);
   793    if( rc!=SQLITE_OK ) return rc;
   794  
   795    rc = sqlite3_bind_blob(s, 3, doclist->pData, doclist->nData, SQLITE_STATIC);
   796    if( rc!=SQLITE_OK ) return rc;
   797  
   798    return sql_single_step_statement(v, TERM_INSERT_STMT, &s);
   799  }
   800  
   801  /* update %_term set doclist = [doclist] where rowid = [rowid] */
   802  static int term_update(fulltext_vtab *v, sqlite_int64 rowid,
   803                         DocList *doclist){
   804    sqlite3_stmt *s;
   805    int rc = sql_get_statement(v, TERM_UPDATE_STMT, &s);
   806    if( rc!=SQLITE_OK ) return rc;
   807  
   808    rc = sqlite3_bind_blob(s, 1, doclist->pData, doclist->nData,
   809                           SQLITE_STATIC);
   810    if( rc!=SQLITE_OK ) return rc;
   811  
   812    rc = sqlite3_bind_int64(s, 2, rowid);
   813    if( rc!=SQLITE_OK ) return rc;
   814  
   815    return sql_single_step_statement(v, TERM_UPDATE_STMT, &s);
   816  }
   817  
   818  static int term_delete(fulltext_vtab *v, sqlite_int64 rowid){
   819    sqlite3_stmt *s;
   820    int rc = sql_get_statement(v, TERM_DELETE_STMT, &s);
   821    if( rc!=SQLITE_OK ) return rc;
   822  
   823    rc = sqlite3_bind_int64(s, 1, rowid);
   824    if( rc!=SQLITE_OK ) return rc;
   825  
   826    return sql_single_step_statement(v, TERM_DELETE_STMT, &s);
   827  }
   828  
   829  static void fulltext_vtab_destroy(fulltext_vtab *v){
   830    int iStmt;
   831  
   832    for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){
   833      if( v->pFulltextStatements[iStmt]!=NULL ){
   834        sqlite3_finalize(v->pFulltextStatements[iStmt]);
   835        v->pFulltextStatements[iStmt] = NULL;
   836      }
   837    }
   838  
   839    if( v->pTokenizer!=NULL ){
   840      v->pTokenizer->pModule->xDestroy(v->pTokenizer);
   841      v->pTokenizer = NULL;
   842    }
   843  
   844    free((void *) v->zName);
   845    free(v);
   846  }
   847  
   848  /* Current interface:
   849  ** argv[0] - module name
   850  ** argv[1] - database name
   851  ** argv[2] - table name
   852  ** argv[3] - tokenizer name (optional, a sensible default is provided)
   853  ** argv[4..] - passed to tokenizer (optional based on tokenizer)
   854  **/
   855  static int fulltextConnect(
   856    sqlite3 *db,
   857    void *pAux,
   858    int argc,
   859    const char * const *argv,
   860    sqlite3_vtab **ppVTab,
   861    char **pzErr
   862  ){
   863    int rc;
   864    fulltext_vtab *v;
   865    sqlite3_tokenizer_module *m = NULL;
   866  
   867    assert( argc>=3 );
   868    v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab));
   869    /* sqlite will initialize v->base */
   870    v->db = db;
   871    v->zName = string_dup(argv[2]);
   872    v->pTokenizer = NULL;
   873  
   874    if( argc==3 ){
   875      get_simple_tokenizer_module(&m);
   876    } else {
   877      /* TODO(shess) For now, add new tokenizers as else if clauses. */
   878      if( !strcmp(argv[3], "simple") ){
   879        get_simple_tokenizer_module(&m);
   880      } else {
   881        assert( "unrecognized tokenizer"==NULL );
   882      }
   883    }
   884  
   885    /* TODO(shess) Since tokenization impacts the index, the parameters
   886    ** to the tokenizer need to be identical when a persistent virtual
   887    ** table is re-created.  One solution would be a meta-table to track
   888    ** such information in the database.  Then we could verify that the
   889    ** information is identical on subsequent creates.
   890    */
   891    /* TODO(shess) Why isn't argv already (const char **)? */
   892    rc = m->xCreate(argc-3, (const char **) (argv+3), &v->pTokenizer);
   893    if( rc!=SQLITE_OK ) return rc;
   894    v->pTokenizer->pModule = m;
   895  
   896    /* TODO: verify the existence of backing tables foo_content, foo_term */
   897  
   898    rc = sqlite3_declare_vtab(db, "create table x(content text)");
   899    if( rc!=SQLITE_OK ) return rc;
   900  
   901    memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements));
   902  
   903    *ppVTab = &v->base;
   904    return SQLITE_OK;
   905  }
   906  
   907  static int fulltextCreate(
   908    sqlite3 *db,
   909    void *pAux,
   910    int argc,
   911    const char * const *argv,
   912    sqlite3_vtab **ppVTab,
   913    char **pzErr
   914  ){
   915    int rc;
   916    assert( argc>=3 );
   917  
   918    /* The %_content table holds the text of each full-text item, with
   919    ** the rowid used as the docid.
   920    **
   921    ** The %_term table maps each term to a document list blob
   922    ** containing elements sorted by ascending docid, each element
   923    ** encoded as:
   924    **
   925    **   docid varint-encoded
   926    **   token count varint-encoded
   927    **   "count" token elements (poslist):
   928    **     position varint-encoded as delta from previous position
   929    **     start offset varint-encoded as delta from previous start offset
   930    **     end offset varint-encoded as delta from start offset
   931    **
   932    ** Additionally, doclist blobs can be chunked into multiple rows,
   933    ** using "first" to order the blobs.  "first" is simply the first
   934    ** docid in the blob.
   935    */
   936    /*
   937    ** NOTE(shess) That last sentence is incorrect in the face of
   938    ** deletion, which can leave a doclist that doesn't contain the
   939    ** first from that row.  I _believe_ this does not matter to the
   940    ** operation of the system, but it might be reasonable to update
   941    ** appropriately in case this assumption becomes more important.
   942    */
   943    rc = sql_exec(db, argv[2],
   944      "create table %_content(content text);"
   945      "create table %_term(term text, first integer, doclist blob);"
   946      "create index %_index on %_term(term, first)");
   947    if( rc!=SQLITE_OK ) return rc;
   948  
   949    return fulltextConnect(db, pAux, argc, argv, ppVTab, pzErr);
   950  }
   951  
   952  /* Decide how to handle an SQL query.
   953   * At the moment, MATCH queries can include implicit boolean ANDs; we
   954   * haven't implemented phrase searches or OR yet. */
   955  static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){
   956    int i;
   957  
   958    for(i=0; i<pInfo->nConstraint; ++i){
   959      const struct sqlite3_index_constraint *pConstraint;
   960      pConstraint = &pInfo->aConstraint[i];
   961      if( pConstraint->iColumn==0 &&
   962          pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH &&
   963          pConstraint->usable ){   /* a full-text search */
   964        pInfo->aConstraintUsage[i].argvIndex = 1;
   965        pInfo->aConstraintUsage[i].omit = 1;
   966        pInfo->idxNum = QUERY_FULLTEXT;
   967        pInfo->estimatedCost = 1.0;   /* an arbitrary value for now */
   968        return SQLITE_OK;
   969      }
   970    }
   971    pInfo->idxNum = QUERY_GENERIC;
   972    return SQLITE_OK;
   973  }
   974  
   975  static int fulltextDisconnect(sqlite3_vtab *pVTab){
   976    fulltext_vtab_destroy((fulltext_vtab *)pVTab);
   977    return SQLITE_OK;
   978  }
   979  
   980  static int fulltextDestroy(sqlite3_vtab *pVTab){
   981    fulltext_vtab *v = (fulltext_vtab *)pVTab;
   982  
   983    int rc = sql_exec(v->db, v->zName,
   984                      "drop table %_content; drop table %_term");
   985    if( rc!=SQLITE_OK ) return rc;
   986  
   987    fulltext_vtab_destroy((fulltext_vtab *)pVTab);
   988    return SQLITE_OK;
   989  }
   990  
   991  static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
   992    fulltext_cursor *c;
   993  
   994    c = (fulltext_cursor *) calloc(sizeof(fulltext_cursor), 1);
   995    /* sqlite will initialize c->base */
   996    *ppCursor = &c->base;
   997  
   998    return SQLITE_OK;
   999  }
  1000  
  1001  static int fulltextClose(sqlite3_vtab_cursor *pCursor){
  1002    fulltext_cursor *c = (fulltext_cursor *) pCursor;
  1003    sqlite3_finalize(c->pStmt);
  1004    if( c->result.pDoclist!=NULL ){
  1005      docListDelete(c->result.pDoclist);
  1006    }
  1007    free(c);
  1008    return SQLITE_OK;
  1009  }
  1010  
  1011  static int fulltextNext(sqlite3_vtab_cursor *pCursor){
  1012    fulltext_cursor *c = (fulltext_cursor *) pCursor;
  1013    sqlite_int64 iDocid;
  1014    int rc;
  1015  
  1016    switch( c->iCursorType ){
  1017      case QUERY_GENERIC:
  1018        /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
  1019        rc = sqlite3_step(c->pStmt);
  1020        switch( rc ){
  1021          case SQLITE_ROW:
  1022            c->eof = 0;
  1023            return SQLITE_OK;
  1024          case SQLITE_DONE:
  1025            c->eof = 1;
  1026            return SQLITE_OK;
  1027          default:
  1028            c->eof = 1;
  1029            return rc;
  1030        }
  1031      case QUERY_FULLTEXT:
  1032        rc = sqlite3_reset(c->pStmt);
  1033        if( rc!=SQLITE_OK ) return rc;
  1034  
  1035        if( readerAtEnd(&c->result)){
  1036          c->eof = 1;
  1037          return SQLITE_OK;
  1038        }
  1039        iDocid = readDocid(&c->result);
  1040        rc = sqlite3_bind_int64(c->pStmt, 1, iDocid);
  1041        if( rc!=SQLITE_OK ) return rc;
  1042        /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
  1043        rc = sqlite3_step(c->pStmt);
  1044        if( rc==SQLITE_ROW ){   /* the case we expect */
  1045          c->eof = 0;
  1046          return SQLITE_OK;
  1047        }
  1048        /* an error occurred; abort */
  1049        return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
  1050      default:
  1051        assert( 0 );
  1052        return SQLITE_ERROR;  /* not reached */
  1053    }
  1054  }
  1055  
  1056  static int term_select_doclist(fulltext_vtab *v, const char *pTerm, int nTerm,
  1057                                 sqlite3_stmt **ppStmt){
  1058    int rc;
  1059    if( *ppStmt ){
  1060      rc = sqlite3_reset(*ppStmt);
  1061    } else {
  1062      rc = sql_prepare(v->db, v->zName, ppStmt,
  1063        "select doclist from %_term where term = ? order by first");
  1064    }
  1065    if( rc!=SQLITE_OK ) return rc;
  1066  
  1067    rc = sqlite3_bind_text(*ppStmt, 1, pTerm, nTerm, SQLITE_TRANSIENT);
  1068    if( rc!=SQLITE_OK ) return rc;
  1069  
  1070    return sqlite3_step(*ppStmt);   /* TODO(adamd): handle schema error */
  1071  }
  1072  
  1073  /* Read the posting list for [zTerm]; AND it with the doclist [in] to
  1074   * produce the doclist [out], using the given offset [iOffset] for phrase
  1075   * matching.
  1076   * (*pSelect) is used to hold an SQLite statement used inside this function;
  1077   * the caller should initialize *pSelect to NULL before the first call.
  1078   */
  1079  static int query_merge(fulltext_vtab *v, sqlite3_stmt **pSelect,
  1080                         const char *zTerm,
  1081                         DocList *pIn, int iOffset, DocList *out){
  1082    int rc;
  1083    DocListMerge merge;
  1084  
  1085    if( pIn!=NULL && !pIn->nData ){
  1086      /* If [pIn] is already empty, there's no point in reading the
  1087       * posting list to AND it in; return immediately. */
  1088        return SQLITE_OK;
  1089    }
  1090  
  1091    rc = term_select_doclist(v, zTerm, -1, pSelect);
  1092    if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ) return rc;
  1093  
  1094    mergeInit(&merge, pIn, iOffset, out);
  1095    while( rc==SQLITE_ROW ){
  1096      DocList block;
  1097      docListInit(&block, DL_POSITIONS_OFFSETS,
  1098                  sqlite3_column_blob(*pSelect, 0),
  1099                  sqlite3_column_bytes(*pSelect, 0));
  1100      mergeBlock(&merge, &block);
  1101      docListDestroy(&block);
  1102  
  1103      rc = sqlite3_step(*pSelect);
  1104      if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ){
  1105        return rc;
  1106      }
  1107    }
  1108    
  1109    return SQLITE_OK;
  1110  }
  1111  
  1112  typedef struct QueryTerm {
  1113    int is_phrase;    /* true if this term begins a new phrase */
  1114    const char *zTerm;
  1115  } QueryTerm;
  1116  
  1117  /* A parsed query.
  1118   *
  1119   * As an example, parsing the query ["four score" years "new nation"] will
  1120   * yield a Query with 5 terms:
  1121   *   "four",   is_phrase = 1
  1122   *   "score",  is_phrase = 0
  1123   *   "years",  is_phrase = 1
  1124   *   "new",    is_phrase = 1
  1125   *   "nation", is_phrase = 0
  1126   */
  1127  typedef struct Query {
  1128    int nTerms;
  1129    QueryTerm *pTerm;
  1130  } Query;
  1131  
  1132  static void query_add(Query *q, int is_phrase, const char *zTerm){
  1133    QueryTerm *t;
  1134    ++q->nTerms;
  1135    q->pTerm = realloc(q->pTerm, q->nTerms * sizeof(q->pTerm[0]));
  1136    t = &q->pTerm[q->nTerms - 1];
  1137    t->is_phrase = is_phrase;
  1138    t->zTerm = zTerm;
  1139  }
  1140      
  1141  static void query_free(Query *q){
  1142    int i;
  1143    for(i = 0; i < q->nTerms; ++i){
  1144      free((void *) q->pTerm[i].zTerm);
  1145    }
  1146    free(q->pTerm);
  1147  }
  1148  
  1149  static int tokenize_segment(sqlite3_tokenizer *pTokenizer,
  1150                              const char *zQuery, int in_phrase,
  1151                              Query *pQuery){
  1152    sqlite3_tokenizer_module *pModule = pTokenizer->pModule;
  1153    sqlite3_tokenizer_cursor *pCursor;
  1154    int is_first = 1;
  1155    
  1156    int rc = pModule->xOpen(pTokenizer, zQuery, -1, &pCursor);
  1157    if( rc!=SQLITE_OK ) return rc;
  1158    pCursor->pTokenizer = pTokenizer;
  1159  
  1160    while( 1 ){
  1161      const char *zToken;
  1162      int nToken, iStartOffset, iEndOffset, dummy_pos;
  1163  
  1164      rc = pModule->xNext(pCursor,
  1165                          &zToken, &nToken,
  1166                          &iStartOffset, &iEndOffset,
  1167                          &dummy_pos);
  1168      if( rc!=SQLITE_OK ) break;
  1169      query_add(pQuery, !in_phrase || is_first, string_dup_n(zToken, nToken));
  1170      is_first = 0;
  1171    }
  1172  
  1173    return pModule->xClose(pCursor);
  1174  }
  1175  
  1176  /* Parse a query string, yielding a Query object. */
  1177  static int parse_query(fulltext_vtab *v, const char *zQuery, Query *pQuery){
  1178    char *zQuery1 = string_dup(zQuery);
  1179    int in_phrase = 0;
  1180    char *s = zQuery1;
  1181    pQuery->nTerms = 0;
  1182    pQuery->pTerm = NULL;
  1183  
  1184    while( *s ){
  1185      char *t = s;
  1186      while( *t ){
  1187        if( *t=='"' ){
  1188          *t++ = '\0';
  1189          break;
  1190        }
  1191        ++t;
  1192      }
  1193      if( *s ){
  1194        tokenize_segment(v->pTokenizer, s, in_phrase, pQuery);
  1195      }
  1196      s = t;
  1197      in_phrase = !in_phrase;
  1198    }
  1199    
  1200    free(zQuery1);
  1201    return SQLITE_OK;
  1202  }
  1203  
  1204  /* Perform a full-text query; return a list of documents in [pResult]. */
  1205  static int fulltext_query(fulltext_vtab *v, const char *zQuery,
  1206                            DocList **pResult){
  1207    Query q;
  1208    int phrase_start = -1;
  1209    int i;
  1210    sqlite3_stmt *pSelect = NULL;
  1211    DocList *d = NULL;
  1212  
  1213    int rc = parse_query(v, zQuery, &q);
  1214    if( rc!=SQLITE_OK ) return rc;
  1215  
  1216    /* Merge terms. */
  1217    for(i = 0 ; i < q.nTerms ; ++i){
  1218      /* In each merge step, we need to generate positions whenever we're
  1219       * processing a phrase which hasn't ended yet. */
  1220      int need_positions = i<q.nTerms-1 && !q.pTerm[i+1].is_phrase;
  1221      DocList *next = docListNew(need_positions ? DL_POSITIONS : DL_DOCIDS);
  1222      if( q.pTerm[i].is_phrase ){
  1223        phrase_start = i;
  1224      }
  1225      rc = query_merge(v, &pSelect, q.pTerm[i].zTerm, d, i - phrase_start, next);
  1226      if( rc!=SQLITE_OK ) break;
  1227      if( d!=NULL ){
  1228        docListDelete(d);
  1229      }
  1230      d = next;
  1231    }
  1232  
  1233    sqlite3_finalize(pSelect);
  1234    query_free(&q);
  1235    *pResult = d;
  1236    return rc;
  1237  }
  1238  
  1239  static int fulltextFilter(sqlite3_vtab_cursor *pCursor,
  1240                            int idxNum, const char *idxStr,
  1241                            int argc, sqlite3_value **argv){
  1242    fulltext_cursor *c = (fulltext_cursor *) pCursor;
  1243    fulltext_vtab *v = cursor_vtab(c);
  1244    int rc;
  1245    const char *zStatement;
  1246  
  1247    c->iCursorType = idxNum;
  1248    switch( idxNum ){
  1249      case QUERY_GENERIC:
  1250        zStatement = "select rowid, content from %_content";
  1251        break;
  1252  
  1253      case QUERY_FULLTEXT:   /* full-text search */
  1254      {
  1255        const char *zQuery = (const char *)sqlite3_value_text(argv[0]);
  1256        DocList *pResult;
  1257        assert( argc==1 );
  1258        rc = fulltext_query(v, zQuery, &pResult);
  1259        if( rc!=SQLITE_OK ) return rc;
  1260        readerInit(&c->result, pResult);
  1261        zStatement = "select rowid, content from %_content where rowid = ?";
  1262        break;
  1263      }
  1264  
  1265      default:
  1266        assert( 0 );
  1267    }
  1268  
  1269    rc = sql_prepare(v->db, v->zName, &c->pStmt, zStatement);
  1270    if( rc!=SQLITE_OK ) return rc;
  1271  
  1272    return fulltextNext(pCursor);
  1273  }
  1274  
  1275  static int fulltextEof(sqlite3_vtab_cursor *pCursor){
  1276    fulltext_cursor *c = (fulltext_cursor *) pCursor;
  1277    return c->eof;
  1278  }
  1279  
  1280  static int fulltextColumn(sqlite3_vtab_cursor *pCursor,
  1281                            sqlite3_context *pContext, int idxCol){
  1282    fulltext_cursor *c = (fulltext_cursor *) pCursor;
  1283    const char *s;
  1284  
  1285    assert( idxCol==0 );
  1286    s = (const char *) sqlite3_column_text(c->pStmt, 1);
  1287    sqlite3_result_text(pContext, s, -1, SQLITE_TRANSIENT);
  1288  
  1289    return SQLITE_OK;
  1290  }
  1291  
  1292  static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){
  1293    fulltext_cursor *c = (fulltext_cursor *) pCursor;
  1294  
  1295    *pRowid = sqlite3_column_int64(c->pStmt, 0);
  1296    return SQLITE_OK;
  1297  }
  1298  
  1299  /* Build a hash table containing all terms in zText. */
  1300  static int build_terms(Hash *terms, sqlite3_tokenizer *pTokenizer,
  1301                         const char *zText, sqlite_int64 iDocid){
  1302    sqlite3_tokenizer_cursor *pCursor;
  1303    const char *pToken;
  1304    int nTokenBytes;
  1305    int iStartOffset, iEndOffset, iPosition;
  1306  
  1307    int rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor);
  1308    if( rc!=SQLITE_OK ) return rc;
  1309  
  1310    pCursor->pTokenizer = pTokenizer;
  1311    HashInit(terms, HASH_STRING, 1);
  1312    while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor,
  1313                                                 &pToken, &nTokenBytes,
  1314                                                 &iStartOffset, &iEndOffset,
  1315                                                 &iPosition) ){
  1316      DocList *p;
  1317  
  1318      /* Positions can't be negative; we use -1 as a terminator internally. */
  1319      if( iPosition<0 ) {
  1320        rc = SQLITE_ERROR;  
  1321        goto err;
  1322      }
  1323  
  1324      p = HashFind(terms, pToken, nTokenBytes);
  1325      if( p==NULL ){
  1326        p = docListNew(DL_POSITIONS_OFFSETS);
  1327        docListAddDocid(p, iDocid);
  1328        HashInsert(terms, pToken, nTokenBytes, p);
  1329      }
  1330      docListAddPosOffset(p, iPosition, iStartOffset, iEndOffset);
  1331    }
  1332  
  1333  err:
  1334    /* TODO(shess) Check return?  Should this be able to cause errors at
  1335    ** this point?  Actually, same question about sqlite3_finalize(),
  1336    ** though one could argue that failure there means that the data is
  1337    ** not durable.  *ponder*
  1338    */
  1339    pTokenizer->pModule->xClose(pCursor);
  1340    return rc;
  1341  }
  1342  /* Update the %_terms table to map the term [zTerm] to the given rowid. */
  1343  static int index_insert_term(fulltext_vtab *v, const char *zTerm, int nTerm,
  1344                               sqlite_int64 iDocid, DocList *p){
  1345    sqlite_int64 iFirst;
  1346    sqlite_int64 iIndexRow;
  1347    DocList doclist;
  1348  
  1349    int rc = term_chunk_select(v, zTerm, nTerm, iDocid, &iFirst);
  1350    if( rc==SQLITE_DONE ){
  1351      docListInit(&doclist, DL_POSITIONS_OFFSETS, 0, 0);
  1352      if( docListUpdate(&doclist, iDocid, p) ){
  1353        rc = term_insert(v, zTerm, nTerm, iDocid, &doclist);
  1354        docListDestroy(&doclist);
  1355        return rc;
  1356      }
  1357      return SQLITE_OK;
  1358    }
  1359    if( rc!=SQLITE_ROW ) return SQLITE_ERROR;
  1360  
  1361    /* This word is in the index; add this document ID to its blob. */
  1362  
  1363    rc = term_select(v, zTerm, nTerm, iFirst, &iIndexRow, &doclist);
  1364    if( rc!=SQLITE_OK ) return rc;
  1365  
  1366    if( docListUpdate(&doclist, iDocid, p) ){
  1367      /* If the blob is too big, split it in half. */
  1368      if( doclist.nData>CHUNK_MAX ){
  1369        DocList half;
  1370        if( docListSplit(&doclist, &half) ){
  1371          rc = term_insert(v, zTerm, nTerm, firstDocid(&half), &half);
  1372          docListDestroy(&half);
  1373          if( rc!=SQLITE_OK ) goto err;
  1374        }
  1375      }
  1376      rc = term_update(v, iIndexRow, &doclist);
  1377    }
  1378  
  1379  err:
  1380    docListDestroy(&doclist);
  1381    return rc;
  1382  }
  1383  
  1384  /* Insert a row into the full-text index; set *piRowid to be the ID of the
  1385   * new row. */
  1386  static int index_insert(fulltext_vtab *v,
  1387                          sqlite3_value *pRequestRowid, const char *zText,
  1388                          sqlite_int64 *piRowid){
  1389    Hash terms;  /* maps term string -> PosList */
  1390    HashElem *e;
  1391  
  1392    int rc = content_insert(v, pRequestRowid, zText, -1);
  1393    if( rc!=SQLITE_OK ) return rc;
  1394    *piRowid = sqlite3_last_insert_rowid(v->db);
  1395  
  1396    if( !zText ) return SQLITE_OK;   /* nothing to index */
  1397  
  1398    rc = build_terms(&terms, v->pTokenizer, zText, *piRowid);
  1399    if( rc!=SQLITE_OK ) return rc;
  1400  
  1401    for(e=HashFirst(&terms); e; e=HashNext(e)){
  1402      DocList *p = HashData(e);
  1403      rc = index_insert_term(v, HashKey(e), HashKeysize(e), *piRowid, p);
  1404      if( rc!=SQLITE_OK ) break;
  1405    }
  1406  
  1407    for(e=HashFirst(&terms); e; e=HashNext(e)){
  1408      DocList *p = HashData(e);
  1409      docListDelete(p);
  1410    }
  1411    HashClear(&terms);
  1412    return rc;
  1413  }
  1414  
  1415  static int index_delete_term(fulltext_vtab *v, const char *zTerm, int nTerm,
  1416                               sqlite_int64 iDocid){
  1417    sqlite_int64 iFirst;
  1418    sqlite_int64 iIndexRow;
  1419    DocList doclist;
  1420  
  1421    int rc = term_chunk_select(v, zTerm, nTerm, iDocid, &iFirst);
  1422    if( rc!=SQLITE_ROW ) return SQLITE_ERROR;
  1423  
  1424    rc = term_select(v, zTerm, nTerm, iFirst, &iIndexRow, &doclist);
  1425    if( rc!=SQLITE_OK ) return rc;
  1426  
  1427    if( docListUpdate(&doclist, iDocid, NULL) ){
  1428      if( doclist.nData>0 ){
  1429        rc = term_update(v, iIndexRow, &doclist);
  1430      } else {  /* empty posting list */
  1431        rc = term_delete(v, iIndexRow);
  1432      }
  1433    }
  1434    docListDestroy(&doclist);
  1435    return rc;
  1436  }
  1437  
  1438  /* Delete a row from the full-text index. */
  1439  static int index_delete(fulltext_vtab *v, sqlite_int64 iRow){
  1440    char *zText;
  1441    Hash terms;
  1442    HashElem *e;
  1443  
  1444    int rc = content_select(v, iRow, &zText);
  1445    if( rc!=SQLITE_OK ) return rc;
  1446  
  1447    rc = build_terms(&terms, v->pTokenizer, zText, iRow);
  1448    free(zText);
  1449    if( rc!=SQLITE_OK ) return rc;
  1450  
  1451    for(e=HashFirst(&terms); e; e=HashNext(e)){
  1452      rc = index_delete_term(v, HashKey(e), HashKeysize(e), iRow);
  1453      if( rc!=SQLITE_OK ) break;
  1454    }
  1455    for(e=HashFirst(&terms); e; e=HashNext(e)){
  1456      DocList *p = HashData(e);
  1457      docListDelete(p);
  1458    }
  1459    HashClear(&terms);
  1460  
  1461    return content_delete(v, iRow);
  1462  }
  1463  
  1464  static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg,
  1465                     sqlite_int64 *pRowid){
  1466    fulltext_vtab *v = (fulltext_vtab *) pVtab;
  1467  
  1468    if( nArg<2 ){
  1469      return index_delete(v, sqlite3_value_int64(ppArg[0]));
  1470    }
  1471  
  1472    if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){
  1473      return SQLITE_ERROR;   /* an update; not yet supported */
  1474    }
  1475  
  1476    assert( nArg==3 );    /* ppArg[1] = rowid, ppArg[2] = content */
  1477    return index_insert(v, ppArg[1],
  1478                        (const char *)sqlite3_value_text(ppArg[2]), pRowid);
  1479  }
  1480  
  1481  static sqlite3_module fulltextModule = {
  1482    0,
  1483    fulltextCreate,
  1484    fulltextConnect,
  1485    fulltextBestIndex,
  1486    fulltextDisconnect,
  1487    fulltextDestroy,
  1488    fulltextOpen,
  1489    fulltextClose,
  1490    fulltextFilter,
  1491    fulltextNext,
  1492    fulltextEof,
  1493    fulltextColumn,
  1494    fulltextRowid,
  1495    fulltextUpdate
  1496  };
  1497  
  1498  int fulltext_init(sqlite3 *db){
  1499   return sqlite3_create_module(db, "fulltext", &fulltextModule, 0);
  1500  }
  1501  
  1502  #if !SQLITE_CORE
  1503  #ifdef _WIN32
  1504  __declspec(dllexport)
  1505  #endif
  1506  int sqlite3_fulltext_init(sqlite3 *db, char **pzErrMsg,
  1507                            const sqlite3_api_routines *pApi){
  1508   SQLITE_EXTENSION_INIT2(pApi)
  1509   return fulltext_init(db);
  1510  }
  1511  #endif