modernc.org/cc@v1.0.1/v2/testdata/_sqlite/ext/fts5/fts5_hash.c (about)

     1  /*
     2  ** 2014 August 11
     3  **
     4  ** The author disclaims copyright to this source code.  In place of
     5  ** a legal notice, here is a blessing:
     6  **
     7  **    May you do good and not evil.
     8  **    May you find forgiveness for yourself and forgive others.
     9  **    May you share freely, never taking more than you give.
    10  **
    11  ******************************************************************************
    12  **
    13  */
    14  
    15  
    16  
    17  #include "fts5Int.h"
    18  
    19  typedef struct Fts5HashEntry Fts5HashEntry;
    20  
    21  /*
    22  ** This file contains the implementation of an in-memory hash table used
    23  ** to accumulate "term -> doclist" content before it is flushed to a level-0
    24  ** segment.
    25  */
    26  
    27  
    28  struct Fts5Hash {
    29    int eDetail;                    /* Copy of Fts5Config.eDetail, taken by sqlite3Fts5HashNew() */
    30    int *pnByte;                    /* Caller-supplied counter; sqlite3Fts5HashWrite() adds its byte delta to it */
    31    int nEntry;                     /* Entry count; bumped per new key, zeroed by HashClear() and by fts5HashEntrySort() */
    32    int nSlot;                      /* Size of aSlot[] array (starts at 1024, doubled by fts5HashResize()) */
    33    Fts5HashEntry *pScan;           /* Current item of the sorted scan list built by sqlite3Fts5HashScanInit() */
    34    Fts5HashEntry **aSlot;          /* Array of hash slots; each slot heads a chain linked via pHashNext */
    35  };
    36  
    37  /*
    38  ** Each entry in the hash table is represented by an object of the 
    39  ** following type. Each object, its key (a nul-terminated string) and 
    40  ** its current data are stored in a single memory allocation. The 
    41  ** key immediately follows the object in memory. The position list
    42  ** data immediately follows the key data in memory.
    43  **
    44  ** The data that follows the key is in a similar, but not identical format
    45  ** to the doclist data stored in the database. It is:
    46  **
    47  **   * Rowid, as a varint
    48  **   * Position list, without 0x00 terminator.
    49  **   * Size of previous position list and rowid, as a 4 byte
    50  **     big-endian integer.
    51  **
    52  ** iRowidOff:
    53  **   Offset of last rowid written to data area. Relative to first byte of
    54  **   structure.
    55  **
    56  ** nData:
    57  **   Bytes of data written since iRowidOff.
    58  */
    59  struct Fts5HashEntry {
    60    Fts5HashEntry *pHashNext;       /* Next hash entry in the same aSlot[] chain */
    61    Fts5HashEntry *pScanNext;       /* Next entry in sorted order (set by fts5HashEntryMerge()) */
    62    
    63    int nAlloc;                     /* Total size of allocation (header + key + data area) */
    64    int iSzPoslist;                 /* Offset from start of struct of the pending poslist-size field; 0 once finalised */
    65    int nData;                      /* Bytes in use counted from start of struct; also the next write offset */
    66    int nKey;                       /* Token length in bytes (excludes the leading prefix byte and the nul) */
    67    u8 bDel;                        /* Delete flag, flushed by fts5HashAddPoslistSize() */
    68    u8 bContent;                    /* Set content-flag (detail=none mode) */
    69    i16 iCol;                       /* Column of last value written */
    70    int iPos;                       /* Position of last value written */
    71    i64 iRowid;                     /* Rowid of last value written */
    72  };
    73  
    /*
    ** Return a pointer to the key of hash entry p. The key is stored in the
    ** same allocation as the entry, starting at the first byte after the
    ** Fts5HashEntry header. Equivalent to:
    **
    **   char *fts5EntryKey(Fts5HashEntry *pEntry){ return (char*)(pEntry+1); }
    */
    #define fts5EntryKey(p) ( (char *)((p) + 1) )
    80  
    81  
    82  /*
    83  ** Allocate a new hash table.
    84  */
    85  int sqlite3Fts5HashNew(Fts5Config *pConfig, Fts5Hash **ppNew, int *pnByte){
    86    int rc = SQLITE_OK;
    87    Fts5Hash *pNew;
    88  
    89    *ppNew = pNew = (Fts5Hash*)sqlite3_malloc(sizeof(Fts5Hash));
    90    if( pNew==0 ){
    91      rc = SQLITE_NOMEM;
    92    }else{
    93      int nByte;
    94      memset(pNew, 0, sizeof(Fts5Hash));
    95      pNew->pnByte = pnByte;
    96      pNew->eDetail = pConfig->eDetail;
    97  
    98      pNew->nSlot = 1024;
    99      nByte = sizeof(Fts5HashEntry*) * pNew->nSlot;
   100      pNew->aSlot = (Fts5HashEntry**)sqlite3_malloc(nByte);
   101      if( pNew->aSlot==0 ){
   102        sqlite3_free(pNew);
   103        *ppNew = 0;
   104        rc = SQLITE_NOMEM;
   105      }else{
   106        memset(pNew->aSlot, 0, nByte);
   107      }
   108    }
   109    return rc;
   110  }
   111  
   112  /*
   113  ** Free a hash table object.
   114  */
   115  void sqlite3Fts5HashFree(Fts5Hash *pHash){
   116    if( pHash ){
   117      sqlite3Fts5HashClear(pHash);
   118      sqlite3_free(pHash->aSlot);
   119      sqlite3_free(pHash);
   120    }
   121  }
   122  
   123  /*
   124  ** Empty (but do not delete) a hash table.
   125  */
   126  void sqlite3Fts5HashClear(Fts5Hash *pHash){
   127    int i;
   128    for(i=0; i<pHash->nSlot; i++){
   129      Fts5HashEntry *pNext;
   130      Fts5HashEntry *pSlot;
   131      for(pSlot=pHash->aSlot[i]; pSlot; pSlot=pNext){
   132        pNext = pSlot->pHashNext;
   133        sqlite3_free(pSlot);
   134      }
   135    }
   136    memset(pHash->aSlot, 0, pHash->nSlot * sizeof(Fts5HashEntry*));
   137    pHash->nEntry = 0;
   138  }
   139  
   /*
   ** Return the slot index, in the range 0..nSlot-1, for the n byte key
   ** at p. Bytes are folded into the hash starting with the last byte of
   ** the key and finishing with the first.
   */
   static unsigned int fts5HashKey(int nSlot, const u8 *p, int n){
     unsigned int h = 13;
     int iByte = n;
     while( iByte>0 ){
       iByte--;
       h = (h << 3) ^ h ^ p[iByte];
     }
     return (h % nSlot);
   }
   148  
   /*
   ** Return the slot index, in the range 0..nSlot-1, for the key formed by
   ** byte b followed by the n bytes at p. The result is the same as calling
   ** fts5HashKey() on that concatenation, without having to build it.
   */
   static unsigned int fts5HashKey2(int nSlot, u8 b, const u8 *p, int n){
     unsigned int h = 13;
     int iByte = n;
     while( iByte>0 ){
       iByte--;
       h = (h << 3) ^ h ^ p[iByte];
     }
     /* The prefix byte is the first byte of the key, so it is mixed in last. */
     h = (h << 3) ^ h ^ b;
     return (h % nSlot);
   }
   158  
   159  /*
   160  ** Resize the hash table by doubling the number of slots.
   161  */
   162  static int fts5HashResize(Fts5Hash *pHash){
   163    int nNew = pHash->nSlot*2;
   164    int i;
   165    Fts5HashEntry **apNew;
   166    Fts5HashEntry **apOld = pHash->aSlot;
   167  
   168    apNew = (Fts5HashEntry**)sqlite3_malloc(nNew*sizeof(Fts5HashEntry*));
   169    if( !apNew ) return SQLITE_NOMEM;
   170    memset(apNew, 0, nNew*sizeof(Fts5HashEntry*));
   171  
   172    for(i=0; i<pHash->nSlot; i++){
   173      while( apOld[i] ){
   174        unsigned int iHash;
   175        Fts5HashEntry *p = apOld[i];
   176        apOld[i] = p->pHashNext;
   177        iHash = fts5HashKey(nNew, (u8*)fts5EntryKey(p),
   178                            (int)strlen(fts5EntryKey(p)));
   179        p->pHashNext = apNew[iHash];
   180        apNew[iHash] = p;
   181      }
   182    }
   183  
   184    sqlite3_free(apOld);
   185    pHash->nSlot = nNew;
   186    pHash->aSlot = apNew;
   187    return SQLITE_OK;
   188  }
   189  
   190  static void fts5HashAddPoslistSize(Fts5Hash *pHash, Fts5HashEntry *p){
   191    if( p->iSzPoslist ){
   192      u8 *pPtr = (u8*)p;
   193      if( pHash->eDetail==FTS5_DETAIL_NONE ){
   194        assert( p->nData==p->iSzPoslist );
   195        if( p->bDel ){
   196          pPtr[p->nData++] = 0x00;
   197          if( p->bContent ){
   198            pPtr[p->nData++] = 0x00;
   199          }
   200        }
   201      }else{
   202        int nSz = (p->nData - p->iSzPoslist - 1);       /* Size in bytes */
   203        int nPos = nSz*2 + p->bDel;                     /* Value of nPos field */
   204  
   205        assert( p->bDel==0 || p->bDel==1 );
   206        if( nPos<=127 ){
   207          pPtr[p->iSzPoslist] = (u8)nPos;
   208        }else{
   209          int nByte = sqlite3Fts5GetVarintLen((u32)nPos);
   210          memmove(&pPtr[p->iSzPoslist + nByte], &pPtr[p->iSzPoslist + 1], nSz);
   211          sqlite3Fts5PutVarint(&pPtr[p->iSzPoslist], nPos);
   212          p->nData += (nByte-1);
   213        }
   214      }
   215  
   216      p->iSzPoslist = 0;
   217      p->bDel = 0;
   218      p->bContent = 0;
   219    }
   220  }
   221  
   222  /*
   223  ** Add an entry to the in-memory hash table. The key is the concatenation
   224  ** of bByte and (pToken/nToken). The value is (iRowid/iCol/iPos).
   225  **
   226  **     (bByte || pToken) -> (iRowid,iCol,iPos)
   227  **
   228  ** Or, if iCol is negative, then the value is a delete marker.
   229  */
   230  int sqlite3Fts5HashWrite(
   231    Fts5Hash *pHash,
   232    i64 iRowid,                     /* Rowid for this entry */
   233    int iCol,                       /* Column token appears in (-ve -> delete) */
   234    int iPos,                       /* Position of token within column */
   235    char bByte,                     /* Byte stored ahead of the token as key[0] */
   236    const char *pToken, int nToken  /* Token to add or remove to or from index */
   237  ){
          /* Returns SQLITE_OK on success, or SQLITE_NOMEM if the slot array
          ** or the entry could not be allocated or grown. On success the
          ** counter at pHash->pnByte is adjusted by nIncr. */
   238    unsigned int iHash;
   239    Fts5HashEntry *p;
   240    u8 *pPtr;
   241    int nIncr = 0;                  /* Amount to increment (*pHash->pnByte) by */
   242    int bNew;                       /* If non-delete entry should be written */
   243    
          /* In detail=full mode every call appends a position. In the other
          ** modes bNew is only set further down, when the rowid or the column
          ** differs from the last value written. */
   244    bNew = (pHash->eDetail==FTS5_DETAIL_FULL);
   245  
   246    /* Attempt to locate an existing hash entry */
   247    iHash = fts5HashKey2(pHash->nSlot, (u8)bByte, (const u8*)pToken, nToken);
   248    for(p=pHash->aSlot[iHash]; p; p=p->pHashNext){
   249      char *zKey = fts5EntryKey(p);
   250      if( zKey[0]==bByte 
   251       && p->nKey==nToken
   252       && memcmp(&zKey[1], pToken, nToken)==0 
   253      ){
   254        break;
   255      }
   256    }
   257  
   258    /* If an existing hash entry cannot be found, create a new one. */
   259    if( p==0 ){
   260      /* Figure out how much space to allocate: header, key (bByte plus
            ** token), nul terminator and 64 spare bytes, but never fewer than
            ** 128 bytes in total. */
   261      char *zKey;
   262      int nByte = sizeof(Fts5HashEntry) + (nToken+1) + 1 + 64;
   263      if( nByte<128 ) nByte = 128;
   264  
   265      /* Grow the Fts5Hash.aSlot[] array if necessary. The slot index must
            ** be recomputed afterwards because nSlot has changed. */
   266      if( (pHash->nEntry*2)>=pHash->nSlot ){
   267        int rc = fts5HashResize(pHash);
   268        if( rc!=SQLITE_OK ) return rc;
   269        iHash = fts5HashKey2(pHash->nSlot, (u8)bByte, (const u8*)pToken, nToken);
   270      }
   271  
   272      /* Allocate new Fts5HashEntry and add it to the hash table. */
   273      p = (Fts5HashEntry*)sqlite3_malloc(nByte);
   274      if( !p ) return SQLITE_NOMEM;
   275      memset(p, 0, sizeof(Fts5HashEntry));
   276      p->nAlloc = nByte;
   277      zKey = fts5EntryKey(p);
   278      zKey[0] = bByte;
   279      memcpy(&zKey[1], pToken, nToken);
   280      assert( iHash==fts5HashKey(pHash->nSlot, (u8*)zKey, nToken+1) );
   281      p->nKey = nToken;
   282      zKey[nToken+1] = '\0';
   283      p->nData = nToken+1 + 1 + sizeof(Fts5HashEntry);
   284      p->pHashNext = pHash->aSlot[iHash];
   285      pHash->aSlot[iHash] = p;
   286      pHash->nEntry++;
   287  
   288      /* Add the first rowid field to the hash-entry */
   289      p->nData += sqlite3Fts5PutVarint(&((u8*)p)[p->nData], iRowid);
   290      p->iRowid = iRowid;
   291  
            /* Remember where the poslist-size field goes. Except in detail=none
            ** mode, reserve a single placeholder byte for it. */
   292      p->iSzPoslist = p->nData;
   293      if( pHash->eDetail!=FTS5_DETAIL_NONE ){
   294        p->nData += 1;
   295        p->iCol = (pHash->eDetail==FTS5_DETAIL_FULL ? 0 : -1);
   296      }
   297  
            /* NOTE(review): p->nData is added to nIncr again at the end of this
            ** function, so a new entry is charged its initial size twice.
            ** Confirm whether that is intended. */
   298      nIncr += p->nData;
   299    }else{
   300  
   301      /* Appending to an existing hash-entry. Check that there is enough 
   302      ** space to append the largest possible new entry. Worst case scenario 
   303      ** is:
   304      **
   305      **     + 9 bytes for a new rowid,
   306      **     + 4 byte reserved for the "poslist size" varint.
   307      **     + 1 byte for a "new column" byte,
   308      **     + 3 bytes for a new column number (16-bit max) as a varint,
   309      **     + 5 bytes for the new position offset (32-bit max).
   310      */
   311      if( (p->nAlloc - p->nData) < (9 + 4 + 1 + 3 + 5) ){
   312        int nNew = p->nAlloc * 2;
   313        Fts5HashEntry *pNew;
   314        Fts5HashEntry **pp;
   315        pNew = (Fts5HashEntry*)sqlite3_realloc(p, nNew);
   316        if( pNew==0 ) return SQLITE_NOMEM;
   317        pNew->nAlloc = nNew;
              /* The entry may have moved. Find the link in this slot's chain
              ** that still holds the old address and repoint it. */
   318        for(pp=&pHash->aSlot[iHash]; *pp!=p; pp=&(*pp)->pHashNext);
   319        *pp = pNew;
   320        p = pNew;
   321      }
            /* Subtract the current size so that, once the final size is added
            ** below, nIncr holds the growth of this entry. */
   322      nIncr -= p->nData;
   323    }
   324    assert( (p->nAlloc - p->nData) >= (9 + 4 + 1 + 3 + 5) );
   325  
   326    pPtr = (u8*)p;
   327  
   328    /* If this is a new rowid, finalise the poslist-size field of the previous
   329    ** entry, then append the new rowid as a delta from the previous one.  */
   330    if( iRowid!=p->iRowid ){
   331      fts5HashAddPoslistSize(pHash, p);
   332      p->nData += sqlite3Fts5PutVarint(&pPtr[p->nData], iRowid - p->iRowid);
   333      p->iRowid = iRowid;
   334      bNew = 1;
   335      p->iSzPoslist = p->nData;
   336      if( pHash->eDetail!=FTS5_DETAIL_NONE ){
   337        p->nData += 1;
   338        p->iCol = (pHash->eDetail==FTS5_DETAIL_FULL ? 0 : -1);
   339        p->iPos = 0;
   340      }
   341    }
   342  
   343    if( iCol>=0 ){
   344      if( pHash->eDetail==FTS5_DETAIL_NONE ){
   345        p->bContent = 1;
   346      }else{
   347        /* Append a new column value, if necessary */
   348        assert( iCol>=p->iCol );
   349        if( iCol!=p->iCol ){
   350          if( pHash->eDetail==FTS5_DETAIL_FULL ){
                  /* A 0x01 byte followed by the column number starts a new
                  ** column. Positions restart from zero. */
   351            pPtr[p->nData++] = 0x01;
   352            p->nData += sqlite3Fts5PutVarint(&pPtr[p->nData], iCol);
   353            p->iCol = (i16)iCol;
   354            p->iPos = 0;
   355          }else{
                  /* Not detail=full: the column number itself is written in
                  ** place of a position, once per column. */
   356            bNew = 1;
   357            p->iCol = (i16)(iPos = iCol);
   358          }
   359        }
   360  
   361        /* Append the new position offset, if necessary. It is stored as the
              ** delta from the previous position, plus 2. */
   362        if( bNew ){
   363          p->nData += sqlite3Fts5PutVarint(&pPtr[p->nData], iPos - p->iPos + 2);
   364          p->iPos = iPos;
   365        }
   366      }
   367    }else{
   368      /* This is a delete. Set the delete flag. */
   369      p->bDel = 1;
   370    }
   371  
   372    nIncr += p->nData;
   373    *pHash->pnByte += nIncr;
   374    return SQLITE_OK;
   375  }
   376  
   377  
   378  /*
   379  ** Arguments pLeft and pRight point to linked-lists of hash-entry objects,
   380  ** each sorted in key order. This function merges the two lists into a
   381  ** single list and returns a pointer to its first element.
   382  */
   383  static Fts5HashEntry *fts5HashEntryMerge(
   384    Fts5HashEntry *pLeft,
   385    Fts5HashEntry *pRight
   386  ){
   387    Fts5HashEntry *p1 = pLeft;
   388    Fts5HashEntry *p2 = pRight;
   389    Fts5HashEntry *pRet = 0;
   390    Fts5HashEntry **ppOut = &pRet;
   391  
   392    while( p1 || p2 ){
   393      if( p1==0 ){
   394        *ppOut = p2;
   395        p2 = 0;
   396      }else if( p2==0 ){
   397        *ppOut = p1;
   398        p1 = 0;
   399      }else{
   400        int i = 0;
   401        char *zKey1 = fts5EntryKey(p1);
   402        char *zKey2 = fts5EntryKey(p2);
   403        while( zKey1[i]==zKey2[i] ) i++;
   404  
   405        if( ((u8)zKey1[i])>((u8)zKey2[i]) ){
   406          /* p2 is smaller */
   407          *ppOut = p2;
   408          ppOut = &p2->pScanNext;
   409          p2 = p2->pScanNext;
   410        }else{
   411          /* p1 is smaller */
   412          *ppOut = p1;
   413          ppOut = &p1->pScanNext;
   414          p1 = p1->pScanNext;
   415        }
   416        *ppOut = 0;
   417      }
   418    }
   419  
   420    return pRet;
   421  }
   422  
   423  /*
   424  ** Extract all tokens from hash table iHash and link them into a list
   425  ** in sorted order. The hash table is cleared before returning. It is
   426  ** the responsibility of the caller to free the elements of the returned
   427  ** list.
   428  */
   429  static int fts5HashEntrySort(
   430    Fts5Hash *pHash, 
   431    const char *pTerm, int nTerm,   /* Query prefix, if any */
   432    Fts5HashEntry **ppSorted
   433  ){
   434    const int nMergeSlot = 32;
   435    Fts5HashEntry **ap;
   436    Fts5HashEntry *pList;
   437    int iSlot;
   438    int i;
   439  
   440    *ppSorted = 0;
   441    ap = sqlite3_malloc(sizeof(Fts5HashEntry*) * nMergeSlot);
   442    if( !ap ) return SQLITE_NOMEM;
   443    memset(ap, 0, sizeof(Fts5HashEntry*) * nMergeSlot);
   444  
   445    for(iSlot=0; iSlot<pHash->nSlot; iSlot++){
   446      Fts5HashEntry *pIter;
   447      for(pIter=pHash->aSlot[iSlot]; pIter; pIter=pIter->pHashNext){
   448        if( pTerm==0 || 0==memcmp(fts5EntryKey(pIter), pTerm, nTerm) ){
   449          Fts5HashEntry *pEntry = pIter;
   450          pEntry->pScanNext = 0;
   451          for(i=0; ap[i]; i++){
   452            pEntry = fts5HashEntryMerge(pEntry, ap[i]);
   453            ap[i] = 0;
   454          }
   455          ap[i] = pEntry;
   456        }
   457      }
   458    }
   459  
   460    pList = 0;
   461    for(i=0; i<nMergeSlot; i++){
   462      pList = fts5HashEntryMerge(pList, ap[i]);
   463    }
   464  
   465    pHash->nEntry = 0;
   466    sqlite3_free(ap);
   467    *ppSorted = pList;
   468    return SQLITE_OK;
   469  }
   470  
   471  /*
   472  ** Query the hash table for a doclist associated with term pTerm/nTerm.
   473  */
   474  int sqlite3Fts5HashQuery(
   475    Fts5Hash *pHash,                /* Hash table to query */
   476    const char *pTerm, int nTerm,   /* Query term */
   477    const u8 **ppDoclist,           /* OUT: Pointer to doclist for pTerm */
   478    int *pnDoclist                  /* OUT: Size of doclist in bytes */
   479  ){
   480    unsigned int iHash = fts5HashKey(pHash->nSlot, (const u8*)pTerm, nTerm);
   481    char *zKey = 0;
   482    Fts5HashEntry *p;
   483  
   484    for(p=pHash->aSlot[iHash]; p; p=p->pHashNext){
   485      zKey = fts5EntryKey(p);
   486      if( memcmp(zKey, pTerm, nTerm)==0 && zKey[nTerm]==0 ) break;
   487    }
   488  
   489    if( p ){
   490      fts5HashAddPoslistSize(pHash, p);
   491      *ppDoclist = (const u8*)&zKey[nTerm+1];
   492      *pnDoclist = p->nData - (sizeof(Fts5HashEntry) + nTerm + 1);
   493    }else{
   494      *ppDoclist = 0;
   495      *pnDoclist = 0;
   496    }
   497  
   498    return SQLITE_OK;
   499  }
   500  
   501  int sqlite3Fts5HashScanInit(
   502    Fts5Hash *p,                    /* Hash table to query */
   503    const char *pTerm, int nTerm    /* Query prefix */
   504  ){
   505    return fts5HashEntrySort(p, pTerm, nTerm, &p->pScan);
   506  }
   507  
   508  void sqlite3Fts5HashScanNext(Fts5Hash *p){
   509    assert( !sqlite3Fts5HashScanEof(p) );
   510    p->pScan = p->pScan->pScanNext;
   511  }
   512  
   513  int sqlite3Fts5HashScanEof(Fts5Hash *p){
   514    return (p->pScan==0);
   515  }
   516  
   517  void sqlite3Fts5HashScanEntry(
   518    Fts5Hash *pHash,
   519    const char **pzTerm,            /* OUT: term (nul-terminated) */
   520    const u8 **ppDoclist,           /* OUT: pointer to doclist */
   521    int *pnDoclist                  /* OUT: size of doclist in bytes */
   522  ){
   523    Fts5HashEntry *p;
   524    if( (p = pHash->pScan) ){
   525      char *zKey = fts5EntryKey(p);
   526      int nTerm = (int)strlen(zKey);
   527      fts5HashAddPoslistSize(pHash, p);
   528      *pzTerm = zKey;
   529      *ppDoclist = (const u8*)&zKey[nTerm+1];
   530      *pnDoclist = p->nData - (sizeof(Fts5HashEntry) + nTerm + 1);
   531    }else{
   532      *pzTerm = 0;
   533      *ppDoclist = 0;
   534      *pnDoclist = 0;
   535    }
   536  }
   537