modernc.org/cc@v1.0.1/v2/testdata/_sqlite/ext/lsm1/lsm_log.c (about)

     1  /*
     2  ** 2011-08-13
     3  **
     4  ** The author disclaims copyright to this source code.  In place of
     5  ** a legal notice, here is a blessing:
     6  **
     7  **    May you do good and not evil.
     8  **    May you find forgiveness for yourself and forgive others.
     9  **    May you share freely, never taking more than you give.
    10  **
    11  *************************************************************************
    12  **
    13  ** This file contains the implementation of LSM database logging. Logging
    14  ** has one purpose in LSM - to make transactions durable.
    15  **
    16  ** When data is written to an LSM database, it is initially stored in an
    17  ** in-memory tree structure. Since this structure is in volatile memory,
    18  ** if a power failure or application crash occurs it may be lost. To
    19  ** prevent loss of data in this case, each time a record is written to the
    20  ** in-memory tree an equivalent record is appended to the log on disk.
    21  ** If a power failure or application crash does occur, data can be recovered
    22  ** by reading the log.
    23  **
    24  ** A log file consists of the following types of records representing data
    25  ** written into the database:
    26  **
    27  **   LOG_WRITE:  A key-value pair written to the database.
    28  **   LOG_DELETE: A delete key issued to the database.
    29  **   LOG_COMMIT: A transaction commit.
    30  **
    31  ** And the following types of records for ancillary purposes..
    32  **
    33  **   LOG_EOF:    A record indicating the end of a log file.
    34  **   LOG_PAD1:   A single byte padding record.
    35  **   LOG_PAD2:   An N byte padding record (N>1).
    36  **   LOG_JUMP:   A pointer to another offset within the log file.
    37  **
    38  ** Each transaction written to the log contains one or more LOG_WRITE and/or
    39  ** LOG_DELETE records, followed by a LOG_COMMIT record. The LOG_COMMIT record
    40  ** contains an 8-byte checksum based on all previous data written to the
    41  ** log file.
    42  **
    43  ** LOG CHECKSUMS & RECOVERY
    44  **
    45  **   Checksums are found in two types of log records: LOG_COMMIT and
    46  **   LOG_CKSUM records. In order to recover content from a log, a client
    47  **   reads each record from the start of the log, calculating a checksum as
    48  **   it does. Each time a LOG_COMMIT or LOG_CKSUM is encountered, the 
    49  **   recovery process verifies that the checksum stored in the log 
    50  **   matches the calculated checksum. If it does not, the recovery process
    51  **   can stop reading the log.
    52  **
    53  **   If a recovery process reads records (other than COMMIT or CKSUM) 
    54  **   consisting of at least LSM_CKSUM_MAXDATA bytes, then the next record in
    55  **   the log must be either a LOG_CKSUM or LOG_COMMIT record. If it is
    56  **   not, the recovery process also stops reading the log.
    57  **
    58  **   To recover the log file, it must be read twice. The first time to 
    59  **   determine the location of the last valid commit record. And the second
    60  **   time to load data into the in-memory tree.
    61  **
    62  **   Todo: Surely there is a better way...
    63  **
    64  ** LOG WRAPPING
    65  **
    66  **   If the log file were never deleted or wrapped, it would be possible to
    67  **   read it from start to end each time recovery is required (i.e. each time
    68  **   the number of database clients changes from 0 to 1). Effectively reading
    69  **   the entire history of the database each time. This would quickly become 
    70  **   inefficient. Additionally, since the log file would grow without bound,
    71  **   it wastes storage space.
    72  **
    73  **   Instead, part of each checkpoint written into the database file contains 
    74  **   a log offset (and other information required to read the log starting at
    75  **   this offset) at which to begin recovery. Offset $O.
    76  **
    77  **   Once a checkpoint has been written and synced into the database file, it
    78  **   is guaranteed that no recovery process will need to read any data before
    79  **   offset $O of the log file. It is therefore safe to begin overwriting
    80  **   any data that occurs before offset $O.
    81  **
    82  **   This implementation separates the log into three regions mapped into
    83  **   the log file - regions 0, 1 and 2. During recovery, regions are read
    84  **   in ascending order (i.e. 0, then 1, then 2). Each region is zero or
    85  **   more bytes in size.
    86  **
    87  **     |---1---|..|--0--|.|--2--|....
    88  **
    89  **   New records are always appended to the end of region 2.
    90  **
    91  **   Initially (when it is empty), all three regions are zero bytes in size.
    92  **   Each of them is located at the beginning of the file. As records are
    93  **   added to the log, region 2 grows, so that the log consists of a zero
    94  **   byte region 1, followed by a zero byte region 0, followed by an N byte
    95  **   region 2. After one or more checkpoints have been written to disk, 
    96  **   the start point of region 2 is moved to $O. For example:
    97  **
    98  **     A) ||.........|--2--|....
    99  **   
   100  **   (both regions 0 and 1 are 0 bytes in size at offset 0).
   101  **
   102  **   Eventually, the log wraps around to write new records into the start.
   103  **   At this point, region 2 is renamed to region 0. Region 0 is renamed
   104  **   to region 2. After appending a few records to the new region 2, the
   105  **   log file looks like this:
   106  **
   107  **     B) ||--2--|...|--0--|....
   108  **
   109  **   (region 1 is still 0 bytes in size, located at offset 0).
   110  **
   111  **   Any checkpoints made at this point may reduce the size of region 0.
   112  **   However, if they do not, and region 2 expands so that it is about to
   113  **   overwrite the start of region 0, then region 2 is renamed to region 1,
   114  **   and a new region 2 created at the end of the file following the existing
   115  **   region 0.
   116  **
   117  **     C) |---1---|..|--0--|.|-2-|
   118  **
   119  **   In this state records are appended to region 2 until checkpoints have
   120  **   contracted regions 0 and 1 until they are both zero bytes in size. They 
   121  **   are then shifted to the start of the log file, leaving the system in 
   122  **   the equivalent of state A above.
   123  **
   124  **   Alternatively, state B may transition directly to state A if the size
   125  **   of region 0 is reduced to zero bytes before region 2 threatens to 
   126  **   encroach upon it.
   127  **
   128  ** LOG_PAD1 & LOG_PAD2 RECORDS
   129  **
   130  **   PAD1 and PAD2 records may appear in a log file at any point. They allow
   131  **   a process writing the log file to align the beginning of transactions with 
   132  **   the beginning of disk sectors, which increases robustness.
   133  **
   134  ** RECORD FORMATS:
   135  **
   136  **   LOG_EOF:    * A single 0x00 byte.
   137  **
   138  **   LOG_PAD1:   * A single 0x01 byte.
   139  **
   140  **   LOG_PAD2:   * A single 0x02 byte, followed by
   141  **               * The number of unused bytes (N) as a varint,
   142  **               * An N byte block of unused space.
   143  **
   144  **   LOG_COMMIT: * A single 0x03 byte.
   145  **               * An 8-byte checksum.
   146  **
   147  **   LOG_JUMP:   * A single 0x04 byte.
   148  **               * Absolute file offset to jump to, encoded as a varint.
   149  **
   150  **   LOG_WRITE:  * A single 0x06 or 0x07 byte, 
   151  **               * The number of bytes in the key, encoded as a varint, 
   152  **               * The number of bytes in the value, encoded as a varint, 
   153  **               * If the first byte was 0x07, an 8 byte checksum.
   154  **               * The key data,
   155  **               * The value data.
   156  **
   157  **   LOG_DELETE: * A single 0x08 or 0x09 byte, 
   158  **               * The number of bytes in the key, encoded as a varint, 
   159  **               * If the first byte was 0x09, an 8 byte checksum.
   160  **               * The key data.
   161  **
   162  **   Varints are as described in lsm_varint.c (SQLite 4 format).
   163  **
   164  ** CHECKSUMS:
   165  **
   166  **   The checksum is calculated using two 32-bit unsigned integers, s0 and
   167  **   s1. The initial value for both is 42. It is updated each time a record
   168  **   is written into the log file by treating the encoded (binary) record as 
   169  **   an array of 32-bit little-endian integers. Then, if x[] is the integer
   170  **   array, updating the checksum accumulators as follows:
   171  **
   172  **     for i from 0 to n-1 step 2:
   173  **       s0 += x[i] + s1;
   174  **       s1 += x[i+1] + s0;
   175  **     endfor
   176  **
   177  **   If the record is not an even multiple of 8-bytes in size it is padded
   178  **   with zeroes to make it so before the checksum is updated.
   179  **
   180  **   The checksum stored in a COMMIT, WRITE or DELETE is based on all bytes
   181  **   up to the start of the 8-byte checksum itself, including the COMMIT,
   182  **   WRITE or DELETE fields that appear before the checksum in the record.
   183  **
   184  ** VARINT FORMAT
   185  **
   186  ** See lsm_varint.c.
   187  */
   188  
   189  #ifndef _LSM_INT_H
   190  # include "lsmInt.h"
   191  #endif
   192  
   193  /* Log record types */
   194  #define LSM_LOG_EOF          0x00
   195  #define LSM_LOG_PAD1         0x01
   196  #define LSM_LOG_PAD2         0x02
   197  #define LSM_LOG_COMMIT       0x03
   198  #define LSM_LOG_JUMP         0x04
   199  
   200  #define LSM_LOG_WRITE        0x06
   201  #define LSM_LOG_WRITE_CKSUM  0x07
   202  
   203  #define LSM_LOG_DELETE       0x08
   204  #define LSM_LOG_DELETE_CKSUM 0x09
   205  
   206  #define LSM_LOG_DRANGE       0x0A
   207  #define LSM_LOG_DRANGE_CKSUM 0x0B
   208  
   209  /* Require a checksum every 32KB. */
   210  #define LSM_CKSUM_MAXDATA (32*1024)
   211  
   212  /* Do not wrap a log file smaller than this in bytes. */
   213  #define LSM_MIN_LOGWRAP      (128*1024)
   214  
/*
** State used to append records to the log file during a single write
** transaction. The object is set up by lsmLogBegin() and its results are
** copied into the shared log header by lsmLogEnd() on commit.
**
** szSector:
**   Commit records must be aligned to end on szSector boundaries. If
**   the safety-mode is set to NORMAL or OFF, this value is 1. Otherwise,
**   if the safety-mode is set to FULL, it is the size of the file-system
**   sectors as reported by lsmFsSectorSize().
**
** cksum0, cksum1, iCksumBuf:
**   The running checksum covers all log data up to file offset iOff plus
**   the first iCksumBuf bytes of buf. See logUpdateCksum().
**
** iRegion1End, iRegion2Start:
**   Both are zero unless the transaction has written a JUMP record (see
**   jumpIfRequired()). In that case iRegion1End is the offset just past
**   the JUMP record and iRegion2Start is the offset jumped to.
**
** buf:
**   Must remain the final member of the structure. When an existing
**   LogWriter is reused, lsmLogBegin() zeroes every byte that precedes
**   buf while leaving buf (and its allocation) intact.
*/
struct LogWriter {
  u32 cksum0;                     /* Checksum 0 at offset iOff */
  u32 cksum1;                     /* Checksum 1 at offset iOff */
  int iCksumBuf;                  /* Bytes of buf that have been checksummed */
  i64 iOff;                       /* Offset at start of buffer buf */
  int szSector;                   /* Sector size for this transaction */
  LogRegion jump;                 /* Avoid writing to this region */
  i64 iRegion1End;                /* End of first region written by trans */
  i64 iRegion2Start;              /* Start of second regions written by trans */
  LsmString buf;                  /* Buffer containing data not yet written */
};
   233  
   234  /*
   235  ** Return the result of interpreting the first 4 bytes in buffer aIn as 
   236  ** a 32-bit unsigned little-endian integer.
   237  */
   238  static u32 getU32le(u8 *aIn){
   239    return ((u32)aIn[3] << 24) 
   240         + ((u32)aIn[2] << 16) 
   241         + ((u32)aIn[1] << 8) 
   242         + ((u32)aIn[0]);
   243  }
   244  
   245  
   246  /*
   247  ** This function is the same as logCksum(), except that pointer "a" need
   248  ** not be aligned to an 8-byte boundary or padded with zero bytes. This
   249  ** version is slower, but sometimes more convenient to use.
   250  */
   251  static void logCksumUnaligned(
   252    char *z,                        /* Input buffer */
   253    int n,                          /* Size of input buffer in bytes */
   254    u32 *pCksum0,                   /* IN/OUT: Checksum value 1 */
   255    u32 *pCksum1                    /* IN/OUT: Checksum value 2 */
   256  ){
   257    u8 *a = (u8 *)z;
   258    u32 cksum0 = *pCksum0;
   259    u32 cksum1 = *pCksum1;
   260    int nIn = (n/8) * 8;
   261    int i;
   262  
   263    assert( n>0 );
   264    for(i=0; i<nIn; i+=8){
   265      cksum0 += getU32le(&a[i]) + cksum1;
   266      cksum1 += getU32le(&a[i+4]) + cksum0;
   267    }
   268  
   269    if( nIn!=n ){
   270      u8 aBuf[8] = {0, 0, 0, 0, 0, 0, 0, 0};
   271      assert( (n-nIn)<8 && n>nIn );
   272      memcpy(aBuf, &a[nIn], n-nIn);
   273      cksum0 += getU32le(aBuf) + cksum1;
   274      cksum1 += getU32le(&aBuf[4]) + cksum0;
   275    }
   276  
   277    *pCksum0 = cksum0;
   278    *pCksum1 = cksum1;
   279  }
   280  
   281  /*
   282  ** Update pLog->cksum0 and pLog->cksum1 so that the first nBuf bytes in the 
   283  ** write buffer (pLog->buf) are included in the checksum.
   284  */
   285  static void logUpdateCksum(LogWriter *pLog, int nBuf){
   286    assert( (pLog->iCksumBuf % 8)==0 );
   287    assert( pLog->iCksumBuf<=nBuf );
   288    assert( (nBuf % 8)==0 || nBuf==pLog->buf.n );
   289    if( nBuf>pLog->iCksumBuf ){
   290      logCksumUnaligned(
   291          &pLog->buf.z[pLog->iCksumBuf], nBuf-pLog->iCksumBuf, 
   292          &pLog->cksum0, &pLog->cksum1
   293      );
   294    }
   295    pLog->iCksumBuf = nBuf;
   296  }
   297  
   298  static i64 firstByteOnSector(LogWriter *pLog, i64 iOff){
   299    return (iOff / pLog->szSector) * pLog->szSector;
   300  }
   301  static i64 lastByteOnSector(LogWriter *pLog, i64 iOff){
   302    return firstByteOnSector(pLog, iOff) + pLog->szSector - 1;
   303  }
   304  
   305  /*
   306  ** If possible, reclaim log file space. Log file space is reclaimed after
   307  ** a snapshot that points to the same data in the database file is synced
   308  ** into the db header.
   309  */
   310  static int logReclaimSpace(lsm_db *pDb){
   311    int rc;
   312    int iMeta;
   313    int bRotrans;                   /* True if there exists some ro-trans */
   314  
   315    /* Test if there exists some other connection with a read-only transaction
   316    ** open. If there does, then log file space may not be reclaimed.  */
   317    rc = lsmDetectRoTrans(pDb, &bRotrans);
   318    if( rc!=LSM_OK || bRotrans ) return rc;
   319  
   320    iMeta = (int)pDb->pShmhdr->iMetaPage;
   321    if( iMeta==1 || iMeta==2 ){
   322      DbLog *pLog = &pDb->treehdr.log;
   323      i64 iSyncedId;
   324  
   325      /* Read the snapshot-id of the snapshot stored on meta-page iMeta. Note
   326      ** that in theory, the value read is untrustworthy (due to a race 
   327      ** condition - see comments above lsmFsReadSyncedId()). So it is only 
   328      ** ever used to conclude that no log space can be reclaimed. If it seems
   329      ** to indicate that it may be possible to reclaim log space, a
   330      ** second call to lsmCheckpointSynced() (which does return trustworthy
   331      ** values) is made below to confirm.  */
   332      rc = lsmFsReadSyncedId(pDb, iMeta, &iSyncedId);
   333  
   334      if( rc==LSM_OK && pLog->iSnapshotId!=iSyncedId ){
   335        i64 iSnapshotId = 0;
   336        i64 iOff = 0;
   337        rc = lsmCheckpointSynced(pDb, &iSnapshotId, &iOff, 0);
   338        if( rc==LSM_OK && pLog->iSnapshotId<iSnapshotId ){
   339          int iRegion;
   340          for(iRegion=0; iRegion<3; iRegion++){
   341            LogRegion *p = &pLog->aRegion[iRegion];
   342            if( iOff>=p->iStart && iOff<=p->iEnd ) break;
   343            p->iStart = 0;
   344            p->iEnd = 0;
   345          }
   346          assert( iRegion<3 );
   347          pLog->aRegion[iRegion].iStart = iOff;
   348          pLog->iSnapshotId = iSnapshotId;
   349        }
   350      }
   351    }
   352    return rc;
   353  }
   354  
   355  /*
   356  ** This function is called when a write-transaction is first opened. It
   357  ** is assumed that the caller is holding the client-mutex when it is 
   358  ** called.
   359  **
   360  ** Before returning, this function allocates the LogWriter object that
   361  ** will be used to write to the log file during the write transaction.
   362  ** LSM_OK is returned if no error occurs, otherwise an LSM error code.
   363  */
   364  int lsmLogBegin(lsm_db *pDb){
   365    int rc = LSM_OK;
   366    LogWriter *pNew;
   367    LogRegion *aReg;
   368  
   369    if( pDb->bUseLog==0 ) return LSM_OK;
   370  
   371    /* If the log file has not yet been opened, open it now. Also allocate
   372    ** the LogWriter structure, if it has not already been allocated.  */
   373    rc = lsmFsOpenLog(pDb, 0);
   374    if( pDb->pLogWriter==0 ){
   375      pNew = lsmMallocZeroRc(pDb->pEnv, sizeof(LogWriter), &rc);
   376      if( pNew ){
   377        lsmStringInit(&pNew->buf, pDb->pEnv);
   378        rc = lsmStringExtend(&pNew->buf, 2);
   379      }
   380      pDb->pLogWriter = pNew;
   381    }else{
   382      pNew = pDb->pLogWriter;
   383      assert( (u8 *)(&pNew[1])==(u8 *)(&((&pNew->buf)[1])) );
   384      memset(pNew, 0, ((u8 *)&pNew->buf) - (u8 *)pNew);
   385      pNew->buf.n = 0;
   386    }
   387  
   388    if( rc==LSM_OK ){
   389      /* The following call detects whether or not a new snapshot has been 
   390      ** synced into the database file. If so, it updates the contents of
   391      ** the pDb->treehdr.log structure to reclaim any space in the log
   392      ** file that is no longer required. 
   393      **
   394      ** TODO: Calling this every transaction is overkill. And since the 
   395      ** call has to read and checksum a snapshot from the database file,
   396      ** it is expensive. It would be better to figure out a way so that
   397      ** this is only called occasionally - say for every 32KB written to 
   398      ** the log file.
   399      */
   400      rc = logReclaimSpace(pDb);
   401    }
   402    if( rc!=LSM_OK ){
   403      lsmLogClose(pDb);
   404      return rc;
   405    }
   406  
   407    /* Set the effective sector-size for this transaction. Sectors are assumed
   408    ** to be one byte in size if the safety-mode is OFF or NORMAL, or as
   409    ** reported by lsmFsSectorSize if it is FULL.  */
   410    if( pDb->eSafety==LSM_SAFETY_FULL ){
   411      pNew->szSector = lsmFsSectorSize(pDb->pFS);
   412      assert( pNew->szSector>0 );
   413    }else{
   414      pNew->szSector = 1;
   415    }
   416  
   417    /* There are now three scenarios:
   418    **
   419    **   1) Regions 0 and 1 are both zero bytes in size and region 2 begins
   420    **      at a file offset greater than LSM_MIN_LOGWRAP. In this case, wrap
   421    **      around to the start and write data into the start of the log file. 
   422    **
   423    **   2) Region 1 is zero bytes in size and region 2 occurs earlier in the 
   424    **      file than region 0. In this case, append data to region 2, but
   425    **      remember to jump over region 1 if required.
   426    **
   427    **   3) Region 2 is the last in the file. Append to it.
   428    */
   429    aReg = &pDb->treehdr.log.aRegion[0];
   430  
   431    assert( aReg[0].iEnd==0 || aReg[0].iEnd>aReg[0].iStart );
   432    assert( aReg[1].iEnd==0 || aReg[1].iEnd>aReg[1].iStart );
   433  
   434    pNew->cksum0 = pDb->treehdr.log.cksum0;
   435    pNew->cksum1 = pDb->treehdr.log.cksum1;
   436  
   437    if( aReg[0].iEnd==0 && aReg[1].iEnd==0 && aReg[2].iStart>=LSM_MIN_LOGWRAP ){
   438      /* Case 1. Wrap around to the start of the file. Write an LSM_LOG_JUMP 
   439      ** into the log file in this case. Pad it out to 8 bytes using a PAD2
   440      ** record so that the checksums can be updated immediately.  */
   441      u8 aJump[] = { 
   442        LSM_LOG_PAD2, 0x04, 0x00, 0x00, 0x00, 0x00, LSM_LOG_JUMP, 0x00 
   443      };
   444  
   445      lsmStringBinAppend(&pNew->buf, aJump, sizeof(aJump));
   446      logUpdateCksum(pNew, pNew->buf.n);
   447      rc = lsmFsWriteLog(pDb->pFS, aReg[2].iEnd, &pNew->buf);
   448      pNew->iCksumBuf = pNew->buf.n = 0;
   449  
   450      aReg[2].iEnd += 8;
   451      pNew->jump = aReg[0] = aReg[2];
   452      aReg[2].iStart = aReg[2].iEnd = 0;
   453    }else if( aReg[1].iEnd==0 && aReg[2].iEnd<aReg[0].iEnd ){
   454      /* Case 2. */
   455      pNew->iOff = aReg[2].iEnd;
   456      pNew->jump = aReg[0];
   457    }else{
   458      /* Case 3. */
   459      assert( aReg[2].iStart>=aReg[0].iEnd && aReg[2].iStart>=aReg[1].iEnd );
   460      pNew->iOff = aReg[2].iEnd;
   461    }
   462  
   463    if( pNew->jump.iStart ){
   464      i64 iRound;
   465      assert( pNew->jump.iStart>pNew->iOff );
   466  
   467      iRound = firstByteOnSector(pNew, pNew->jump.iStart);
   468      if( iRound>pNew->iOff ) pNew->jump.iStart = iRound;
   469      pNew->jump.iEnd = lastByteOnSector(pNew, pNew->jump.iEnd);
   470    }
   471  
   472    assert( pDb->pLogWriter==pNew );
   473    return rc;
   474  }
   475  
   476  /*
   477  ** This function is called when a write-transaction is being closed.
   478  ** Parameter bCommit is true if the transaction is being committed,
   479  ** or false otherwise. The caller must hold the client-mutex to call
   480  ** this function.
   481  **
   482  ** A call to this function deletes the LogWriter object allocated by
   483  ** lsmLogBegin(). If the transaction is being committed, the shared state
   484  ** in *pLog is updated before returning.
   485  */
   486  void lsmLogEnd(lsm_db *pDb, int bCommit){
   487    DbLog *pLog;
   488    LogWriter *p;
   489    p = pDb->pLogWriter;
   490  
   491    if( p==0 ) return;
   492    pLog = &pDb->treehdr.log;
   493  
   494    if( bCommit ){
   495      pLog->aRegion[2].iEnd = p->iOff;
   496      pLog->cksum0 = p->cksum0;
   497      pLog->cksum1 = p->cksum1;
   498      if( p->iRegion1End ){
   499        /* This happens when the transaction had to jump over some other
   500        ** part of the log.  */
   501        assert( pLog->aRegion[1].iEnd==0 );
   502        assert( pLog->aRegion[2].iStart<p->iRegion1End );
   503        pLog->aRegion[1].iStart = pLog->aRegion[2].iStart;
   504        pLog->aRegion[1].iEnd = p->iRegion1End;
   505        pLog->aRegion[2].iStart = p->iRegion2Start;
   506      }
   507    }
   508  }
   509  
   510  static int jumpIfRequired(
   511    lsm_db *pDb,
   512    LogWriter *pLog,
   513    int nReq,
   514    int *pbJump
   515  ){
   516    /* Determine if it is necessary to add an LSM_LOG_JUMP to jump over the
   517    ** jump region before writing the LSM_LOG_WRITE or DELETE record. This
   518    ** is necessary if there is insufficient room between the current offset
   519    ** and the jump region to fit the new WRITE/DELETE record and the largest
   520    ** possible JUMP record with up to 7 bytes of padding (a total of 17 
   521    ** bytes).  */
   522    if( (pLog->jump.iStart > (pLog->iOff + pLog->buf.n))
   523     && (pLog->jump.iStart < (pLog->iOff + pLog->buf.n + (nReq + 17))) 
   524    ){
   525      int rc;                       /* Return code */
   526      i64 iJump;                    /* Offset to jump to */
   527      u8 aJump[10];                 /* Encoded jump record */
   528      int nJump;                    /* Valid bytes in aJump[] */
   529      int nPad;                     /* Bytes of padding required */
   530  
   531      /* Serialize the JUMP record */
   532      iJump = pLog->jump.iEnd+1;
   533      aJump[0] = LSM_LOG_JUMP;
   534      nJump = 1 + lsmVarintPut64(&aJump[1], iJump);
   535  
   536      /* Adding padding to the contents of the buffer so that it will be a 
   537      ** multiple of 8 bytes in size after the JUMP record is appended. This
   538      ** is not strictly required, it just makes the keeping the running 
   539      ** checksum up to date in this file a little simpler.  */
   540      nPad = (pLog->buf.n + nJump) % 8;
   541      if( nPad ){
   542        u8 aPad[7] = {0,0,0,0,0,0,0};
   543        nPad = 8-nPad;
   544        if( nPad==1 ){
   545          aPad[0] = LSM_LOG_PAD1;
   546        }else{
   547          aPad[0] = LSM_LOG_PAD2;
   548          aPad[1] = (u8)(nPad-2);
   549        }
   550        rc = lsmStringBinAppend(&pLog->buf, aPad, nPad);
   551        if( rc!=LSM_OK ) return rc;
   552      }
   553  
   554      /* Append the JUMP record to the buffer. Then flush the buffer to disk
   555      ** and update the checksums. The next write to the log file (assuming
   556      ** there is no transaction rollback) will be to offset iJump (just past
   557      ** the jump region).  */
   558      rc = lsmStringBinAppend(&pLog->buf, aJump, nJump);
   559      if( rc!=LSM_OK ) return rc;
   560      assert( (pLog->buf.n % 8)==0 );
   561      rc = lsmFsWriteLog(pDb->pFS, pLog->iOff, &pLog->buf);
   562      if( rc!=LSM_OK ) return rc;
   563      logUpdateCksum(pLog, pLog->buf.n);
   564      pLog->iRegion1End = (pLog->iOff + pLog->buf.n);
   565      pLog->iRegion2Start = iJump;
   566      pLog->iOff = iJump;
   567      pLog->iCksumBuf = pLog->buf.n = 0;
   568      if( pbJump ) *pbJump = 1;
   569    }
   570  
   571    return LSM_OK;
   572  }
   573  
   574  static int logCksumAndFlush(lsm_db *pDb){
   575    int rc;                         /* Return code */
   576    LogWriter *pLog = pDb->pLogWriter;
   577  
   578    /* Calculate the checksum value. Append it to the buffer. */
   579    logUpdateCksum(pLog, pLog->buf.n);
   580    lsmPutU32((u8 *)&pLog->buf.z[pLog->buf.n], pLog->cksum0);
   581    pLog->buf.n += 4;
   582    lsmPutU32((u8 *)&pLog->buf.z[pLog->buf.n], pLog->cksum1);
   583    pLog->buf.n += 4;
   584  
   585    /* Write the contents of the buffer to disk. */
   586    rc = lsmFsWriteLog(pDb->pFS, pLog->iOff, &pLog->buf);
   587    pLog->iOff += pLog->buf.n;
   588    pLog->iCksumBuf = pLog->buf.n = 0;
   589  
   590    return rc;
   591  }
   592  
   593  /*
   594  ** Write the contents of the log-buffer to disk. Then write either a CKSUM
   595  ** or COMMIT record, depending on the value of parameter eType.
   596  */
   597  static int logFlush(lsm_db *pDb, int eType){
   598    int rc;
   599    int nReq;
   600    LogWriter *pLog = pDb->pLogWriter;
   601    
   602    assert( eType==LSM_LOG_COMMIT );
   603    assert( pLog );
   604  
   605    /* Commit record is always 9 bytes in size. */
   606    nReq = 9;
   607    if( eType==LSM_LOG_COMMIT && pLog->szSector>1 ) nReq += pLog->szSector + 17;
   608    rc = jumpIfRequired(pDb, pLog, nReq, 0);
   609  
   610    /* If this is a COMMIT, add padding to the log so that the COMMIT record
   611    ** is aligned against the end of a disk sector. In other words, add padding
   612    ** so that the first byte following the COMMIT record lies on a different
   613    ** sector.  */
   614    if( eType==LSM_LOG_COMMIT && pLog->szSector>1 ){
   615      int nPad;                     /* Bytes of padding to add */
   616  
   617      /* Determine the value of nPad. */
   618      nPad = ((pLog->iOff + pLog->buf.n + 9) % pLog->szSector);
   619      if( nPad ) nPad = pLog->szSector - nPad;
   620      rc = lsmStringExtend(&pLog->buf, nPad);
   621      if( rc!=LSM_OK ) return rc;
   622  
   623      while( nPad ){
   624        if( nPad==1 ){
   625          pLog->buf.z[pLog->buf.n++] = LSM_LOG_PAD1;
   626          nPad = 0;
   627        }else{
   628          int n = LSM_MIN(200, nPad-2);
   629          pLog->buf.z[pLog->buf.n++] = LSM_LOG_PAD2;
   630          pLog->buf.z[pLog->buf.n++] = (char)n;
   631          nPad -= 2;
   632          memset(&pLog->buf.z[pLog->buf.n], 0x2B, n);
   633          pLog->buf.n += n;
   634          nPad -= n;
   635        }
   636      }
   637    }
   638  
   639    /* Make sure there is room in the log-buffer to add the CKSUM or COMMIT
   640    ** record. Then add the first byte of it.  */
   641    rc = lsmStringExtend(&pLog->buf, 9);
   642    if( rc!=LSM_OK ) return rc;
   643    pLog->buf.z[pLog->buf.n++] = (char)eType;
   644    memset(&pLog->buf.z[pLog->buf.n], 0, 8);
   645  
   646    rc = logCksumAndFlush(pDb);
   647  
   648    /* If this is a commit and synchronous=full, sync the log to disk. */
   649    if( rc==LSM_OK && eType==LSM_LOG_COMMIT && pDb->eSafety==LSM_SAFETY_FULL ){
   650      rc = lsmFsSyncLog(pDb->pFS);
   651    }
   652    return rc;
   653  }
   654  
   655  /*
   656  ** Append an LSM_LOG_WRITE (if nVal>=0) or LSM_LOG_DELETE (if nVal<0) 
   657  ** record to the database log.
   658  */
   659  int lsmLogWrite(
   660    lsm_db *pDb,                    /* Database handle */
   661    int eType,
   662    void *pKey, int nKey,           /* Database key to write to log */
   663    void *pVal, int nVal            /* Database value (or nVal<0) to write */
   664  ){
   665    int rc = LSM_OK;
   666    LogWriter *pLog;                /* Log object to write to */
   667    int nReq;                       /* Bytes of space required in log */
   668    int bCksum = 0;                 /* True to embed a checksum in this record */
   669  
   670    assert( eType==LSM_WRITE || eType==LSM_DELETE || eType==LSM_DRANGE );
   671    assert( LSM_LOG_WRITE==LSM_WRITE );
   672    assert( LSM_LOG_DELETE==LSM_DELETE );
   673    assert( LSM_LOG_DRANGE==LSM_DRANGE );
   674    assert( (eType==LSM_LOG_DELETE)==(nVal<0) );
   675  
   676    if( pDb->bUseLog==0 ) return LSM_OK;
   677    pLog = pDb->pLogWriter;
   678  
   679    /* Determine how many bytes of space are required, assuming that a checksum
   680    ** will be embedded in this record (even though it may not be).  */
   681    nReq = 1 + lsmVarintLen32(nKey) + 8 + nKey;
   682    if( eType!=LSM_LOG_DELETE ) nReq += lsmVarintLen32(nVal) + nVal;
   683  
   684    /* Jump over the jump region if required. Set bCksum to true to tell the
   685    ** code below to include a checksum in the record if either (a) writing
   686    ** this record would mean that more than LSM_CKSUM_MAXDATA bytes of data
   687    ** have been written to the log since the last checksum, or (b) the jump
   688    ** is taken.  */
   689    rc = jumpIfRequired(pDb, pLog, nReq, &bCksum);
   690    if( (pLog->buf.n+nReq) > LSM_CKSUM_MAXDATA ) bCksum = 1;
   691  
   692    if( rc==LSM_OK ){
   693      rc = lsmStringExtend(&pLog->buf, nReq);
   694    }
   695    if( rc==LSM_OK ){
   696      u8 *a = (u8 *)&pLog->buf.z[pLog->buf.n];
   697      
   698      /* Write the record header - the type byte followed by either 1 (for
   699      ** DELETE) or 2 (for WRITE) varints.  */
   700      assert( LSM_LOG_WRITE_CKSUM == (LSM_LOG_WRITE | 0x0001) );
   701      assert( LSM_LOG_DELETE_CKSUM == (LSM_LOG_DELETE | 0x0001) );
   702      assert( LSM_LOG_DRANGE_CKSUM == (LSM_LOG_DRANGE | 0x0001) );
   703      *(a++) = (u8)eType | (u8)bCksum;
   704      a += lsmVarintPut32(a, nKey);
   705      if( eType!=LSM_LOG_DELETE ) a += lsmVarintPut32(a, nVal);
   706  
   707      if( bCksum ){
   708        pLog->buf.n = (a - (u8 *)pLog->buf.z);
   709        rc = logCksumAndFlush(pDb);
   710        a = (u8 *)&pLog->buf.z[pLog->buf.n];
   711      }
   712  
   713      memcpy(a, pKey, nKey);
   714      a += nKey;
   715      if( eType!=LSM_LOG_DELETE ){
   716        memcpy(a, pVal, nVal);
   717        a += nVal;
   718      }
   719      pLog->buf.n = a - (u8 *)pLog->buf.z;
   720      assert( pLog->buf.n<=pLog->buf.nAlloc );
   721    }
   722  
   723    return rc;
   724  }
   725  
   726  /*
   727  ** Append an LSM_LOG_COMMIT record to the database log.
   728  */
   729  int lsmLogCommit(lsm_db *pDb){
   730    if( pDb->bUseLog==0 ) return LSM_OK;
   731    return logFlush(pDb, LSM_LOG_COMMIT);
   732  }
   733  
   734  /*
   735  ** Store the current offset and other checksum related information in the
   736  ** structure *pMark. Later, *pMark can be passed to lsmLogSeek() to "rewind"
   737  ** the LogWriter object to the current log file offset. This is used when
   738  ** rolling back savepoint transactions.
   739  */
   740  void lsmLogTell(
   741    lsm_db *pDb,                    /* Database handle */
   742    LogMark *pMark                  /* Populate this object with current offset */
   743  ){
   744    LogWriter *pLog;
   745    int nCksum;
   746  
   747    if( pDb->bUseLog==0 ) return;
   748    pLog = pDb->pLogWriter;
   749    nCksum = pLog->buf.n & 0xFFFFFFF8;
   750    logUpdateCksum(pLog, nCksum);
   751    assert( pLog->iCksumBuf==nCksum );
   752    pMark->nBuf = pLog->buf.n - nCksum;
   753    memcpy(pMark->aBuf, &pLog->buf.z[nCksum], pMark->nBuf);
   754  
   755    pMark->iOff = pLog->iOff + pLog->buf.n;
   756    pMark->cksum0 = pLog->cksum0;
   757    pMark->cksum1 = pLog->cksum1;
   758  }
   759  
   760  /*
   761  ** Seek (rewind) back to the log file offset stored by an ealier call to
   762  ** lsmLogTell() in *pMark.
   763  */
   764  void lsmLogSeek(
   765    lsm_db *pDb,                    /* Database handle */
   766    LogMark *pMark                  /* Object containing log offset to seek to */
   767  ){
   768    LogWriter *pLog;
   769  
   770    if( pDb->bUseLog==0 ) return;
   771    pLog = pDb->pLogWriter;
   772  
   773    assert( pMark->iOff<=pLog->iOff+pLog->buf.n );
   774    if( (pMark->iOff & 0xFFFFFFF8)>=pLog->iOff ){
   775      pLog->buf.n = (int)(pMark->iOff - pLog->iOff);
   776      pLog->iCksumBuf = (pLog->buf.n & 0xFFFFFFF8);
   777    }else{
   778      pLog->buf.n = pMark->nBuf;
   779      memcpy(pLog->buf.z, pMark->aBuf, pMark->nBuf);
   780      pLog->iCksumBuf = 0;
   781      pLog->iOff = pMark->iOff - pMark->nBuf;
   782    }
   783    pLog->cksum0 = pMark->cksum0;
   784    pLog->cksum1 = pMark->cksum1;
   785  
   786    if( pMark->iOff > pLog->iRegion1End ) pLog->iRegion1End = 0;
   787    if( pMark->iOff > pLog->iRegion2Start ) pLog->iRegion2Start = 0;
   788  }
   789  
   790  /*
   791  ** This function does the work for an lsm_info(LOG_STRUCTURE) request.
   792  */
   793  int lsmInfoLogStructure(lsm_db *pDb, char **pzVal){
   794    int rc = LSM_OK;
   795    char *zVal = 0;
   796  
   797    /* If there is no read or write transaction open, read the latest 
   798    ** tree-header from shared-memory to report on. If necessary, update
   799    ** it based on the contents of the database header.  
   800    **
   801    ** No locks are taken here - these are passive read operations only.
   802    */
   803    if( pDb->pCsr==0 && pDb->nTransOpen==0 ){
   804      rc = lsmTreeLoadHeader(pDb, 0);
   805      if( rc==LSM_OK ) rc = logReclaimSpace(pDb);
   806    }
   807  
   808    if( rc==LSM_OK ){
   809      DbLog *pLog = &pDb->treehdr.log;
   810      zVal = lsmMallocPrintf(pDb->pEnv, 
   811          "%d %d %d %d %d %d", 
   812          (int)pLog->aRegion[0].iStart, (int)pLog->aRegion[0].iEnd,
   813          (int)pLog->aRegion[1].iStart, (int)pLog->aRegion[1].iEnd,
   814          (int)pLog->aRegion[2].iStart, (int)pLog->aRegion[2].iEnd
   815      );
   816      if( !zVal ) rc = LSM_NOMEM_BKPT;
   817    }
   818  
   819    *pzVal = zVal;
   820    return rc;
   821  }
   822  
   823  /*************************************************************************
   824  ** Begin code for log recovery.
   825  */
   826  
/*
** State for reading the log file sequentially during recovery.
**
** Log content is read from the file into LogReader.buf in chunks by
** logReaderBlob(). Field iBuf is the offset within buf of the next byte
** to be returned to the caller. Bytes of buf before offset iCksumBuf
** are already reflected in cksum0/cksum1; bytes between iCksumBuf and
** iBuf have been consumed but are not yet part of the running checksum.
*/
typedef struct LogReader LogReader;
struct LogReader {
  FileSystem *pFS;                /* File system to read from */
  i64 iOff;                       /* File offset at end of buf content */
  int iBuf;                       /* Current read offset in buf */
  LsmString buf;                  /* Buffer containing file content */

  int iCksumBuf;                  /* Offset in buf corresponding to cksum[01] */
  u32 cksum0;                     /* Checksum 0 at offset iCksumBuf */
  u32 cksum1;                     /* Checksum 1 at offset iCksumBuf */
};
   838  
   839  static void logReaderBlob(
   840    LogReader *p,                   /* Log reader object */
   841    LsmString *pBuf,                /* Dynamic storage, if required */
   842    int nBlob,                      /* Number of bytes to read */
   843    u8 **ppBlob,                    /* OUT: Pointer to blob read */
   844    int *pRc                        /* IN/OUT: Error code */
   845  ){
   846    static const int LOG_READ_SIZE = 512;
   847    int rc = *pRc;                  /* Return code */
   848    int nReq = nBlob;               /* Bytes required */
   849  
   850    while( rc==LSM_OK && nReq>0 ){
   851      int nAvail;                   /* Bytes of data available in p->buf */
   852      if( p->buf.n==p->iBuf ){
   853        int nCksum;                 /* Total bytes requiring checksum */
   854        int nCarry = 0;             /* Total bytes requiring checksum */
   855  
   856        nCksum = p->iBuf - p->iCksumBuf;
   857        if( nCksum>0 ){
   858          nCarry = nCksum % 8;
   859          nCksum = ((nCksum / 8) * 8);
   860          if( nCksum>0 ){
   861            logCksumUnaligned(
   862                &p->buf.z[p->iCksumBuf], nCksum, &p->cksum0, &p->cksum1
   863            );
   864          }
   865        }
   866        if( nCarry>0 ) memcpy(p->buf.z, &p->buf.z[p->iBuf-nCarry], nCarry);
   867        p->buf.n = nCarry;
   868        p->iBuf = nCarry;
   869  
   870        rc = lsmFsReadLog(p->pFS, p->iOff, LOG_READ_SIZE, &p->buf);
   871        if( rc!=LSM_OK ) break;
   872        p->iCksumBuf = 0;
   873        p->iOff += LOG_READ_SIZE;
   874      }
   875  
   876      nAvail = p->buf.n - p->iBuf;
   877      if( ppBlob && nReq==nBlob && nBlob<=nAvail ){
   878        *ppBlob = (u8 *)&p->buf.z[p->iBuf];
   879        p->iBuf += nBlob;
   880        nReq = 0;
   881      }else{
   882        int nCopy = LSM_MIN(nAvail, nReq);
   883        if( nBlob==nReq ){
   884          pBuf->n = 0;
   885        }
   886        rc = lsmStringBinAppend(pBuf, (u8 *)&p->buf.z[p->iBuf], nCopy);
   887        nReq -= nCopy;
   888        p->iBuf += nCopy;
   889        if( nReq==0 && ppBlob ){
   890          *ppBlob = (u8*)pBuf->z;
   891        }
   892      }
   893    }
   894  
   895    *pRc = rc;
   896  }
   897  
   898  static void logReaderVarint(
   899    LogReader *p, 
   900    LsmString *pBuf,
   901    int *piVal,                     /* OUT: Value read from log */
   902    int *pRc                        /* IN/OUT: Error code */
   903  ){
   904    if( *pRc==LSM_OK ){
   905      u8 *aVarint;
   906      if( p->buf.n==p->iBuf ){
   907        logReaderBlob(p, 0, 10, &aVarint, pRc);
   908        if( LSM_OK==*pRc ) p->iBuf -= (10 - lsmVarintGet32(aVarint, piVal));
   909      }else{
   910        logReaderBlob(p, pBuf, lsmVarintSize(p->buf.z[p->iBuf]), &aVarint, pRc);
   911        if( LSM_OK==*pRc ) lsmVarintGet32(aVarint, piVal);
   912      }
   913    }
   914  }
   915  
   916  static void logReaderByte(LogReader *p, u8 *pByte, int *pRc){
   917    u8 *pPtr = 0;
   918    logReaderBlob(p, 0, 1, &pPtr, pRc);
   919    if( pPtr ) *pByte = *pPtr;
   920  }
   921  
   922  static void logReaderCksum(LogReader *p, LsmString *pBuf, int *pbEof, int *pRc){
   923    if( *pRc==LSM_OK ){
   924      u8 *pPtr = 0;
   925      u32 cksum0, cksum1;
   926      int nCksum = p->iBuf - p->iCksumBuf;
   927  
   928      /* Update in-memory (expected) checksums */
   929      assert( nCksum>=0 );
   930      logCksumUnaligned(&p->buf.z[p->iCksumBuf], nCksum, &p->cksum0, &p->cksum1);
   931      p->iCksumBuf = p->iBuf + 8;
   932      logReaderBlob(p, pBuf, 8, &pPtr, pRc);
   933      assert( pPtr || *pRc );
   934  
   935      /* Read the checksums from the log file. Set *pbEof if they do not match. */
   936      if( pPtr ){
   937        cksum0 = lsmGetU32(pPtr);
   938        cksum1 = lsmGetU32(&pPtr[4]);
   939        *pbEof = (cksum0!=p->cksum0 || cksum1!=p->cksum1);
   940        p->iCksumBuf = p->iBuf;
   941      }
   942    }
   943  }
   944  
   945  static void logReaderInit(
   946    lsm_db *pDb,                    /* Database handle */
   947    DbLog *pLog,                    /* Log object associated with pDb */
   948    int bInitBuf,                   /* True if p->buf is uninitialized */
   949    LogReader *p                    /* Initialize this LogReader object */
   950  ){
   951    p->pFS = pDb->pFS;
   952    p->iOff = pLog->aRegion[2].iStart;
   953    p->cksum0 = pLog->cksum0;
   954    p->cksum1 = pLog->cksum1;
   955    if( bInitBuf ){ lsmStringInit(&p->buf, pDb->pEnv); }
   956    p->buf.n = 0;
   957    p->iCksumBuf = 0;
   958    p->iBuf = 0;
   959  }
   960  
   961  /*
   962  ** This function is called after reading the header of a LOG_DELETE or
   963  ** LOG_WRITE record. Parameter nByte is the total size of the key and
   964  ** value that follow the header just read. Return true if the size and
   965  ** position of the record indicate that it should contain a checksum.
   966  */
   967  static int logRequireCksum(LogReader *p, int nByte){
   968    return ((p->iBuf + nByte - p->iCksumBuf) > LSM_CKSUM_MAXDATA);
   969  }
   970  
   971  /*
   972  ** Recover the contents of the log file.
   973  */
   974  int lsmLogRecover(lsm_db *pDb){
   975    LsmString buf1;                 /* Key buffer */
   976    LsmString buf2;                 /* Value buffer */
   977    LogReader reader;               /* Log reader object */
   978    int rc = LSM_OK;                /* Return code */
   979    int nCommit = 0;                /* Number of transactions to recover */
   980    int iPass;
   981    int nJump = 0;                  /* Number of LSM_LOG_JUMP records in pass 0 */
   982    DbLog *pLog;
   983    int bOpen;
   984  
   985    rc = lsmFsOpenLog(pDb, &bOpen);
   986    if( rc!=LSM_OK ) return rc;
   987  
   988    rc = lsmTreeInit(pDb);
   989    if( rc!=LSM_OK ) return rc;
   990  
   991    pLog = &pDb->treehdr.log;
   992    lsmCheckpointLogoffset(pDb->pShmhdr->aSnap2, pLog);
   993  
   994    logReaderInit(pDb, pLog, 1, &reader);
   995    lsmStringInit(&buf1, pDb->pEnv);
   996    lsmStringInit(&buf2, pDb->pEnv);
   997  
   998    /* The outer for() loop runs at most twice. The first iteration is to 
   999    ** count the number of committed transactions in the log. The second 
  1000    ** iterates through those transactions and updates the in-memory tree 
  1001    ** structure with their contents.  */
  1002    if( bOpen ){
  1003      for(iPass=0; iPass<2 && rc==LSM_OK; iPass++){
  1004        int bEof = 0;
  1005  
  1006        while( rc==LSM_OK && !bEof ){
  1007          u8 eType = 0;
  1008          logReaderByte(&reader, &eType, &rc);
  1009  
  1010          switch( eType ){
  1011            case LSM_LOG_PAD1:
  1012              break;
  1013  
  1014            case LSM_LOG_PAD2: {
  1015              int nPad;
  1016              logReaderVarint(&reader, &buf1, &nPad, &rc);
  1017              logReaderBlob(&reader, &buf1, nPad, 0, &rc);
  1018              break;
  1019            }
  1020  
  1021            case LSM_LOG_DRANGE:
  1022            case LSM_LOG_DRANGE_CKSUM:
  1023            case LSM_LOG_WRITE:
  1024            case LSM_LOG_WRITE_CKSUM: {
  1025              int nKey;
  1026              int nVal;
  1027              u8 *aVal;
  1028              logReaderVarint(&reader, &buf1, &nKey, &rc);
  1029              logReaderVarint(&reader, &buf2, &nVal, &rc);
  1030  
  1031              if( eType==LSM_LOG_WRITE_CKSUM || eType==LSM_LOG_DRANGE_CKSUM ){
  1032                logReaderCksum(&reader, &buf1, &bEof, &rc);
  1033              }else{
  1034                bEof = logRequireCksum(&reader, nKey+nVal);
  1035              }
  1036              if( bEof ) break;
  1037  
  1038              logReaderBlob(&reader, &buf1, nKey, 0, &rc);
  1039              logReaderBlob(&reader, &buf2, nVal, &aVal, &rc);
  1040              if( iPass==1 && rc==LSM_OK ){ 
  1041                if( eType==LSM_LOG_WRITE || eType==LSM_LOG_WRITE_CKSUM ){
  1042                  rc = lsmTreeInsert(pDb, (u8 *)buf1.z, nKey, aVal, nVal);
  1043                }else{
  1044                  rc = lsmTreeDelete(pDb, (u8 *)buf1.z, nKey, aVal, nVal);
  1045                }
  1046              }
  1047              break;
  1048            }
  1049  
  1050            case LSM_LOG_DELETE:
  1051            case LSM_LOG_DELETE_CKSUM: {
  1052              int nKey; u8 *aKey;
  1053              logReaderVarint(&reader, &buf1, &nKey, &rc);
  1054  
  1055              if( eType==LSM_LOG_DELETE_CKSUM ){
  1056                logReaderCksum(&reader, &buf1, &bEof, &rc);
  1057              }else{
  1058                bEof = logRequireCksum(&reader, nKey);
  1059              }
  1060              if( bEof ) break;
  1061  
  1062              logReaderBlob(&reader, &buf1, nKey, &aKey, &rc);
  1063              if( iPass==1 && rc==LSM_OK ){ 
  1064                rc = lsmTreeInsert(pDb, aKey, nKey, NULL, -1);
  1065              }
  1066              break;
  1067            }
  1068  
  1069            case LSM_LOG_COMMIT:
  1070              logReaderCksum(&reader, &buf1, &bEof, &rc);
  1071              if( bEof==0 ){
  1072                nCommit++;
  1073                assert( nCommit>0 || iPass==1 );
  1074                if( nCommit==0 ) bEof = 1;
  1075              }
  1076              break;
  1077  
  1078            case LSM_LOG_JUMP: {
  1079              int iOff = 0;
  1080              logReaderVarint(&reader, &buf1, &iOff, &rc);
  1081              if( rc==LSM_OK ){
  1082                if( iPass==1 ){
  1083                  if( pLog->aRegion[2].iStart==0 ){
  1084                    assert( pLog->aRegion[1].iStart==0 );
  1085                    pLog->aRegion[1].iEnd = reader.iOff;
  1086                  }else{
  1087                    assert( pLog->aRegion[0].iStart==0 );
  1088                    pLog->aRegion[0].iStart = pLog->aRegion[2].iStart;
  1089                    pLog->aRegion[0].iEnd = reader.iOff-reader.buf.n+reader.iBuf;
  1090                  }
  1091                  pLog->aRegion[2].iStart = iOff;
  1092                }else{
  1093                  if( (nJump++)==2 ){
  1094                    bEof = 1;
  1095                  }
  1096                }
  1097  
  1098                reader.iOff = iOff;
  1099                reader.buf.n = reader.iBuf;
  1100              }
  1101              break;
  1102            }
  1103  
  1104            default:
  1105              /* Including LSM_LOG_EOF */
  1106              bEof = 1;
  1107              break;
  1108          }
  1109        }
  1110  
  1111        if( rc==LSM_OK && iPass==0 ){
  1112          if( nCommit==0 ){
  1113            if( pLog->aRegion[2].iStart==0 ){
  1114              iPass = 1;
  1115            }else{
  1116              pLog->aRegion[2].iStart = 0;
  1117              iPass = -1;
  1118              lsmCheckpointZeroLogoffset(pDb);
  1119            }
  1120          }
  1121          logReaderInit(pDb, pLog, 0, &reader);
  1122          nCommit = nCommit * -1;
  1123        }
  1124      }
  1125    }
  1126  
  1127    /* Initialize DbLog object */
  1128    if( rc==LSM_OK ){
  1129      pLog->aRegion[2].iEnd = reader.iOff - reader.buf.n + reader.iBuf;
  1130      pLog->cksum0 = reader.cksum0;
  1131      pLog->cksum1 = reader.cksum1;
  1132    }
  1133  
  1134    if( rc==LSM_OK ){
  1135      rc = lsmFinishRecovery(pDb);
  1136    }else{
  1137      lsmFinishRecovery(pDb);
  1138    }
  1139  
  1140    if( pDb->bRoTrans ){
  1141      lsmFsCloseLog(pDb);
  1142    }
  1143  
  1144    lsmStringClear(&buf1);
  1145    lsmStringClear(&buf2);
  1146    lsmStringClear(&reader.buf);
  1147    return rc;
  1148  }
  1149  
  1150  void lsmLogClose(lsm_db *db){
  1151    if( db->pLogWriter ){
  1152      lsmFree(db->pEnv, db->pLogWriter->buf.z);
  1153      lsmFree(db->pEnv, db->pLogWriter);
  1154      db->pLogWriter = 0;
  1155    }
  1156  }