modernc.org/cc@v1.0.1/v2/testdata/_sqlite/ext/lsm1/lsm_file.c

modernc.org/cc@v1.0.1/v2/testdata/_sqlite/ext/lsm1/lsm_file.c (about)

     1  /*
     2  ** 2011-08-26
     3  **
     4  ** The author disclaims copyright to this source code.  In place of
     5  ** a legal notice, here is a blessing:
     6  **
     7  **    May you do good and not evil.
     8  **    May you find forgiveness for yourself and forgive others.
     9  **    May you share freely, never taking more than you give.
    10  **
    11  *************************************************************************
    12  ** 
    13  ** NORMAL DATABASE FILE FORMAT
    14  **
    15  ** The following database file format concepts are used by the code in
    16  ** this file to read and write the database file.
    17  **
    18  ** Pages:
    19  **
    20  **   A database file is divided into pages. The first 8KB of the file consists
    21  **   of two 4KB meta-pages. The meta-page size is not configurable. The 
    22  **   remainder of the file is made up of database pages. The default database
    23  **   page size is 4KB. Database pages are aligned to page-size boundaries,
    24  **   so if the database page size is larger than 8KB there is a gap between
    25  **   the end of the meta pages and the start of the database pages.
    26  **
    27  **   Database pages are numbered based on their position in the file. Page N
    28  **   begins at byte offset ((N-1)*pgsz). This means that page 1 does not 
    29  **   exist - since it would always overlap with the meta pages. If the 
    30  **   page-size is (say) 512 bytes, then the first usable page in the database
    31  **   is page 33.
    32  **
    33  **   It is assumed that the first two meta pages and the data that follows
     34  **   them are located on different disk sectors, so that if a power failure
     35  **   occurs while writing to a meta page there is no risk of damage to the other
    36  **   meta page or any other part of the database file. TODO: This may need
    37  **   to be revisited.
    38  **
    39  ** Blocks:
    40  **
    41  **   The database file is also divided into blocks. The default block size is
    42  **   1MB. When writing to the database file, an attempt is made to write data
    43  **   in contiguous block-sized chunks.
    44  **
    45  **   The first and last page on each block are special in that they are 4 
    46  **   bytes smaller than all other pages. This is because the last four bytes 
    47  **   of space on the first and last pages of each block are reserved for
    48  **   pointers to other blocks (i.e. a 32-bit block number).
    49  **
    50  ** Runs:
    51  **
    52  **   A run is a sequence of pages that the upper layer uses to store a 
    53  **   sorted array of database keys (and accompanying data - values, FC 
    54  **   pointers and so on). Given a page within a run, it is possible to
    55  **   navigate to the next page in the run as follows:
    56  **
    57  **     a) if the current page is not the last in a block, the next page 
    58  **        in the run is located immediately after the current page, OR
    59  **
    60  **     b) if the current page is the last page in a block, the next page 
    61  **        in the run is the first page on the block identified by the
    62  **        block pointer stored in the last 4 bytes of the current block.
    63  **
    64  **   It is possible to navigate to the previous page in a similar fashion,
    65  **   using the block pointer embedded in the last 4 bytes of the first page
    66  **   of each block as required.
    67  **
    68  **   The upper layer is responsible for identifying by page number the 
    69  **   first and last page of any run that it needs to navigate - there are
    70  **   no "end-of-run" markers stored or identified by this layer. This is
    71  **   necessary as clients reading different database snapshots may access 
    72  **   different subsets of a run.
    73  **
    74  ** THE LOG FILE 
    75  **
    76  ** This file opens and closes the log file. But it does not contain any
    77  ** logic related to the log file format. Instead, it exports the following
    78  ** functions that are used by the code in lsm_log.c to read and write the
    79  ** log file:
    80  **
    81  **     lsmFsOpenLog
    82  **     lsmFsWriteLog
    83  **     lsmFsSyncLog
    84  **     lsmFsReadLog
    85  **     lsmFsTruncateLog
    86  **     lsmFsCloseAndDeleteLog
    87  **
    88  ** COMPRESSED DATABASE FILE FORMAT
    89  **
    90  ** The compressed database file format is very similar to the normal format.
    91  ** The file still begins with two 4KB meta-pages (which are never compressed).
    92  ** It is still divided into blocks.
    93  **
    94  ** The first and last four bytes of each block are reserved for 32-bit 
    95  ** pointer values. Similar to the way four bytes are carved from the end of 
    96  ** the first and last page of each block in uncompressed databases. From
    97  ** the point of view of the upper layer, all pages are the same size - this
    98  ** is different from the uncompressed format where the first and last pages
    99  ** on each block are 4 bytes smaller than the others.
   100  **
   101  ** Pages are stored in variable length compressed form, as follows:
   102  **
   103  **     * 3-byte size field containing the size of the compressed page image
   104  **       in bytes. The most significant bit of each byte of the size field
   105  **       is always set. The remaining 7 bits are used to store a 21-bit
   106  **       integer value (in big-endian order - the first byte in the field
   107  **       contains the most significant 7 bits). Since the maximum allowed 
   108  **       size of a compressed page image is (2^17 - 1) bytes, there are
   109  **       actually 4 unused bits in the size field.
   110  **
   111  **       In other words, if the size of the compressed page image is nSz,
   112  **       the header can be serialized as follows:
   113  **
   114  **         u8 aHdr[3]
   115  **         aHdr[0] = 0x80 | (u8)(nSz >> 14);
   116  **         aHdr[1] = 0x80 | (u8)(nSz >>  7);
   117  **         aHdr[2] = 0x80 | (u8)(nSz >>  0);
   118  **
   119  **     * Compressed page image.
   120  **
   121  **     * A second copy of the 3-byte record header.
   122  **
   123  ** A page number is a byte offset into the database file. So the smallest
   124  ** possible page number is 8192 (immediately after the two meta-pages).
   125  ** The first and root page of a segment are identified by a page number
   126  ** corresponding to the byte offset of the first byte in the corresponding
   127  ** page record. The last page of a segment is identified by the byte offset
   128  ** of the last byte in its record.
   129  **
   130  ** Unlike uncompressed pages, compressed page records may span blocks.
   131  **
   132  ** Sometimes, in order to avoid touching sectors that contain synced data
   133  ** when writing, it is necessary to insert unused space between compressed
   134  ** page records. This can be done as follows:
   135  **
   136  **     * For less than 6 bytes of empty space, the first and last byte
   137  **       of the free space contain the total number of free bytes. For
   138  **       example:
   139  **
   140  **         Block of 4 free bytes: 0x04 0x?? 0x?? 0x04
   141  **         Block of 2 free bytes: 0x02 0x02
   142  **         A single free byte:    0x01
   143  **
   144  **     * For 6 or more bytes of empty space, a record similar to a 
   145  **       compressed page record is added to the segment. A padding record
   146  **       is distinguished from a compressed page record by the most 
   147  **       significant bit of the second byte of the size field, which is
   148  **       cleared instead of set. 
   149  */
   150  #include "lsmInt.h"
   151  
   152  #include <sys/types.h>
   153  #include <sys/stat.h>
   154  #include <fcntl.h>
   155  
   156  /*
   157  ** File-system object. Each database connection allocates a single instance
   158  ** of the following structure. It is used for all access to the database and
   159  ** log files.
   160  **
   161  ** The database file may be accessed via two methods - using mmap() or using
   162  ** read() and write() calls. In the general case both methods are used - a
   163  ** prefix of the file is mapped into memory and the remainder accessed using
   164  ** read() and write(). This is helpful when accessing very large files (or
   165  ** files that may grow very large during the lifetime of a database
   166  ** connection) on systems with 32-bit address spaces. However, it also requires
   167  ** that this object manage two distinct types of Page objects simultaneously -
   168  ** those that carry pointers to the mapped file and those that carry arrays
   169  ** populated by read() calls.
   170  **
   171  ** pFree:
    172  **   The head of a singly-linked list that contains currently unused Page 
   173  **   structures suitable for use as mmap-page handles. Connected by the
   174  **   Page.pFreeNext pointers.
   175  **
   176  ** pMapped:
   177  **   The head of a singly-linked list that contains all pages that currently
   178  **   carry pointers to the mapped region. This is used if the region is
    179  **   ever remapped - the pointers carried by existing pages can be adjusted
   180  **   to account for the remapping. Connected by the Page.pMappedNext pointers.
   181  **
   182  ** pWaiting:
   183  **   When the upper layer wishes to append a new b-tree page to a segment,
   184  **   it allocates a Page object that carries a malloc'd block of memory -
   185  **   regardless of the mmap-related configuration. The page is not assigned
   186  **   a page number at first. When the upper layer has finished constructing
   187  **   the page contents, it calls lsmFsPagePersist() to assign a page number
   188  **   to it. At this point it is likely that N pages have been written to the
   189  **   segment, the (N+1)th page is still outstanding and the b-tree page is
   190  **   assigned page number (N+2). To avoid writing page (N+2) before page 
   191  **   (N+1), the recently completed b-tree page is held in the singly linked
   192  **   list headed by pWaiting until page (N+1) has been written. 
   193  **
   194  **   Function lsmFsFlushWaiting() is responsible for eventually writing 
   195  **   waiting pages to disk.
   196  **
   197  ** apHash/nHash:
   198  **   Hash table used to store all Page objects that carry malloc'd arrays,
   199  **   except those b-tree pages that have not yet been assigned page numbers.
   200  **   Once they have been assigned page numbers - they are added to this
   201  **   hash table.
   202  **
   203  **   Hash table overflow chains are connected using the Page.pHashNext
   204  **   pointers.
   205  **
   206  ** pLruFirst, pLruLast:
   207  **   The first and last entries in a doubly-linked list of pages. This
   208  **   list contains all pages with malloc'd data that are present in the
   209  **   hash table and have a ref-count of zero.
   210  */
struct FileSystem {
  lsm_db *pDb;                    /* Database handle that owns this object */
  lsm_env *pEnv;                  /* Environment pointer */
  char *zDb;                      /* Database file name */
  char *zLog;                     /* Log file name (zDb with "-log" appended) */
  int nMetasize;                  /* Size of meta pages in bytes */
  int nMetaRwSize;                /* Read/written size of meta pages in bytes */
  int nPagesize;                  /* Database page-size in bytes */
  int nBlocksize;                 /* Database block-size in bytes */

  /* r/w file descriptors for both files. */
  LsmFile *pLsmFile;              /* Used after lsm_close() to link into list */
  lsm_file *fdDb;                 /* Database file */
  lsm_file *fdLog;                /* Log file (0 while the log is not open) */
  int szSector;                   /* Database file sector size */

  /* If this is a compressed database, a pointer to the compression methods.
  ** For an uncompressed database, a NULL pointer.  */
  lsm_compress *pCompress;
  u8 *aIBuffer;                   /* Buffer to compress to */
  u8 *aOBuffer;                   /* Buffer to uncompress from */
  int nBuffer;                    /* Allocated size of above buffers in bytes */

  /* mmap() page related things */
  i64 nMapLimit;                  /* Maximum bytes of file to map */
  void *pMap;                     /* Current mapping of database file */
  i64 nMap;                       /* Bytes mapped at pMap */
  Page *pFree;                    /* Unused Page structures */
  Page *pMapped;                  /* List of Page structs that point to pMap */

  /* Page cache parameters for non-mmap() pages */
  int nCacheMax;                  /* Configured cache size (in pages) */
  int nCacheAlloc;                /* Current cache size (in pages) */
  Page *pLruFirst;                /* Head of the LRU list */
  Page *pLruLast;                 /* Tail of the LRU list */
  int nHash;                      /* Number of hash slots in hash table */
  Page **apHash;                  /* nHash Hash slots */
  Page *pWaiting;                 /* b-tree pages waiting to be written */

  /* Statistics */
  int nOut;                       /* Number of outstanding pages */
  int nWrite;                     /* Total number of pages written */
  int nRead;                      /* Total number of pages read */
};
   255  
   256  /*
   257  ** Database page handle.
   258  **
   259  ** pSeg:
   260  **   When lsmFsSortedAppend() is called on a compressed database, the new
   261  **   page is not assigned a page number or location in the database file
   262  **   immediately. Instead, these are assigned by the lsmFsPagePersist() call
   263  **   right before it writes the compressed page image to disk.
   264  **
   265  **   The lsmFsSortedAppend() function sets the pSeg pointer to point to the
   266  **   segment that the new page will be a part of. It is unset by
   267  **   lsmFsPagePersist() after the page is written to disk.
   268  */
struct Page {
  u8 *aData;                      /* Buffer containing page data */
  int nData;                      /* Bytes of usable data at aData[] */
  Pgno iPg;                       /* Page number */
  int nRef;                       /* Number of outstanding references */
  int flags;                      /* Combination of PAGE_XXX flags
                                  ** (PAGE_DIRTY, PAGE_FREE, PAGE_HASPREV) */
  Page *pHashNext;                /* Next page in hash table slot */
  Page *pLruNext;                 /* Next page in LRU list */
  Page *pLruPrev;                 /* Previous page in LRU list */
  FileSystem *pFS;                /* File system that owns this page */

  /* Only used in compressed database mode: */
  int nCompress;                  /* Compressed size (or 0 for uncomp. db) */
  int nCompressPrev;              /* Compressed size of prev page */
  Segment *pSeg;                  /* Segment this page will be written to */

  /* Pointers for singly linked lists */
  Page *pWaitingNext;             /* Next page in FileSystem.pWaiting list */
  Page *pFreeNext;                /* Next page in FileSystem.pFree list */
  Page *pMappedNext;              /* Next page in FileSystem.pMapped list */
};
   290  
   291  /*
   292  ** Meta-data page handle. There are two meta-data pages at the start of
   293  ** the database file, each FileSystem.nMetasize bytes in size.
   294  */
struct MetaPage {
  int iPg;                        /* Meta page number: either 1 or 2 */
  int bWrite;                     /* Write back to db file on release */
  u8 *aData;                      /* Pointer to buffer */
  FileSystem *pFS;                /* FileSystem that owns this page */
};
   301  
/* 
** Bit values that may be OR-ed together in Page.flags.
*/
#define PAGE_DIRTY   0x00000001   /* Set if page is dirty */
#define PAGE_FREE    0x00000002   /* Set if Page.aData requires lsmFree() */
#define PAGE_HASPREV 0x00000004   /* Set if page is first on uncomp. block */
   308  
/*
** Number of pgsz byte pages omitted from the start of block 1. The start
** of block 1 contains two 4096 byte meta pages (8192 bytes in total).
**
** The value is 8192/pgsz but never less than 1, so at least one page is
** reserved even when pgsz is larger than 8192 bytes (this assumes LSM_MAX,
** defined elsewhere, yields the larger of its two arguments).
*/
#define BLOCK1_HDR_SIZE(pgsz)  LSM_MAX(1, 8192/(pgsz))
   314  
/*
** If NDEBUG is not defined, set a breakpoint in function lsmIoerrBkpt()
** to catch IO errors: it is invoked for every return code other than
** LSM_OK that is passed through IOERR_WRAPPER().
**
** If NDEBUG is defined, IOERR_WRAPPER() is a macro that expands to its
** argument.
*/
#ifndef NDEBUG
/* Breakpoint target. Increments a static counter each time it is called. */
static void lsmIoerrBkpt(void){
  static int nErr = 0;
  nErr++;
}
/* Return rc unchanged, calling lsmIoerrBkpt() first if rc is an error. */
static int IOERR_WRAPPER(int rc){
  if( rc!=LSM_OK ) lsmIoerrBkpt();
  return rc;
}
#else
# define IOERR_WRAPPER(rc) (rc)
#endif
   331  
#ifdef NDEBUG
/* With NDEBUG defined the check expands to nothing. */
# define assert_lists_are_ok(x)
#else
static Page *fsPageFindInHash(FileSystem *pFS, Pgno iPg, int *piHash);

/*
** Consistency check for the LRU list of file-system object pFS.
**
** The checks themselves are currently excluded by the "#if 0" below, so
** this function does nothing even in builds without NDEBUG.
*/
static void assert_lists_are_ok(FileSystem *pFS){
#if 0
  Page *p;

  assert( pFS->nMapLimit>=0 );

  /* Check that all pages in the LRU list have nRef==0, pointers to buffers
  ** in heap memory, and corresponding entries in the hash table.  */
  for(p=pFS->pLruFirst; p; p=p->pLruNext){
    assert( p==pFS->pLruFirst || p->pLruPrev!=0 );
    assert( p==pFS->pLruLast || p->pLruNext!=0 );
    assert( p->pLruPrev==0 || p->pLruPrev->pLruNext==p );
    assert( p->pLruNext==0 || p->pLruNext->pLruPrev==p );
    assert( p->nRef==0 );
    assert( p->flags & PAGE_FREE );
    assert( p==fsPageFindInHash(pFS, p->iPg, 0) );
  }
#endif
}
#endif
   357  
   358  /*
   359  ** Wrappers around the VFS methods of the lsm_env object:
   360  **
   361  **     lsmEnvOpen()
   362  **     lsmEnvRead()
   363  **     lsmEnvWrite()
   364  **     lsmEnvSync()
   365  **     lsmEnvSectorSize()
   366  **     lsmEnvClose()
   367  **     lsmEnvTruncate()
   368  **     lsmEnvUnlink()
   369  **     lsmEnvRemap()
   370  */
   371  int lsmEnvOpen(lsm_env *pEnv, const char *zFile, int flags, lsm_file **ppNew){
   372    return pEnv->xOpen(pEnv, zFile, flags, ppNew);
   373  }
   374  
   375  static int lsmEnvRead(
   376    lsm_env *pEnv, 
   377    lsm_file *pFile, 
   378    lsm_i64 iOff, 
   379    void *pRead, 
   380    int nRead
   381  ){
   382    return IOERR_WRAPPER( pEnv->xRead(pFile, iOff, pRead, nRead) );
   383  }
   384  
   385  static int lsmEnvWrite(
   386    lsm_env *pEnv, 
   387    lsm_file *pFile, 
   388    lsm_i64 iOff, 
   389    const void *pWrite, 
   390    int nWrite
   391  ){
   392    return IOERR_WRAPPER( pEnv->xWrite(pFile, iOff, (void *)pWrite, nWrite) );
   393  }
   394  
   395  static int lsmEnvSync(lsm_env *pEnv, lsm_file *pFile){
   396    return IOERR_WRAPPER( pEnv->xSync(pFile) );
   397  }
   398  
   399  static int lsmEnvSectorSize(lsm_env *pEnv, lsm_file *pFile){
   400    return pEnv->xSectorSize(pFile);
   401  }
   402  
   403  int lsmEnvClose(lsm_env *pEnv, lsm_file *pFile){
   404    return IOERR_WRAPPER( pEnv->xClose(pFile) );
   405  }
   406  
   407  static int lsmEnvTruncate(lsm_env *pEnv, lsm_file *pFile, lsm_i64 nByte){
   408    return IOERR_WRAPPER( pEnv->xTruncate(pFile, nByte) );
   409  }
   410  
   411  static int lsmEnvUnlink(lsm_env *pEnv, const char *zDel){
   412    return IOERR_WRAPPER( pEnv->xUnlink(pEnv, zDel) );
   413  }
   414  
   415  static int lsmEnvRemap(
   416    lsm_env *pEnv, 
   417    lsm_file *pFile, 
   418    i64 szMin,
   419    void **ppMap,
   420    i64 *pszMap
   421  ){
   422    return pEnv->xRemap(pFile, szMin, ppMap, pszMap);
   423  }
   424  
   425  int lsmEnvLock(lsm_env *pEnv, lsm_file *pFile, int iLock, int eLock){
   426    if( pFile==0 ) return LSM_OK;
   427    return pEnv->xLock(pFile, iLock, eLock);
   428  }
   429  
   430  int lsmEnvTestLock(
   431    lsm_env *pEnv, 
   432    lsm_file *pFile, 
   433    int iLock, 
   434    int nLock, 
   435    int eLock
   436  ){
   437    return pEnv->xTestLock(pFile, iLock, nLock, eLock);
   438  }
   439  
   440  int lsmEnvShmMap(
   441    lsm_env *pEnv, 
   442    lsm_file *pFile, 
   443    int iChunk, 
   444    int sz, 
   445    void **ppOut
   446  ){
   447    return pEnv->xShmMap(pFile, iChunk, sz, ppOut);
   448  }
   449  
   450  void lsmEnvShmBarrier(lsm_env *pEnv){
   451    pEnv->xShmBarrier();
   452  }
   453  
   454  void lsmEnvShmUnmap(lsm_env *pEnv, lsm_file *pFile, int bDel){
   455    pEnv->xShmUnmap(pFile, bDel);
   456  }
   457  
   458  void lsmEnvSleep(lsm_env *pEnv, int nUs){
   459    pEnv->xSleep(pEnv, nUs);
   460  }
   461  
   462  
   463  /*
   464  ** Write the contents of string buffer pStr into the log file, starting at
   465  ** offset iOff.
   466  */
   467  int lsmFsWriteLog(FileSystem *pFS, i64 iOff, LsmString *pStr){
   468    assert( pFS->fdLog );
   469    return lsmEnvWrite(pFS->pEnv, pFS->fdLog, iOff, pStr->z, pStr->n);
   470  }
   471  
   472  /*
   473  ** fsync() the log file.
   474  */
   475  int lsmFsSyncLog(FileSystem *pFS){
   476    assert( pFS->fdLog );
   477    return lsmEnvSync(pFS->pEnv, pFS->fdLog);
   478  }
   479  
   480  /*
   481  ** Read nRead bytes of data starting at offset iOff of the log file. Append
   482  ** the results to string buffer pStr.
   483  */
   484  int lsmFsReadLog(FileSystem *pFS, i64 iOff, int nRead, LsmString *pStr){
   485    int rc;                         /* Return code */
   486    assert( pFS->fdLog );
   487    rc = lsmStringExtend(pStr, nRead);
   488    if( rc==LSM_OK ){
   489      rc = lsmEnvRead(pFS->pEnv, pFS->fdLog, iOff, &pStr->z[pStr->n], nRead);
   490      pStr->n += nRead;
   491    }
   492    return rc;
   493  }
   494  
   495  /*
   496  ** Truncate the log file to nByte bytes in size.
   497  */
   498  int lsmFsTruncateLog(FileSystem *pFS, i64 nByte){
   499    if( pFS->fdLog==0 ) return LSM_OK;
   500    return lsmEnvTruncate(pFS->pEnv, pFS->fdLog, nByte);
   501  }
   502  
   503  /*
   504  ** Truncate the db file to nByte bytes in size.
   505  */
   506  int lsmFsTruncateDb(FileSystem *pFS, i64 nByte){
   507    if( pFS->fdDb==0 ) return LSM_OK;
   508    return lsmEnvTruncate(pFS->pEnv, pFS->fdDb, nByte);
   509  }
   510  
   511  /*
   512  ** Close the log file. Then delete it from the file-system. This function
   513  ** is called during database shutdown only.
   514  */
   515  int lsmFsCloseAndDeleteLog(FileSystem *pFS){
   516    char *zDel;
   517  
   518    if( pFS->fdLog ){
   519      lsmEnvClose(pFS->pEnv, pFS->fdLog );
   520      pFS->fdLog = 0;
   521    }
   522  
   523    zDel = lsmMallocPrintf(pFS->pEnv, "%s-log", pFS->zDb);
   524    if( zDel ){
   525      lsmEnvUnlink(pFS->pEnv, zDel);
   526      lsmFree(pFS->pEnv, zDel);
   527    }
   528    return LSM_OK;
   529  }
   530  
   531  /*
   532  ** Return true if page iReal of the database should be accessed using mmap.
   533  ** False otherwise.
   534  */
   535  static int fsMmapPage(FileSystem *pFS, Pgno iReal){
   536    return ((i64)iReal*pFS->nPagesize <= pFS->nMapLimit);
   537  }
   538  
   539  /*
   540  ** Given that there are currently nHash slots in the hash table, return 
   541  ** the hash key for file iFile, page iPg.
   542  */
   543  static int fsHashKey(int nHash, Pgno iPg){
   544    return (iPg % nHash);
   545  }
   546  
   547  /*
   548  ** This is a helper function for lsmFsOpen(). It opens a single file on
   549  ** disk (either the database or log file).
   550  */
   551  static lsm_file *fsOpenFile(
   552    FileSystem *pFS,                /* File system object */
   553    int bReadonly,                  /* True to open this file read-only */
   554    int bLog,                       /* True for log, false for db */
   555    int *pRc                        /* IN/OUT: Error code */
   556  ){
   557    lsm_file *pFile = 0;
   558    if( *pRc==LSM_OK ){
   559      int flags = (bReadonly ? LSM_OPEN_READONLY : 0);
   560      const char *zPath = (bLog ? pFS->zLog : pFS->zDb);
   561  
   562      *pRc = lsmEnvOpen(pFS->pEnv, zPath, flags, &pFile);
   563    }
   564    return pFile;
   565  }
   566  
   567  /*
   568  ** If it is not already open, this function opens the log file. It returns
   569  ** LSM_OK if successful (or if the log file was already open) or an LSM
   570  ** error code otherwise.
   571  **
   572  ** The log file must be opened before any of the following may be called:
   573  **
   574  **     lsmFsWriteLog
   575  **     lsmFsSyncLog
   576  **     lsmFsReadLog
   577  */
   578  int lsmFsOpenLog(lsm_db *db, int *pbOpen){
   579    int rc = LSM_OK;
   580    FileSystem *pFS = db->pFS;
   581  
   582    if( 0==pFS->fdLog ){ 
   583      pFS->fdLog = fsOpenFile(pFS, db->bReadonly, 1, &rc); 
   584  
   585      if( rc==LSM_IOERR_NOENT && db->bReadonly ){
   586        rc = LSM_OK;
   587      }
   588    }
   589  
   590    if( pbOpen ) *pbOpen = (pFS->fdLog!=0);
   591    return rc;
   592  }
   593  
   594  /*
   595  ** Close the log file, if it is open.
   596  */
   597  void lsmFsCloseLog(lsm_db *db){
   598    FileSystem *pFS = db->pFS;
   599    if( pFS->fdLog ){
   600      lsmEnvClose(pFS->pEnv, pFS->fdLog);
   601      pFS->fdLog = 0;
   602    }
   603  }
   604  
   605  /*
   606  ** Open a connection to a database stored within the file-system.
   607  **
   608  ** If parameter bReadonly is true, then open a read-only file-descriptor
   609  ** on the database file. It is possible that bReadonly will be false even
   610  ** if the user requested that pDb be opened read-only. This is because the
   611  ** file-descriptor may later on be recycled by a read-write connection.
   612  ** If the db file can be opened for read-write access, it always is. Parameter
   613  ** bReadonly is only ever true if it has already been determined that the
   614  ** db can only be opened for read-only access.
   615  **
   616  ** Return LSM_OK if successful or an lsm error code otherwise.
   617  */
   618  int lsmFsOpen(
   619    lsm_db *pDb,                    /* Database connection to open fd for */
   620    const char *zDb,                /* Full path to database file */
   621    int bReadonly                   /* True to open db file read-only */
   622  ){
   623    FileSystem *pFS;
   624    int rc = LSM_OK;
   625    int nDb = strlen(zDb);
   626    int nByte;
   627  
   628    assert( pDb->pFS==0 );
   629    assert( pDb->pWorker==0 && pDb->pClient==0 );
   630  
   631    nByte = sizeof(FileSystem) + nDb+1 + nDb+4+1;
   632    pFS = (FileSystem *)lsmMallocZeroRc(pDb->pEnv, nByte, &rc);
   633    if( pFS ){
   634      LsmFile *pLsmFile;
   635      pFS->zDb = (char *)&pFS[1];
   636      pFS->zLog = &pFS->zDb[nDb+1];
   637      pFS->nPagesize = LSM_DFLT_PAGE_SIZE;
   638      pFS->nBlocksize = LSM_DFLT_BLOCK_SIZE;
   639      pFS->nMetasize = LSM_META_PAGE_SIZE;
   640      pFS->nMetaRwSize = LSM_META_RW_PAGE_SIZE;
   641      pFS->pDb = pDb;
   642      pFS->pEnv = pDb->pEnv;
   643  
   644      /* Make a copy of the database and log file names. */
   645      memcpy(pFS->zDb, zDb, nDb+1);
   646      memcpy(pFS->zLog, zDb, nDb);
   647      memcpy(&pFS->zLog[nDb], "-log", 5);
   648  
   649      /* Allocate the hash-table here. At some point, it should be changed
   650      ** so that it can grow dynamicly. */
   651      pFS->nCacheMax = 2048*1024 / pFS->nPagesize;
   652      pFS->nHash = 4096;
   653      pFS->apHash = lsmMallocZeroRc(pDb->pEnv, sizeof(Page *) * pFS->nHash, &rc);
   654  
   655      /* Open the database file */
   656      pLsmFile = lsmDbRecycleFd(pDb);
   657      if( pLsmFile ){
   658        pFS->pLsmFile = pLsmFile;
   659        pFS->fdDb = pLsmFile->pFile;
   660        memset(pLsmFile, 0, sizeof(LsmFile));
   661      }else{
   662        pFS->pLsmFile = lsmMallocZeroRc(pDb->pEnv, sizeof(LsmFile), &rc);
   663        if( rc==LSM_OK ){
   664          pFS->fdDb = fsOpenFile(pFS, bReadonly, 0, &rc);
   665        }
   666      }
   667  
   668      if( rc!=LSM_OK ){
   669        lsmFsClose(pFS);
   670        pFS = 0;
   671      }else{
   672        pFS->szSector = lsmEnvSectorSize(pFS->pEnv, pFS->fdDb);
   673      }
   674    }
   675  
   676    pDb->pFS = pFS;
   677    return rc;
   678  }
   679  
   680  /*
   681  ** Configure the file-system object according to the current values of
   682  ** the LSM_CONFIG_MMAP and LSM_CONFIG_SET_COMPRESSION options.
   683  */
   684  int lsmFsConfigure(lsm_db *db){
   685    FileSystem *pFS = db->pFS;
   686    if( pFS ){
   687      lsm_env *pEnv = pFS->pEnv;
   688      Page *pPg;
   689  
   690      assert( pFS->nOut==0 );
   691      assert( pFS->pWaiting==0 );
   692      assert( pFS->pMapped==0 );
   693  
   694      /* Reset any compression/decompression buffers already allocated */
   695      lsmFree(pEnv, pFS->aIBuffer);
   696      lsmFree(pEnv, pFS->aOBuffer);
   697      pFS->nBuffer = 0;
   698  
   699      /* Unmap the file, if it is currently mapped */
   700      if( pFS->pMap ){
   701        lsmEnvRemap(pEnv, pFS->fdDb, -1, &pFS->pMap, &pFS->nMap);
   702        pFS->nMapLimit = 0;
   703      }
   704  
   705      /* Free all allocated page structures */
   706      pPg = pFS->pLruFirst;
   707      while( pPg ){
   708        Page *pNext = pPg->pLruNext;
   709        assert( pPg->flags & PAGE_FREE );
   710        lsmFree(pEnv, pPg->aData);
   711        lsmFree(pEnv, pPg);
   712        pPg = pNext;
   713      }
   714  
   715      pPg = pFS->pFree;
   716      while( pPg ){
   717        Page *pNext = pPg->pFreeNext;
   718        lsmFree(pEnv, pPg);
   719        pPg = pNext;
   720      }
   721  
   722      /* Zero pointers that point to deleted page objects */
   723      pFS->nCacheAlloc = 0;
   724      pFS->pLruFirst = 0;
   725      pFS->pLruLast = 0;
   726      pFS->pFree = 0;
   727      if( pFS->apHash ){
   728        memset(pFS->apHash, 0, pFS->nHash*sizeof(pFS->apHash[0]));
   729      }
   730  
   731      /* Configure the FileSystem object */
   732      if( db->compress.xCompress ){
   733        pFS->pCompress = &db->compress;
   734        pFS->nMapLimit = 0;
   735      }else{
   736        pFS->pCompress = 0;
   737        if( db->iMmap==1 ){
   738          /* Unlimited */
   739          pFS->nMapLimit = (i64)1 << 60;
   740        }else{
   741          /* iMmap is a limit in KB. Set nMapLimit to the same value in bytes. */
   742          pFS->nMapLimit = (i64)db->iMmap * 1024;
   743        }
   744      }
   745    }
   746  
   747    return LSM_OK;
   748  }
   749  
   750  /*
   751  ** Close and destroy a FileSystem object.
   752  */
   753  void lsmFsClose(FileSystem *pFS){
   754    if( pFS ){
   755      Page *pPg;
   756      lsm_env *pEnv = pFS->pEnv;
   757  
   758      assert( pFS->nOut==0 );
   759      pPg = pFS->pLruFirst;
   760      while( pPg ){
   761        Page *pNext = pPg->pLruNext;
   762        if( pPg->flags & PAGE_FREE ) lsmFree(pEnv, pPg->aData);
   763        lsmFree(pEnv, pPg);
   764        pPg = pNext;
   765      }
   766  
   767      pPg = pFS->pFree;
   768      while( pPg ){
   769        Page *pNext = pPg->pFreeNext;
   770        if( pPg->flags & PAGE_FREE ) lsmFree(pEnv, pPg->aData);
   771        lsmFree(pEnv, pPg);
   772        pPg = pNext;
   773      }
   774  
   775      if( pFS->fdDb ) lsmEnvClose(pFS->pEnv, pFS->fdDb );
   776      if( pFS->fdLog ) lsmEnvClose(pFS->pEnv, pFS->fdLog );
   777      lsmFree(pEnv, pFS->pLsmFile);
   778      lsmFree(pEnv, pFS->apHash);
   779      lsmFree(pEnv, pFS->aIBuffer);
   780      lsmFree(pEnv, pFS->aOBuffer);
   781      lsmFree(pEnv, pFS);
   782    }
   783  }
   784  
   785  /*
   786  ** This function is called when closing a database handle (i.e. lsm_close()) 
   787  ** if there exist other connections to the same database within this process.
   788  ** In that case the file-descriptor open on the database file is not closed
   789  ** when the FileSystem object is destroyed, as this would cause any POSIX
   790  ** locks held by the other connections to be silently dropped (see "man close"
   791  ** for details). Instead, the file-descriptor is stored in a list by the
   792  ** lsm_shared.c module until it is either closed or reused.
   793  **
   794  ** This function returns a pointer to an object that can be linked into
   795  ** the list described above. The returned object now 'owns' the database
   796  ** file descriptr, so that when the FileSystem object is destroyed, it
   797  ** will not be closed. 
   798  **
   799  ** This function may be called at most once in the life-time of a 
   800  ** FileSystem object. The results of any operations involving the database 
   801  ** file descriptor are undefined once this function has been called.
   802  **
   803  ** None of this is necessary on non-POSIX systems. But we do it anyway in
   804  ** the name of using as similar code as possible on all platforms.
   805  */
   806  LsmFile *lsmFsDeferClose(FileSystem *pFS){
   807    LsmFile *p = pFS->pLsmFile;
   808    assert( p->pNext==0 );
   809    p->pFile = pFS->fdDb;
   810    pFS->fdDb = 0;
   811    pFS->pLsmFile = 0;
   812    return p;
   813  }
   814  
   815  /*
   816  ** Allocate a buffer and populate it with the output of the xFileid() 
   817  ** method of the database file handle. If successful, set *ppId to point 
   818  ** to the buffer and *pnId to the number of bytes in the buffer and return
   819  ** LSM_OK. Otherwise, set *ppId and *pnId to zero and return an LSM
   820  ** error code.
   821  */
   822  int lsmFsFileid(lsm_db *pDb, void **ppId, int *pnId){
   823    lsm_env *pEnv = pDb->pEnv;
   824    FileSystem *pFS = pDb->pFS;
   825    int rc;
   826    int nId = 0;
   827    void *pId;
   828  
   829    rc = pEnv->xFileid(pFS->fdDb, 0, &nId);
   830    pId = lsmMallocZeroRc(pEnv, nId, &rc);
   831    if( rc==LSM_OK ) rc = pEnv->xFileid(pFS->fdDb, pId, &nId);
   832  
   833    if( rc!=LSM_OK ){
   834      lsmFree(pEnv, pId);
   835      pId = 0;
   836      nId = 0;
   837    }
   838  
   839    *ppId = pId;
   840    *pnId = nId;
   841    return rc;
   842  }
   843  
   844  /*
   845  ** Return the nominal page-size used by this file-system. Actual pages
   846  ** may be smaller or larger than this value.
   847  */
   848  int lsmFsPageSize(FileSystem *pFS){
   849    return pFS->nPagesize;
   850  }
   851  
   852  /*
   853  ** Return the block-size used by this file-system.
   854  */
   855  int lsmFsBlockSize(FileSystem *pFS){
   856    return pFS->nBlocksize;
   857  }
   858  
   859  /*
   860  ** Configure the nominal page-size used by this file-system. Actual 
   861  ** pages may be smaller or larger than this value.
   862  */
   863  void lsmFsSetPageSize(FileSystem *pFS, int nPgsz){
   864    pFS->nPagesize = nPgsz;
   865    pFS->nCacheMax = 2048*1024 / pFS->nPagesize;
   866  }
   867  
   868  /*
   869  ** Configure the block-size used by this file-system. 
   870  */
   871  void lsmFsSetBlockSize(FileSystem *pFS, int nBlocksize){
   872    pFS->nBlocksize = nBlocksize;
   873  }
   874  
   875  /*
   876  ** Return the page number of the first page on block iBlock. Blocks are
   877  ** numbered starting from 1.
   878  **
   879  ** For a compressed database, page numbers are byte offsets. The first
   880  ** page on each block is the byte offset immediately following the 4-byte
   881  ** "previous block" pointer at the start of each block.
   882  */
   883  static Pgno fsFirstPageOnBlock(FileSystem *pFS, int iBlock){
   884    Pgno iPg;
   885    if( pFS->pCompress ){
   886      if( iBlock==1 ){
   887        iPg = pFS->nMetasize * 2 + 4;
   888      }else{
   889        iPg = pFS->nBlocksize * (Pgno)(iBlock-1) + 4;
   890      }
   891    }else{
   892      const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
   893      if( iBlock==1 ){
   894        iPg = 1 + ((pFS->nMetasize*2 + pFS->nPagesize - 1) / pFS->nPagesize);
   895      }else{
   896        iPg = 1 + (iBlock-1) * nPagePerBlock;
   897      }
   898    }
   899    return iPg;
   900  }
   901  
   902  /*
   903  ** Return the page number of the last page on block iBlock. Blocks are
   904  ** numbered starting from 1.
   905  **
   906  ** For a compressed database, page numbers are byte offsets. The first
   907  ** page on each block is the byte offset of the byte immediately before 
   908  ** the 4-byte "next block" pointer at the end of each block.
   909  */
   910  static Pgno fsLastPageOnBlock(FileSystem *pFS, int iBlock){
   911    if( pFS->pCompress ){
   912      return pFS->nBlocksize * (Pgno)iBlock - 1 - 4;
   913    }else{
   914      const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
   915      return iBlock * nPagePerBlock;
   916    }
   917  }
   918  
   919  /*
   920  ** Return the block number of the block that page iPg is located on. 
   921  ** Blocks are numbered starting from 1.
   922  */
   923  static int fsPageToBlock(FileSystem *pFS, Pgno iPg){
   924    if( pFS->pCompress ){
   925      return (int)((iPg / pFS->nBlocksize) + 1);
   926    }else{
   927      return (int)(1 + ((iPg-1) / (pFS->nBlocksize / pFS->nPagesize)));
   928    }
   929  }
   930  
   931  /*
   932  ** Return true if page iPg is the last page on its block.
   933  **
   934  ** This function is only called in non-compressed database mode.
   935  */
   936  static int fsIsLast(FileSystem *pFS, Pgno iPg){
   937    const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
   938    assert( !pFS->pCompress );
   939    return ( iPg && (iPg % nPagePerBlock)==0 );
   940  }
   941  
   942  /*
   943  ** Return true if page iPg is the first page on its block.
   944  **
   945  ** This function is only called in non-compressed database mode.
   946  */
   947  static int fsIsFirst(FileSystem *pFS, Pgno iPg){
   948    const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
   949    assert( !pFS->pCompress );
   950    return ( (iPg % nPagePerBlock)==1
   951          || (iPg<nPagePerBlock && iPg==fsFirstPageOnBlock(pFS, 1))
   952    );
   953  }
   954  
   955  /*
   956  ** Given a page reference, return a pointer to the buffer containing the 
   957  ** pages contents. If parameter pnData is not NULL, set *pnData to the size
   958  ** of the buffer in bytes before returning.
   959  */
   960  u8 *lsmFsPageData(Page *pPage, int *pnData){
   961    if( pnData ){
   962      *pnData = pPage->nData;
   963    }
   964    return pPage->aData;
   965  }
   966  
   967  /*
   968  ** Return the page number of a page.
   969  */
   970  Pgno lsmFsPageNumber(Page *pPage){
   971    /* assert( (pPage->flags & PAGE_DIRTY)==0 ); */
   972    return pPage ? pPage->iPg : 0;
   973  }
   974  
   975  /*
   976  ** Page pPg is currently part of the LRU list belonging to pFS. Remove
   977  ** it from the list. pPg->pLruNext and pPg->pLruPrev are cleared by this
   978  ** operation.
   979  */
   980  static void fsPageRemoveFromLru(FileSystem *pFS, Page *pPg){
   981    assert( pPg->pLruNext || pPg==pFS->pLruLast );
   982    assert( pPg->pLruPrev || pPg==pFS->pLruFirst );
   983    if( pPg->pLruNext ){
   984      pPg->pLruNext->pLruPrev = pPg->pLruPrev;
   985    }else{
   986      pFS->pLruLast = pPg->pLruPrev;
   987    }
   988    if( pPg->pLruPrev ){
   989      pPg->pLruPrev->pLruNext = pPg->pLruNext;
   990    }else{
   991      pFS->pLruFirst = pPg->pLruNext;
   992    }
   993    pPg->pLruPrev = 0;
   994    pPg->pLruNext = 0;
   995  }
   996  
   997  /*
   998  ** Page pPg is not currently part of the LRU list belonging to pFS. Add it.
   999  */
  1000  static void fsPageAddToLru(FileSystem *pFS, Page *pPg){
  1001    assert( pPg->pLruNext==0 && pPg->pLruPrev==0 );
  1002    pPg->pLruPrev = pFS->pLruLast;
  1003    if( pPg->pLruPrev ){
  1004      pPg->pLruPrev->pLruNext = pPg;
  1005    }else{
  1006      pFS->pLruFirst = pPg;
  1007    }
  1008    pFS->pLruLast = pPg;
  1009  }
  1010  
  1011  /*
  1012  ** Page pPg is currently stored in the apHash/nHash hash table. Remove it.
  1013  */
  1014  static void fsPageRemoveFromHash(FileSystem *pFS, Page *pPg){
  1015    int iHash;
  1016    Page **pp;
  1017  
  1018    iHash = fsHashKey(pFS->nHash, pPg->iPg);
  1019    for(pp=&pFS->apHash[iHash]; *pp!=pPg; pp=&(*pp)->pHashNext);
  1020    *pp = pPg->pHashNext;
  1021    pPg->pHashNext = 0;
  1022  }
  1023  
  1024  /*
  1025  ** Free a Page object allocated by fsPageBuffer().
  1026  */
  1027  static void fsPageBufferFree(Page *pPg){
  1028    pPg->pFS->nCacheAlloc--;
  1029    lsmFree(pPg->pFS->pEnv, pPg->aData);
  1030    lsmFree(pPg->pFS->pEnv, pPg);
  1031  }
  1032  
  1033  
  1034  /*
  1035  ** Purge the cache of all non-mmap pages with nRef==0.
  1036  */
  1037  void lsmFsPurgeCache(FileSystem *pFS){
  1038    Page *pPg;
  1039  
  1040    pPg = pFS->pLruFirst;
  1041    while( pPg ){
  1042      Page *pNext = pPg->pLruNext;
  1043      assert( pPg->flags & PAGE_FREE );
  1044      fsPageRemoveFromHash(pFS, pPg);
  1045      fsPageBufferFree(pPg);
  1046      pPg = pNext;
  1047    }
  1048    pFS->pLruFirst = 0;
  1049    pFS->pLruLast = 0;
  1050  
  1051    assert( pFS->nCacheAlloc<=pFS->nOut && pFS->nCacheAlloc>=0 );
  1052  }
  1053  
  1054  /*
  1055  ** Search the hash-table for page iPg. If an entry is round, return a pointer
  1056  ** to it. Otherwise, return NULL.
  1057  **
  1058  ** Either way, if argument piHash is not NULL set *piHash to the hash slot
  1059  ** number that page iPg would be stored in before returning.
  1060  */
  1061  static Page *fsPageFindInHash(FileSystem *pFS, Pgno iPg, int *piHash){
  1062    Page *p;                        /* Return value */
  1063    int iHash = fsHashKey(pFS->nHash, iPg);
  1064  
  1065    if( piHash ) *piHash = iHash;
  1066    for(p=pFS->apHash[iHash]; p; p=p->pHashNext){
  1067      if( p->iPg==iPg) break;
  1068    }
  1069    return p;
  1070  }
  1071  
  1072  /*
  1073  ** Allocate and return a non-mmap Page object. If there are already 
  1074  ** nCacheMax such Page objects outstanding, try to recycle an existing 
  1075  ** Page instead.
  1076  */
  1077  static int fsPageBuffer(
  1078    FileSystem *pFS, 
  1079    Page **ppOut
  1080  ){
  1081    int rc = LSM_OK;
  1082    Page *pPage = 0;
  1083    if( pFS->pLruFirst==0 || pFS->nCacheAlloc<pFS->nCacheMax ){
  1084      /* Allocate a new Page object */
  1085      pPage = lsmMallocZero(pFS->pEnv, sizeof(Page));
  1086      if( !pPage ){
  1087        rc = LSM_NOMEM_BKPT;
  1088      }else{
  1089        pPage->aData = (u8 *)lsmMalloc(pFS->pEnv, pFS->nPagesize);
  1090        if( !pPage->aData ){
  1091          lsmFree(pFS->pEnv, pPage);
  1092          rc = LSM_NOMEM_BKPT;
  1093          pPage = 0;
  1094        }else{
  1095          pFS->nCacheAlloc++;
  1096        }
  1097      }
  1098    }else{
  1099      /* Reuse an existing Page object */
  1100      u8 *aData;
  1101      pPage = pFS->pLruFirst;
  1102      aData = pPage->aData;
  1103      fsPageRemoveFromLru(pFS, pPage);
  1104      fsPageRemoveFromHash(pFS, pPage);
  1105  
  1106      memset(pPage, 0, sizeof(Page));
  1107      pPage->aData = aData;
  1108    }
  1109  
  1110    if( pPage ){
  1111      pPage->flags = PAGE_FREE;
  1112    }
  1113    *ppOut = pPage;
  1114    return rc;
  1115  }
  1116  
  1117  /*
  1118  ** Assuming *pRc is initially LSM_OK, attempt to ensure that the 
  1119  ** memory-mapped region is at least iSz bytes in size. If it is not already,
  1120  ** iSz bytes in size, extend it and update the pointers associated with any
  1121  ** outstanding Page objects.
  1122  **
  1123  ** If *pRc is not LSM_OK when this function is called, it is a no-op. 
  1124  ** Otherwise, *pRc is set to an lsm error code if an error occurs, or
  1125  ** left unmodified otherwise.
  1126  **
  1127  ** This function is never called in compressed database mode.
  1128  */
  1129  static void fsGrowMapping(
  1130    FileSystem *pFS,                /* File system object */
  1131    i64 iSz,                        /* Minimum size to extend mapping to */
  1132    int *pRc                        /* IN/OUT: Error code */
  1133  ){
  1134    assert( pFS->pCompress==0 );
  1135    assert( PAGE_HASPREV==4 );
  1136  
  1137    if( *pRc==LSM_OK && iSz>pFS->nMap ){
  1138      int rc;
  1139      u8 *aOld = pFS->pMap;
  1140      rc = lsmEnvRemap(pFS->pEnv, pFS->fdDb, iSz, &pFS->pMap, &pFS->nMap);
  1141      if( rc==LSM_OK && pFS->pMap!=aOld ){
  1142        Page *pFix;
  1143        i64 iOff = (u8 *)pFS->pMap - aOld;
  1144        for(pFix=pFS->pMapped; pFix; pFix=pFix->pMappedNext){
  1145          pFix->aData += iOff;
  1146        }
  1147        lsmSortedRemap(pFS->pDb);
  1148      }
  1149      *pRc = rc;
  1150    }
  1151  }
  1152  
  1153  /*
  1154  ** If it is mapped, unmap the database file.
  1155  */
  1156  int lsmFsUnmap(FileSystem *pFS){
  1157    int rc = LSM_OK;
  1158    if( pFS ){
  1159      rc = lsmEnvRemap(pFS->pEnv, pFS->fdDb, -1, &pFS->pMap, &pFS->nMap);
  1160    }
  1161    return rc;
  1162  }
  1163  
  1164  /*
  1165  ** fsync() the database file.
  1166  */
  1167  int lsmFsSyncDb(FileSystem *pFS, int nBlock){
  1168    return lsmEnvSync(pFS->pEnv, pFS->fdDb);
  1169  }
  1170  
  1171  /*
  1172  ** If block iBlk has been redirected according to the redirections in the
  1173  ** object passed as the first argument, return the destination block to
  1174  ** which it is redirected. Otherwise, return a copy of iBlk.
  1175  */
  1176  static int fsRedirectBlock(Redirect *p, int iBlk){
  1177    if( p ){
  1178      int i;
  1179      for(i=0; i<p->n; i++){
  1180        if( iBlk==p->a[i].iFrom ) return p->a[i].iTo;
  1181      }
  1182    }
  1183    assert( iBlk!=0 );
  1184    return iBlk;
  1185  }
  1186  
  1187  /*
  1188  ** If page iPg has been redirected according to the redirections in the
  1189  ** object passed as the second argument, return the destination page to
  1190  ** which it is redirected. Otherwise, return a copy of iPg.
  1191  */
  1192  Pgno lsmFsRedirectPage(FileSystem *pFS, Redirect *pRedir, Pgno iPg){
  1193    Pgno iReal = iPg;
  1194  
  1195    if( pRedir ){
  1196      const int nPagePerBlock = (
  1197          pFS->pCompress ? pFS->nBlocksize : (pFS->nBlocksize / pFS->nPagesize)
  1198      );
  1199      int iBlk = fsPageToBlock(pFS, iPg);
  1200      int i;
  1201      for(i=0; i<pRedir->n; i++){
  1202        int iFrom = pRedir->a[i].iFrom;
  1203        if( iFrom>iBlk ) break;
  1204        if( iFrom==iBlk ){
  1205          int iTo = pRedir->a[i].iTo;
  1206          iReal = iPg - (Pgno)(iFrom - iTo) * nPagePerBlock;
  1207          if( iTo==1 ){
  1208            iReal += (fsFirstPageOnBlock(pFS, 1)-1);
  1209          }
  1210          break;
  1211        }
  1212      }
  1213    }
  1214  
  1215    assert( iReal!=0 );
  1216    return iReal;
  1217  }
  1218  
  1219  /* Required by the circular fsBlockNext<->fsPageGet dependency. */
  1220  static int fsPageGet(FileSystem *, Segment *, Pgno, int, Page **, int *);
  1221  
  1222  /*
  1223  ** Parameter iBlock is a database file block. This function reads the value 
  1224  ** stored in the blocks "next block" pointer and stores it in *piNext.
  1225  ** LSM_OK is returned if everything is successful, or an LSM error code
  1226  ** otherwise.
  1227  */
  1228  static int fsBlockNext(
  1229    FileSystem *pFS,                /* File-system object handle */
  1230    Segment *pSeg,                  /* Use this segment for block redirects */
  1231    int iBlock,                     /* Read field from this block */
  1232    int *piNext                     /* OUT: Next block in linked list */
  1233  ){
  1234    int rc;
  1235    int iRead;                      /* Read block from here */
  1236    
  1237    if( pSeg ){
  1238      iRead = fsRedirectBlock(pSeg->pRedirect, iBlock);
  1239    }else{
  1240      iRead = iBlock;
  1241    }
  1242  
  1243    assert( pFS->nMapLimit==0 || pFS->pCompress==0 );
  1244    if( pFS->pCompress ){
  1245      i64 iOff;                     /* File offset to read data from */
  1246      u8 aNext[4];                  /* 4-byte pointer read from db file */
  1247  
  1248      iOff = (i64)iRead * pFS->nBlocksize - sizeof(aNext);
  1249      rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aNext, sizeof(aNext));
  1250      if( rc==LSM_OK ){
  1251        *piNext = (int)lsmGetU32(aNext);
  1252      }
  1253    }else{
  1254      const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
  1255      Page *pLast;
  1256      rc = fsPageGet(pFS, 0, iRead*nPagePerBlock, 0, &pLast, 0);
  1257      if( rc==LSM_OK ){
  1258        *piNext = lsmGetU32(&pLast->aData[pFS->nPagesize-4]);
  1259        lsmFsPageRelease(pLast);
  1260      }
  1261    }
  1262  
  1263    if( pSeg ){
  1264      *piNext = fsRedirectBlock(pSeg->pRedirect, *piNext);
  1265    }
  1266    return rc;
  1267  }
  1268  
  1269  /*
  1270  ** Return the page number of the last page on the same block as page iPg.
  1271  */
  1272  Pgno fsLastPageOnPagesBlock(FileSystem *pFS, Pgno iPg){
  1273    return fsLastPageOnBlock(pFS, fsPageToBlock(pFS, iPg));
  1274  }
  1275  
  1276  /*
  1277  ** Read nData bytes of data from offset iOff of the database file into
  1278  ** buffer aData. If this means reading past the end of a block, follow
  1279  ** the block pointer to the next block and continue reading.
  1280  **
  1281  ** Offset iOff is an absolute offset - not subject to any block redirection.
  1282  ** However any block pointer followed is. Use pSeg->pRedirect in this case.
  1283  **
  1284  ** This function is only called in compressed database mode.
  1285  */
  1286  static int fsReadData(
  1287    FileSystem *pFS,                /* File-system handle */
  1288    Segment *pSeg,                  /* Block redirection */
  1289    i64 iOff,                       /* Read data from this offset */
  1290    u8 *aData,                      /* Buffer to read data into */
  1291    int nData                       /* Number of bytes to read */
  1292  ){
  1293    i64 iEob;                       /* End of block */
  1294    int nRead;
  1295    int rc;
  1296  
  1297    assert( pFS->pCompress );
  1298  
  1299    iEob = fsLastPageOnPagesBlock(pFS, iOff) + 1;
  1300    nRead = (int)LSM_MIN(iEob - iOff, nData);
  1301  
  1302    rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aData, nRead);
  1303    if( rc==LSM_OK && nRead!=nData ){
  1304      int iBlk;
  1305  
  1306      rc = fsBlockNext(pFS, pSeg, fsPageToBlock(pFS, iOff), &iBlk);
  1307      if( rc==LSM_OK ){
  1308        i64 iOff2 = fsFirstPageOnBlock(pFS, iBlk);
  1309        rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff2, &aData[nRead], nData-nRead);
  1310      }
  1311    }
  1312  
  1313    return rc;
  1314  }
  1315  
  1316  /*
  1317  ** Parameter iBlock is a database file block. This function reads the value 
  1318  ** stored in the blocks "previous block" pointer and stores it in *piPrev.
  1319  ** LSM_OK is returned if everything is successful, or an LSM error code
  1320  ** otherwise.
  1321  */
  1322  static int fsBlockPrev(
  1323    FileSystem *pFS,                /* File-system object handle */
  1324    Segment *pSeg,                  /* Use this segment for block redirects */
  1325    int iBlock,                     /* Read field from this block */
  1326    int *piPrev                     /* OUT: Previous block in linked list */
  1327  ){
  1328    int rc = LSM_OK;                /* Return code */
  1329  
  1330    assert( pFS->nMapLimit==0 || pFS->pCompress==0 );
  1331    assert( iBlock>0 );
  1332  
  1333    if( pFS->pCompress ){
  1334      i64 iOff = fsFirstPageOnBlock(pFS, iBlock) - 4;
  1335      u8 aPrev[4];                  /* 4-byte pointer read from db file */
  1336      rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aPrev, sizeof(aPrev));
  1337      if( rc==LSM_OK ){
  1338        Redirect *pRedir = (pSeg ? pSeg->pRedirect : 0);
  1339        *piPrev = fsRedirectBlock(pRedir, (int)lsmGetU32(aPrev));
  1340      }
  1341    }else{
  1342      assert( 0 );
  1343    }
  1344    return rc;
  1345  }
  1346  
  1347  /*
  1348  ** Encode and decode routines for record size fields.
  1349  */
  1350  static void putRecordSize(u8 *aBuf, int nByte, int bFree){
  1351    aBuf[0] = (u8)(nByte >> 14) | 0x80;
  1352    aBuf[1] = ((u8)(nByte >>  7) & 0x7F) | (bFree ? 0x00 : 0x80);
  1353    aBuf[2] = (u8)nByte | 0x80;
  1354  }
  1355  static int getRecordSize(u8 *aBuf, int *pbFree){
  1356    int nByte;
  1357    nByte  = (aBuf[0] & 0x7F) << 14;
  1358    nByte += (aBuf[1] & 0x7F) << 7;
  1359    nByte += (aBuf[2] & 0x7F);
  1360    *pbFree = !(aBuf[1] & 0x80);
  1361    return nByte;
  1362  }
  1363  
  1364  /*
  1365  ** Subtract iSub from database file offset iOff and set *piRes to the
  1366  ** result. If doing so means passing the start of a block, follow the
  1367  ** block pointer stored in the first 4 bytes of the block.
  1368  **
  1369  ** Offset iOff is an absolute offset - not subject to any block redirection.
  1370  ** However any block pointer followed is. Use pSeg->pRedirect in this case.
  1371  **
  1372  ** Return LSM_OK if successful or an lsm error code if an error occurs.
  1373  */
  1374  static int fsSubtractOffset(
  1375    FileSystem *pFS, 
  1376    Segment *pSeg,
  1377    i64 iOff, 
  1378    int iSub, 
  1379    i64 *piRes
  1380  ){
  1381    i64 iStart;
  1382    int iBlk = 0;
  1383    int rc;
  1384  
  1385    assert( pFS->pCompress );
  1386  
  1387    iStart = fsFirstPageOnBlock(pFS, fsPageToBlock(pFS, iOff));
  1388    if( (iOff-iSub)>=iStart ){
  1389      *piRes = (iOff-iSub);
  1390      return LSM_OK;
  1391    }
  1392  
  1393    rc = fsBlockPrev(pFS, pSeg, fsPageToBlock(pFS, iOff), &iBlk);
  1394    *piRes = fsLastPageOnBlock(pFS, iBlk) - iSub + (iOff - iStart + 1);
  1395    return rc;
  1396  }
  1397  
  1398  /*
  1399  ** Add iAdd to database file offset iOff and set *piRes to the
  1400  ** result. If doing so means passing the end of a block, follow the
  1401  ** block pointer stored in the last 4 bytes of the block.
  1402  **
  1403  ** Offset iOff is an absolute offset - not subject to any block redirection.
  1404  ** However any block pointer followed is. Use pSeg->pRedirect in this case.
  1405  **
  1406  ** Return LSM_OK if successful or an lsm error code if an error occurs.
  1407  */
  1408  static int fsAddOffset(
  1409    FileSystem *pFS, 
  1410    Segment *pSeg,
  1411    i64 iOff, 
  1412    int iAdd, 
  1413    i64 *piRes
  1414  ){
  1415    i64 iEob;
  1416    int iBlk;
  1417    int rc;
  1418  
  1419    assert( pFS->pCompress );
  1420  
  1421    iEob = fsLastPageOnPagesBlock(pFS, iOff);
  1422    if( (iOff+iAdd)<=iEob ){
  1423      *piRes = (iOff+iAdd);
  1424      return LSM_OK;
  1425    }
  1426  
  1427    rc = fsBlockNext(pFS, pSeg, fsPageToBlock(pFS, iOff), &iBlk);
  1428    *piRes = fsFirstPageOnBlock(pFS, iBlk) + iAdd - (iEob - iOff + 1);
  1429    return rc;
  1430  }
  1431  
  1432  /*
  1433  ** If it is not already allocated, allocate either the FileSystem.aOBuffer (if
  1434  ** bWrite is true) or the FileSystem.aIBuffer (if bWrite is false). Return
  1435  ** LSM_OK if successful if the attempt to allocate memory fails.
  1436  */
  1437  static int fsAllocateBuffer(FileSystem *pFS, int bWrite){
  1438    u8 **pp;                        /* Pointer to either aIBuffer or aOBuffer */
  1439  
  1440    assert( pFS->pCompress );
  1441  
  1442    /* If neither buffer has been allocated, figure out how large they
  1443    ** should be. Store this value in FileSystem.nBuffer.  */
  1444    if( pFS->nBuffer==0 ){
  1445      assert( pFS->aIBuffer==0 && pFS->aOBuffer==0 );
  1446      pFS->nBuffer = pFS->pCompress->xBound(pFS->pCompress->pCtx, pFS->nPagesize);
  1447      if( pFS->nBuffer<(pFS->szSector+6) ){
  1448        pFS->nBuffer = pFS->szSector+6;
  1449      }
  1450    }
  1451  
  1452    pp = (bWrite ? &pFS->aOBuffer : &pFS->aIBuffer);
  1453    if( *pp==0 ){
  1454      *pp = lsmMalloc(pFS->pEnv, LSM_MAX(pFS->nBuffer, pFS->nPagesize));
  1455      if( *pp==0 ) return LSM_NOMEM_BKPT;
  1456    }
  1457  
  1458    return LSM_OK;
  1459  }
  1460  
  1461  /*
  1462  ** This function is only called in compressed database mode. It reads and
  1463  ** uncompresses the compressed data for page pPg from the database and
  1464  ** populates the pPg->aData[] buffer and pPg->nCompress field.
  1465  **
  1466  ** It is possible that instead of a page record, there is free space
  1467  ** at offset pPg->iPgno. In this case no data is read from the file, but
  1468  ** output variable *pnSpace is set to the total number of free bytes.
  1469  **
  1470  ** LSM_OK is returned if successful, or an LSM error code otherwise.
  1471  */
  1472  static int fsReadPagedata(
  1473    FileSystem *pFS,                /* File-system handle */
  1474    Segment *pSeg,                  /* pPg is part of this segment */
  1475    Page *pPg,                      /* Page to read and uncompress data for */
  1476    int *pnSpace                    /* OUT: Total bytes of free space */
  1477  ){
  1478    lsm_compress *p = pFS->pCompress;
  1479    i64 iOff = pPg->iPg;
  1480    u8 aSz[3];
  1481    int rc;
  1482  
  1483    assert( p && pPg->nCompress==0 );
  1484  
  1485    if( fsAllocateBuffer(pFS, 0) ) return LSM_NOMEM;
  1486  
  1487    rc = fsReadData(pFS, pSeg, iOff, aSz, sizeof(aSz));
  1488  
  1489    if( rc==LSM_OK ){
  1490      int bFree;
  1491      if( aSz[0] & 0x80 ){
  1492        pPg->nCompress = (int)getRecordSize(aSz, &bFree);
  1493      }else{
  1494        pPg->nCompress = (int)aSz[0] - sizeof(aSz)*2;
  1495        bFree = 1;
  1496      }
  1497      if( bFree ){
  1498        if( pnSpace ){
  1499          *pnSpace = pPg->nCompress + sizeof(aSz)*2;
  1500        }else{
  1501          rc = LSM_CORRUPT_BKPT;
  1502        }
  1503      }else{
  1504        rc = fsAddOffset(pFS, pSeg, iOff, 3, &iOff);
  1505        if( rc==LSM_OK ){
  1506          if( pPg->nCompress>pFS->nBuffer ){
  1507            rc = LSM_CORRUPT_BKPT;
  1508          }else{
  1509            rc = fsReadData(pFS, pSeg, iOff, pFS->aIBuffer, pPg->nCompress);
  1510          }
  1511          if( rc==LSM_OK ){
  1512            int n = pFS->nPagesize;
  1513            rc = p->xUncompress(p->pCtx, 
  1514                (char *)pPg->aData, &n, 
  1515                (const char *)pFS->aIBuffer, pPg->nCompress
  1516            );
  1517            if( rc==LSM_OK && n!=pPg->pFS->nPagesize ){
  1518              rc = LSM_CORRUPT_BKPT;
  1519            }
  1520          }
  1521        }
  1522      }
  1523    }
  1524    return rc;
  1525  }
  1526  
  1527  /*
  1528  ** Return a handle for a database page.
  1529  **
  1530  ** If this file-system object is accessing a compressed database it may be
  1531  ** that there is no page record at database file offset iPg. Instead, there
  1532  ** may be a free space record. In this case, set *ppPg to NULL and *pnSpace
  1533  ** to the total number of free bytes before returning.
  1534  **
  1535  ** If no error occurs, LSM_OK is returned. Otherwise, an lsm error code.
  1536  */
  1537  static int fsPageGet(
  1538    FileSystem *pFS,                /* File-system handle */
  1539    Segment *pSeg,                  /* Block redirection to use (or NULL) */
  1540    Pgno iPg,                       /* Page id */
  1541    int noContent,                  /* True to not load content from disk */
  1542    Page **ppPg,                    /* OUT: New page handle */
  1543    int *pnSpace                    /* OUT: Bytes of free space */
  1544  ){
  1545    Page *p;
  1546    int iHash;
  1547    int rc = LSM_OK;
  1548  
  1549    /* In most cases iReal is the same as iPg. Except, if pSeg->pRedirect is 
  1550    ** not NULL, and the block containing iPg has been redirected, then iReal
  1551    ** is the page number after redirection.  */
  1552    Pgno iReal = lsmFsRedirectPage(pFS, (pSeg ? pSeg->pRedirect : 0), iPg);
  1553  
  1554    assert_lists_are_ok(pFS);
  1555    assert( iPg>=fsFirstPageOnBlock(pFS, 1) );
  1556    assert( iReal>=fsFirstPageOnBlock(pFS, 1) );
  1557    *ppPg = 0;
  1558  
  1559    /* Search the hash-table for the page */
  1560    p = fsPageFindInHash(pFS, iReal, &iHash);
  1561  
  1562    if( p ){
  1563      assert( p->flags & PAGE_FREE );
  1564      if( p->nRef==0 ) fsPageRemoveFromLru(pFS, p);
  1565    }else{
  1566  
  1567      if( fsMmapPage(pFS, iReal) ){
  1568        i64 iEnd = (i64)iReal * pFS->nPagesize;
  1569        fsGrowMapping(pFS, iEnd, &rc);
  1570        if( rc!=LSM_OK ) return rc;
  1571  
  1572        if( pFS->pFree ){
  1573          p = pFS->pFree;
  1574          pFS->pFree = p->pFreeNext;
  1575          assert( p->nRef==0 );
  1576        }else{
  1577          p = lsmMallocZeroRc(pFS->pEnv, sizeof(Page), &rc);
  1578          if( rc ) return rc;
  1579          p->pFS = pFS;
  1580        }
  1581        p->aData = &((u8 *)pFS->pMap)[pFS->nPagesize * (iReal-1)];
  1582        p->iPg = iReal;
  1583  
  1584        /* This page now carries a pointer to the mapping. Link it in to
  1585        ** the FileSystem.pMapped list.  */
  1586        assert( p->pMappedNext==0 );
  1587        p->pMappedNext = pFS->pMapped;
  1588        pFS->pMapped = p;
  1589  
  1590        assert( pFS->pCompress==0 );
  1591        assert( (p->flags & PAGE_FREE)==0 );
  1592      }else{
  1593        rc = fsPageBuffer(pFS, &p);
  1594        if( rc==LSM_OK ){
  1595          int nSpace = 0;
  1596          p->iPg = iReal;
  1597          p->nRef = 0;
  1598          p->pFS = pFS;
  1599          assert( p->flags==0 || p->flags==PAGE_FREE );
  1600  
  1601  #ifdef LSM_DEBUG
  1602          memset(p->aData, 0x56, pFS->nPagesize);
  1603  #endif
  1604          assert( p->pLruNext==0 && p->pLruPrev==0 );
  1605          if( noContent==0 ){
  1606            if( pFS->pCompress ){
  1607              rc = fsReadPagedata(pFS, pSeg, p, &nSpace);
  1608            }else{
  1609              int nByte = pFS->nPagesize;
  1610              i64 iOff = (i64)(iReal-1) * pFS->nPagesize;
  1611              rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, p->aData, nByte);
  1612            }
  1613            pFS->nRead++;
  1614          }
  1615  
  1616          /* If the xRead() call was successful (or not attempted), link the
  1617          ** page into the page-cache hash-table. Otherwise, if it failed,
  1618          ** free the buffer. */
  1619          if( rc==LSM_OK && nSpace==0 ){
  1620            p->pHashNext = pFS->apHash[iHash];
  1621            pFS->apHash[iHash] = p;
  1622          }else{
  1623            fsPageBufferFree(p);
  1624            p = 0;
  1625            if( pnSpace ) *pnSpace = nSpace;
  1626          }
  1627        }
  1628      }
  1629  
  1630      assert( (rc==LSM_OK && (p || (pnSpace && *pnSpace)))
  1631           || (rc!=LSM_OK && p==0) 
  1632      );
  1633    }
  1634  
  1635    if( rc==LSM_OK && p ){
  1636      if( pFS->pCompress==0 && (fsIsLast(pFS, iReal) || fsIsFirst(pFS, iReal)) ){
  1637        p->nData = pFS->nPagesize - 4;
  1638        if( fsIsFirst(pFS, iReal) && p->nRef==0 ){
  1639          p->aData += 4;
  1640          p->flags |= PAGE_HASPREV;
  1641        }
  1642      }else{
  1643        p->nData = pFS->nPagesize;
  1644      }
  1645      pFS->nOut += (p->nRef==0);
  1646      p->nRef++;
  1647    }
  1648    *ppPg = p;
  1649    return rc;
  1650  }
  1651  
  1652  /*
  1653  ** Read the 64-bit checkpoint id of the checkpoint currently stored on meta
  1654  ** page iMeta of the database file. If no error occurs, store the id value
  1655  ** in *piVal and return LSM_OK. Otherwise, return an LSM error code and leave
  1656  ** *piVal unmodified.
  1657  **
  1658  ** If a checkpointer connection is currently updating meta-page iMeta, or an
  1659  ** earlier checkpointer crashed while doing so, the value read into *piVal
  1660  ** may be garbage. It is the callers responsibility to deal with this.
  1661  */
  1662  int lsmFsReadSyncedId(lsm_db *db, int iMeta, i64 *piVal){
  1663    FileSystem *pFS = db->pFS;
  1664    int rc = LSM_OK;
  1665  
  1666    assert( iMeta==1 || iMeta==2 );
  1667    if( pFS->nMapLimit>0 ){
  1668      fsGrowMapping(pFS, iMeta*LSM_META_PAGE_SIZE, &rc);
  1669      if( rc==LSM_OK ){
  1670        *piVal = (i64)lsmGetU64(&((u8 *)pFS->pMap)[(iMeta-1)*LSM_META_PAGE_SIZE]);
  1671      }
  1672    }else{
  1673      MetaPage *pMeta = 0;
  1674      rc = lsmFsMetaPageGet(pFS, 0, iMeta, &pMeta);
  1675      if( rc==LSM_OK ){
  1676        *piVal = (i64)lsmGetU64(pMeta->aData);
  1677        lsmFsMetaPageRelease(pMeta);
  1678      }
  1679    }
  1680  
  1681    return rc;
  1682  }
  1683  
  1684  
  1685  /*
  1686  ** Return true if the first or last page of segment pRun falls between iFirst
  1687  ** and iLast, inclusive, and pRun is not equal to pIgnore.
  1688  */
  1689  static int fsRunEndsBetween(
  1690    Segment *pRun, 
  1691    Segment *pIgnore, 
  1692    Pgno iFirst, 
  1693    Pgno iLast
  1694  ){
  1695    return (pRun!=pIgnore && (
  1696          (pRun->iFirst>=iFirst && pRun->iFirst<=iLast)
  1697       || (pRun->iLastPg>=iFirst && pRun->iLastPg<=iLast)
  1698    ));
  1699  }
  1700  
  1701  /*
  1702  ** Return true if level pLevel contains a segment other than pIgnore for
  1703  ** which the first or last page is between iFirst and iLast, inclusive.
  1704  */
  1705  static int fsLevelEndsBetween(
  1706    Level *pLevel, 
  1707    Segment *pIgnore, 
  1708    Pgno iFirst, 
  1709    Pgno iLast
  1710  ){
  1711    int i;
  1712  
  1713    if( fsRunEndsBetween(&pLevel->lhs, pIgnore, iFirst, iLast) ){
  1714      return 1;
  1715    }
  1716    for(i=0; i<pLevel->nRight; i++){
  1717      if( fsRunEndsBetween(&pLevel->aRhs[i], pIgnore, iFirst, iLast) ){
  1718        return 1;
  1719      }
  1720    }
  1721  
  1722    return 0;
  1723  }
  1724  
  1725  /*
  1726  ** Block iBlk is no longer in use by segment pIgnore. If it is not in use
  1727  ** by any other segment, move it to the free block list.
  1728  */
  1729  static int fsFreeBlock(
  1730    FileSystem *pFS,                /* File system object */
  1731    Snapshot *pSnapshot,            /* Worker snapshot */
  1732    Segment *pIgnore,               /* Ignore this run when searching */
  1733    int iBlk                        /* Block number of block to free */
  1734  ){
  1735    int rc = LSM_OK;                /* Return code */
  1736    Pgno iFirst;                    /* First page on block iBlk */
  1737    Pgno iLast;                     /* Last page on block iBlk */
  1738    Level *pLevel;                  /* Used to iterate through levels */
  1739  
  1740    int iIn;                        /* Used to iterate through append points */
  1741    int iOut = 0;                   /* Used to output append points */
  1742    Pgno *aApp = pSnapshot->aiAppend;
  1743  
  1744    iFirst = fsFirstPageOnBlock(pFS, iBlk);
  1745    iLast = fsLastPageOnBlock(pFS, iBlk);
  1746  
  1747    /* Check if any other run in the snapshot has a start or end page 
  1748    ** within this block. If there is such a run, return early. */
  1749    for(pLevel=lsmDbSnapshotLevel(pSnapshot); pLevel; pLevel=pLevel->pNext){
  1750      if( fsLevelEndsBetween(pLevel, pIgnore, iFirst, iLast) ){
  1751        return LSM_OK;
  1752      }
  1753    }
  1754  
  1755    /* Remove any entries that lie on this block from the append-list. */
  1756    for(iIn=0; iIn<LSM_APPLIST_SZ; iIn++){
  1757      if( aApp[iIn]<iFirst || aApp[iIn]>iLast ){
  1758        aApp[iOut++] = aApp[iIn];
  1759      }
  1760    }
  1761    while( iOut<LSM_APPLIST_SZ ) aApp[iOut++] = 0;
  1762  
  1763    if( rc==LSM_OK ){
  1764      rc = lsmBlockFree(pFS->pDb, iBlk);
  1765    }
  1766    return rc;
  1767  }
  1768  
  1769  /*
  1770  ** Delete or otherwise recycle the blocks currently occupied by run pDel.
  1771  */
  1772  int lsmFsSortedDelete(
  1773    FileSystem *pFS, 
  1774    Snapshot *pSnapshot,
  1775    int bZero,                      /* True to zero the Segment structure */
  1776    Segment *pDel
  1777  ){
  1778    if( pDel->iFirst ){
  1779      int rc = LSM_OK;
  1780  
  1781      int iBlk;
  1782      int iLastBlk;
  1783  
  1784      iBlk = fsPageToBlock(pFS, pDel->iFirst);
  1785      iLastBlk = fsPageToBlock(pFS, pDel->iLastPg);
  1786  
  1787      /* Mark all blocks currently used by this sorted run as free */
  1788      while( iBlk && rc==LSM_OK ){
  1789        int iNext = 0;
  1790        if( iBlk!=iLastBlk ){
  1791          rc = fsBlockNext(pFS, pDel, iBlk, &iNext);
  1792        }else if( bZero==0 && pDel->iLastPg!=fsLastPageOnBlock(pFS, iLastBlk) ){
  1793          break;
  1794        }
  1795        rc = fsFreeBlock(pFS, pSnapshot, pDel, iBlk);
  1796        iBlk = iNext;
  1797      }
  1798  
  1799      if( pDel->pRedirect ){
  1800        assert( pDel->pRedirect==&pSnapshot->redirect );
  1801        pSnapshot->redirect.n = 0;
  1802      }
  1803  
  1804      if( bZero ) memset(pDel, 0, sizeof(Segment));
  1805    }
  1806    return LSM_OK;
  1807  }
  1808  
  1809  /*
  1810  ** aPgno is an array containing nPgno page numbers. Return the smallest page
  1811  ** number from the array that falls on block iBlk. Or, if none of the pages
  1812  ** in aPgno[] fall on block iBlk, return 0.
  1813  */
  1814  static Pgno firstOnBlock(FileSystem *pFS, int iBlk, Pgno *aPgno, int nPgno){
  1815    Pgno iRet = 0;
  1816    int i;
  1817    for(i=0; i<nPgno; i++){
  1818      Pgno iPg = aPgno[i];
  1819      if( fsPageToBlock(pFS, iPg)==iBlk && (iRet==0 || iPg<iRet) ){
  1820        iRet = iPg;
  1821      }
  1822    }
  1823    return iRet;
  1824  }
  1825  
  1826  #ifndef NDEBUG
  1827  /*
  1828  ** Return true if page iPg, which is a part of segment p, lies on
  1829  ** a redirected block. 
  1830  */
  1831  static int fsPageRedirects(FileSystem *pFS, Segment *p, Pgno iPg){
  1832    return (iPg!=0 && iPg!=lsmFsRedirectPage(pFS, p->pRedirect, iPg));
  1833  }
  1834  
  1835  /*
  1836  ** Return true if the second argument is not NULL and any of the first
  1837  ** last or root pages lie on a redirected block. 
  1838  */
  1839  static int fsSegmentRedirects(FileSystem *pFS, Segment *p){
  1840    return (p && (
  1841        fsPageRedirects(pFS, p, p->iFirst)
  1842     || fsPageRedirects(pFS, p, p->iRoot)
  1843     || fsPageRedirects(pFS, p, p->iLastPg)
  1844    ));
  1845  }
  1846  #endif
  1847  
  1848  /*
  1849  ** Argument aPgno is an array of nPgno page numbers. All pages belong to
  1850  ** the segment pRun. This function gobbles from the start of the run to the
  1851  ** first page that appears in aPgno[] (i.e. so that the aPgno[] entry is
  1852  ** the new first page of the run).
  1853  */
  1854  void lsmFsGobble(
  1855    lsm_db *pDb,
  1856    Segment *pRun, 
  1857    Pgno *aPgno,
  1858    int nPgno
  1859  ){
  1860    int rc = LSM_OK;
  1861    FileSystem *pFS = pDb->pFS;
  1862    Snapshot *pSnapshot = pDb->pWorker;
  1863    int iBlk;
  1864  
  1865    assert( pRun->nSize>0 );
  1866    assert( 0==fsSegmentRedirects(pFS, pRun) );
  1867    assert( nPgno>0 && 0==fsPageRedirects(pFS, pRun, aPgno[0]) );
  1868  
  1869    iBlk = fsPageToBlock(pFS, pRun->iFirst);
  1870    pRun->nSize += (int)(pRun->iFirst - fsFirstPageOnBlock(pFS, iBlk));
  1871  
  1872    while( rc==LSM_OK ){
  1873      int iNext = 0;
  1874      Pgno iFirst = firstOnBlock(pFS, iBlk, aPgno, nPgno);
  1875      if( iFirst ){
  1876        pRun->iFirst = iFirst;
  1877        break;
  1878      }
  1879      rc = fsBlockNext(pFS, pRun, iBlk, &iNext);
  1880      if( rc==LSM_OK ) rc = fsFreeBlock(pFS, pSnapshot, pRun, iBlk);
  1881      pRun->nSize -= (int)(
  1882          1 + fsLastPageOnBlock(pFS, iBlk) - fsFirstPageOnBlock(pFS, iBlk)
  1883      );
  1884      iBlk = iNext;
  1885    }
  1886  
  1887    pRun->nSize -= (int)(pRun->iFirst - fsFirstPageOnBlock(pFS, iBlk));
  1888    assert( pRun->nSize>0 );
  1889  }
  1890  
  1891  /*
  1892  ** This function is only used in compressed database mode.
  1893  **
  1894  ** Argument iPg is the page number (byte offset) of a page within segment
  1895  ** pSeg. The page record, including all headers, is nByte bytes in size.
  1896  ** Before returning, set *piNext to the page number of the next page in
  1897  ** the segment, or to zero if iPg is the last.
  1898  **
  1899  ** In other words, do:
  1900  **
  1901  **   *piNext = iPg + nByte;
  1902  **
  1903  ** But take block overflow and redirection into account.
  1904  */
  1905  static int fsNextPageOffset(
  1906    FileSystem *pFS,                /* File system object */
  1907    Segment *pSeg,                  /* Segment to move within */
  1908    Pgno iPg,                       /* Offset of current page */
  1909    int nByte,                      /* Size of current page including headers */
  1910    Pgno *piNext                    /* OUT: Offset of next page. Or zero (EOF) */
  1911  ){
  1912    Pgno iNext;
  1913    int rc;
  1914  
  1915    assert( pFS->pCompress );
  1916  
  1917    rc = fsAddOffset(pFS, pSeg, iPg, nByte-1, &iNext);
  1918    if( pSeg && iNext==pSeg->iLastPg ){
  1919      iNext = 0;
  1920    }else if( rc==LSM_OK ){
  1921      rc = fsAddOffset(pFS, pSeg, iNext, 1, &iNext);
  1922    }
  1923  
  1924    *piNext = iNext;
  1925    return rc;
  1926  }
  1927  
  1928  /*
  1929  ** This function is only used in compressed database mode.
  1930  **
  1931  ** Argument iPg is the page number of a pagethat appears in segment pSeg.
  1932  ** This function determines the page number of the previous page in the
  1933  ** same run. *piPrev is set to the previous page number before returning.
  1934  **
  1935  ** LSM_OK is returned if no error occurs. Otherwise, an lsm error code.
  1936  ** If any value other than LSM_OK is returned, then the final value of
  1937  ** *piPrev is undefined.
  1938  */
  1939  static int fsGetPageBefore(
  1940    FileSystem *pFS, 
  1941    Segment *pSeg, 
  1942    Pgno iPg, 
  1943    Pgno *piPrev
  1944  ){
  1945    u8 aSz[3];
  1946    int rc;
  1947    i64 iRead;
  1948  
  1949    assert( pFS->pCompress );
  1950  
  1951    rc = fsSubtractOffset(pFS, pSeg, iPg, sizeof(aSz), &iRead);
  1952    if( rc==LSM_OK ) rc = fsReadData(pFS, pSeg, iRead, aSz, sizeof(aSz));
  1953  
  1954    if( rc==LSM_OK ){
  1955      int bFree;
  1956      int nSz;
  1957      if( aSz[2] & 0x80 ){
  1958        nSz = getRecordSize(aSz, &bFree) + sizeof(aSz)*2;
  1959      }else{
  1960        nSz = (int)(aSz[2] & 0x7F);
  1961        bFree = 1;
  1962      }
  1963      rc = fsSubtractOffset(pFS, pSeg, iPg, nSz, piPrev);
  1964    }
  1965  
  1966    return rc;
  1967  }
  1968  
  1969  /*
  1970  ** The first argument to this function is a valid reference to a database
  1971  ** file page that is part of a sorted run. If parameter eDir is -1, this 
  1972  ** function attempts to locate and load the previous page in the same run. 
  1973  ** Or, if eDir is +1, it attempts to find the next page in the same run.
  1974  ** The results of passing an eDir value other than positive or negative one
  1975  ** are undefined.
  1976  **
  1977  ** If parameter pRun is not NULL then it must point to the run that page
  1978  ** pPg belongs to. In this case, if pPg is the first or last page of the
  1979  ** run, and the request is for the previous or next page, respectively,
  1980  ** *ppNext is set to NULL before returning LSM_OK. If pRun is NULL, then it
  1981  ** is assumed that the next or previous page, as requested, exists.
  1982  **
  1983  ** If the previous/next page does exist and is successfully loaded, *ppNext
  1984  ** is set to point to it and LSM_OK is returned. Otherwise, if an error 
  1985  ** occurs, *ppNext is set to NULL and and lsm error code returned.
  1986  **
  1987  ** Page references returned by this function should be released by the 
  1988  ** caller using lsmFsPageRelease().
  1989  */
  1990  int lsmFsDbPageNext(Segment *pRun, Page *pPg, int eDir, Page **ppNext){
  1991    int rc = LSM_OK;
  1992    FileSystem *pFS = pPg->pFS;
  1993    Pgno iPg = pPg->iPg;
  1994  
  1995    assert( 0==fsSegmentRedirects(pFS, pRun) );
  1996    if( pFS->pCompress ){
  1997      int nSpace = pPg->nCompress + 2*3;
  1998  
  1999      do {
  2000        if( eDir>0 ){
  2001          rc = fsNextPageOffset(pFS, pRun, iPg, nSpace, &iPg);
  2002        }else{
  2003          if( iPg==pRun->iFirst ){
  2004            iPg = 0;
  2005          }else{
  2006            rc = fsGetPageBefore(pFS, pRun, iPg, &iPg);
  2007          }
  2008        }
  2009  
  2010        nSpace = 0;
  2011        if( iPg!=0 ){
  2012          rc = fsPageGet(pFS, pRun, iPg, 0, ppNext, &nSpace);
  2013          assert( (*ppNext==0)==(rc!=LSM_OK || nSpace>0) );
  2014        }else{
  2015          *ppNext = 0;
  2016        }
  2017      }while( nSpace>0 && rc==LSM_OK );
  2018  
  2019    }else{
  2020      Redirect *pRedir = pRun ? pRun->pRedirect : 0;
  2021      assert( eDir==1 || eDir==-1 );
  2022      if( eDir<0 ){
  2023        if( pRun && iPg==pRun->iFirst ){
  2024          *ppNext = 0;
  2025          return LSM_OK;
  2026        }else if( fsIsFirst(pFS, iPg) ){
  2027          assert( pPg->flags & PAGE_HASPREV );
  2028          iPg = fsLastPageOnBlock(pFS, lsmGetU32(&pPg->aData[-4]));
  2029        }else{
  2030          iPg--;
  2031        }
  2032      }else{
  2033        if( pRun ){
  2034          if( iPg==pRun->iLastPg ){
  2035            *ppNext = 0;
  2036            return LSM_OK;
  2037          }
  2038        }
  2039  
  2040        if( fsIsLast(pFS, iPg) ){
  2041          int iBlk = fsRedirectBlock(
  2042              pRedir, lsmGetU32(&pPg->aData[pFS->nPagesize-4])
  2043          );
  2044          iPg = fsFirstPageOnBlock(pFS, iBlk);
  2045        }else{
  2046          iPg++;
  2047        }
  2048      }
  2049      rc = fsPageGet(pFS, pRun, iPg, 0, ppNext, 0);
  2050    }
  2051  
  2052    return rc;
  2053  }
  2054  
  2055  /*
  2056  ** This function is called when creating a new segment to determine if the
  2057  ** first part of it can be written following an existing segment on an
  2058  ** already allocated block. If it is possible, the page number of the first
  2059  ** page to use for the new segment is returned. Otherwise zero.
  2060  **
  2061  ** If argument pLvl is not NULL, then this function will not attempt to
  2062  ** start the new segment immediately following any segment that is part
  2063  ** of the right-hand-side of pLvl.
  2064  */
  2065  static Pgno findAppendPoint(FileSystem *pFS, Level *pLvl){
  2066    int i;
  2067    Pgno *aiAppend = pFS->pDb->pWorker->aiAppend;
  2068    Pgno iRet = 0;
  2069  
  2070    for(i=LSM_APPLIST_SZ-1; iRet==0 && i>=0; i--){
  2071      if( (iRet = aiAppend[i]) ){
  2072        if( pLvl ){
  2073          int iBlk = fsPageToBlock(pFS, iRet);
  2074          int j;
  2075          for(j=0; iRet && j<pLvl->nRight; j++){
  2076            if( fsPageToBlock(pFS, pLvl->aRhs[j].iLastPg)==iBlk ){
  2077              iRet = 0;
  2078            }
  2079          }
  2080        }
  2081        if( iRet ) aiAppend[i] = 0;
  2082      }
  2083    }
  2084    return iRet;
  2085  }
  2086  
  2087  /*
  2088  ** Append a page to the left-hand-side of pLvl. Set the ref-count to 1 and
  2089  ** return a pointer to it. The page is writable until either 
  2090  ** lsmFsPagePersist() is called on it or the ref-count drops to zero.
  2091  */
  2092  int lsmFsSortedAppend(
  2093    FileSystem *pFS, 
  2094    Snapshot *pSnapshot,
  2095    Level *pLvl,
  2096    int bDefer,
  2097    Page **ppOut
  2098  ){
  2099    int rc = LSM_OK;
  2100    Page *pPg = 0;
  2101    Pgno iApp = 0;
  2102    Pgno iNext = 0;
  2103    Segment *p = &pLvl->lhs;
  2104    Pgno iPrev = p->iLastPg;
  2105  
  2106    *ppOut = 0;
  2107    assert( p->pRedirect==0 );
  2108  
  2109    if( pFS->pCompress || bDefer ){
  2110      /* In compressed database mode the page is not assigned a page number
  2111      ** or location in the database file at this point. This will be done
  2112      ** by the lsmFsPagePersist() call.  */
  2113      rc = fsPageBuffer(pFS, &pPg);
  2114      if( rc==LSM_OK ){
  2115        pPg->pFS = pFS;
  2116        pPg->pSeg = p;
  2117        pPg->iPg = 0;
  2118        pPg->flags |= PAGE_DIRTY;
  2119        pPg->nData = pFS->nPagesize;
  2120        assert( pPg->aData );
  2121        if( pFS->pCompress==0 ) pPg->nData -= 4;
  2122  
  2123        pPg->nRef = 1;
  2124        pFS->nOut++;
  2125      }
  2126    }else{
  2127      if( iPrev==0 ){
  2128        iApp = findAppendPoint(pFS, pLvl);
  2129      }else if( fsIsLast(pFS, iPrev) ){
  2130        int iNext2;
  2131        rc = fsBlockNext(pFS, 0, fsPageToBlock(pFS, iPrev), &iNext2);
  2132        if( rc!=LSM_OK ) return rc;
  2133        iApp = fsFirstPageOnBlock(pFS, iNext2);
  2134      }else{
  2135        iApp = iPrev + 1;
  2136      }
  2137  
  2138      /* If this is the first page allocated, or if the page allocated is the
  2139      ** last in the block, also allocate the next block here.  */
  2140      if( iApp==0 || fsIsLast(pFS, iApp) ){
  2141        int iNew;                     /* New block number */
  2142  
  2143        rc = lsmBlockAllocate(pFS->pDb, 0, &iNew);
  2144        if( rc!=LSM_OK ) return rc;
  2145        if( iApp==0 ){
  2146          iApp = fsFirstPageOnBlock(pFS, iNew);
  2147        }else{
  2148          iNext = fsFirstPageOnBlock(pFS, iNew);
  2149        }
  2150      }
  2151  
  2152      /* Grab the new page. */
  2153      pPg = 0;
  2154      rc = fsPageGet(pFS, 0, iApp, 1, &pPg, 0);
  2155      assert( rc==LSM_OK || pPg==0 );
  2156  
  2157      /* If this is the first or last page of a block, fill in the pointer 
  2158       ** value at the end of the new page. */
  2159      if( rc==LSM_OK ){
  2160        p->nSize++;
  2161        p->iLastPg = iApp;
  2162        if( p->iFirst==0 ) p->iFirst = iApp;
  2163        pPg->flags |= PAGE_DIRTY;
  2164  
  2165        if( fsIsLast(pFS, iApp) ){
  2166          lsmPutU32(&pPg->aData[pFS->nPagesize-4], fsPageToBlock(pFS, iNext));
  2167        }else if( fsIsFirst(pFS, iApp) ){
  2168          lsmPutU32(&pPg->aData[-4], fsPageToBlock(pFS, iPrev));
  2169        }
  2170      }
  2171    }
  2172  
  2173    *ppOut = pPg;
  2174    return rc;
  2175  }
  2176  
  2177  /*
  2178  ** Mark the segment passed as the second argument as finished. Once a segment
  2179  ** is marked as finished it is not possible to append any further pages to 
  2180  ** it.
  2181  **
  2182  ** Return LSM_OK if successful or an lsm error code if an error occurs.
  2183  */
  2184  int lsmFsSortedFinish(FileSystem *pFS, Segment *p){
  2185    int rc = LSM_OK;
  2186    if( p && p->iLastPg ){
  2187      assert( p->pRedirect==0 );
  2188  
  2189      /* Check if the last page of this run happens to be the last of a block.
  2190      ** If it is, then an extra block has already been allocated for this run.
  2191      ** Shift this extra block back to the free-block list. 
  2192      **
  2193      ** Otherwise, add the first free page in the last block used by the run
  2194      ** to the lAppend list.
  2195      */
  2196      if( fsLastPageOnPagesBlock(pFS, p->iLastPg)!=p->iLastPg ){
  2197        int i;
  2198        Pgno *aiAppend = pFS->pDb->pWorker->aiAppend;
  2199        for(i=0; i<LSM_APPLIST_SZ; i++){
  2200          if( aiAppend[i]==0 ){
  2201            aiAppend[i] = p->iLastPg+1;
  2202            break;
  2203          }
  2204        }
  2205      }else if( pFS->pCompress==0 ){
  2206        Page *pLast;
  2207        rc = fsPageGet(pFS, 0, p->iLastPg, 0, &pLast, 0);
  2208        if( rc==LSM_OK ){
  2209          int iBlk = (int)lsmGetU32(&pLast->aData[pFS->nPagesize-4]);
  2210          lsmBlockRefree(pFS->pDb, iBlk);
  2211          lsmFsPageRelease(pLast);
  2212        }
  2213      }else{
  2214        int iBlk = 0;
  2215        rc = fsBlockNext(pFS, p, fsPageToBlock(pFS, p->iLastPg), &iBlk);
  2216        if( rc==LSM_OK ){
  2217          lsmBlockRefree(pFS->pDb, iBlk);
  2218        }
  2219      }
  2220    }
  2221    return rc;
  2222  }
  2223  
  2224  /*
  2225  ** Obtain a reference to page number iPg.
  2226  **
  2227  ** Return LSM_OK if successful, or an lsm error code if an error occurs.
  2228  */
  2229  int lsmFsDbPageGet(FileSystem *pFS, Segment *pSeg, Pgno iPg, Page **ppPg){
  2230    return fsPageGet(pFS, pSeg, iPg, 0, ppPg, 0);
  2231  }
  2232  
  2233  /*
  2234  ** Obtain a reference to the last page in the segment passed as the 
  2235  ** second argument.
  2236  **
  2237  ** Return LSM_OK if successful, or an lsm error code if an error occurs.
  2238  */
  2239  int lsmFsDbPageLast(FileSystem *pFS, Segment *pSeg, Page **ppPg){
  2240    int rc;
  2241    Pgno iPg = pSeg->iLastPg;
  2242    if( pFS->pCompress ){
  2243      int nSpace;
  2244      iPg++;
  2245      do {
  2246        nSpace = 0;
  2247        rc = fsGetPageBefore(pFS, pSeg, iPg, &iPg);
  2248        if( rc==LSM_OK ){
  2249          rc = fsPageGet(pFS, pSeg, iPg, 0, ppPg, &nSpace);
  2250        }
  2251      }while( rc==LSM_OK && nSpace>0 );
  2252  
  2253    }else{
  2254      rc = fsPageGet(pFS, pSeg, iPg, 0, ppPg, 0);
  2255    }
  2256    return rc;
  2257  }
  2258  
  2259  /*
  2260  ** Return a reference to meta-page iPg. If successful, LSM_OK is returned
  2261  ** and *ppPg populated with the new page reference. The reference should
  2262  ** be released by the caller using lsmFsPageRelease().
  2263  **
  2264  ** Otherwise, if an error occurs, *ppPg is set to NULL and an LSM error 
  2265  ** code is returned.
  2266  */
  2267  int lsmFsMetaPageGet(
  2268    FileSystem *pFS,                /* File-system connection */
  2269    int bWrite,                     /* True for write access, false for read */
  2270    int iPg,                        /* Either 1 or 2 */
  2271    MetaPage **ppPg                 /* OUT: Pointer to MetaPage object */
  2272  ){
  2273    int rc = LSM_OK;
  2274    MetaPage *pPg;
  2275    assert( iPg==1 || iPg==2 );
  2276  
  2277    pPg = lsmMallocZeroRc(pFS->pEnv, sizeof(Page), &rc);
  2278  
  2279    if( pPg ){
  2280      i64 iOff = (iPg-1) * pFS->nMetasize;
  2281      if( pFS->nMapLimit>0 ){
  2282        fsGrowMapping(pFS, 2*pFS->nMetasize, &rc);
  2283        pPg->aData = (u8 *)(pFS->pMap) + iOff;
  2284      }else{
  2285        pPg->aData = lsmMallocRc(pFS->pEnv, pFS->nMetasize, &rc);
  2286        if( rc==LSM_OK && bWrite==0 ){
  2287          rc = lsmEnvRead(
  2288              pFS->pEnv, pFS->fdDb, iOff, pPg->aData, pFS->nMetaRwSize
  2289          );
  2290        }
  2291  #ifndef NDEBUG
  2292        /* pPg->aData causes an uninitialized access via a downstreadm write().
  2293           After discussion on this list, this memory should not, for performance
  2294           reasons, be memset. However, tracking down "real" misuse is more
  2295           difficult with this "false" positive, so it is set when NDEBUG.
  2296        */
  2297        else if( rc==LSM_OK ){
  2298          memset( pPg->aData, 0x77, pFS->nMetasize );
  2299        }
  2300  #endif
  2301      }
  2302  
  2303      if( rc!=LSM_OK ){
  2304        if( pFS->nMapLimit==0 ) lsmFree(pFS->pEnv, pPg->aData);
  2305        lsmFree(pFS->pEnv, pPg);
  2306        pPg = 0;
  2307      }else{
  2308        pPg->iPg = iPg;
  2309        pPg->bWrite = bWrite;
  2310        pPg->pFS = pFS;
  2311      }
  2312    }
  2313  
  2314    *ppPg = pPg;
  2315    return rc;
  2316  }
  2317  
  2318  /*
  2319  ** Release a meta-page reference obtained via a call to lsmFsMetaPageGet().
  2320  */
  2321  int lsmFsMetaPageRelease(MetaPage *pPg){
  2322    int rc = LSM_OK;
  2323    if( pPg ){
  2324      FileSystem *pFS = pPg->pFS;
  2325  
  2326      if( pFS->nMapLimit==0 ){
  2327        if( pPg->bWrite ){
  2328          i64 iOff = (pPg->iPg==2 ? pFS->nMetasize : 0);
  2329          int nWrite = pFS->nMetaRwSize;
  2330          rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iOff, pPg->aData, nWrite);
  2331        }
  2332        lsmFree(pFS->pEnv, pPg->aData);
  2333      }
  2334  
  2335      lsmFree(pFS->pEnv, pPg);
  2336    }
  2337    return rc;
  2338  }
  2339  
  2340  /*
  2341  ** Return a pointer to a buffer containing the data associated with the
  2342  ** meta-page passed as the first argument. If parameter pnData is not NULL,
  2343  ** set *pnData to the size of the meta-page in bytes before returning.
  2344  */
  2345  u8 *lsmFsMetaPageData(MetaPage *pPg, int *pnData){
  2346    if( pnData ) *pnData = pPg->pFS->nMetaRwSize;
  2347    return pPg->aData;
  2348  }
  2349  
  2350  /*
  2351  ** Return true if page is currently writable. This is used in assert() 
  2352  ** statements only.
  2353  */
  2354  #ifndef NDEBUG
  2355  int lsmFsPageWritable(Page *pPg){
  2356    return (pPg->flags & PAGE_DIRTY) ? 1 : 0;
  2357  }
  2358  #endif
  2359  
  2360  /*
  2361  ** This is called when block iFrom is being redirected to iTo. If page 
  2362  ** number (*piPg) lies on block iFrom, then calculate the equivalent
  2363  ** page on block iTo and set *piPg to this value before returning.
  2364  */
  2365  static void fsMovePage(
  2366    FileSystem *pFS,                /* File system object */
  2367    int iTo,                        /* Destination block */
  2368    int iFrom,                      /* Source block */
  2369    Pgno *piPg                      /* IN/OUT: Page number */
  2370  ){
  2371    Pgno iPg = *piPg;
  2372    if( iFrom==fsPageToBlock(pFS, iPg) ){
  2373      const int nPagePerBlock = (
  2374          pFS->pCompress ? pFS ->nBlocksize : (pFS->nBlocksize / pFS->nPagesize)
  2375      );
  2376      *piPg = iPg - (Pgno)(iFrom - iTo) * nPagePerBlock;
  2377    }
  2378  }
  2379  
  2380  /*
  2381  ** Copy the contents of block iFrom to block iTo. 
  2382  **
  2383  ** It is safe to assume that there are no outstanding references to pages 
  2384  ** on block iTo. And that block iFrom is not currently being written. In
  2385  ** other words, the data can be read and written directly.
  2386  */
  2387  int lsmFsMoveBlock(FileSystem *pFS, Segment *pSeg, int iTo, int iFrom){
  2388    Snapshot *p = pFS->pDb->pWorker;
  2389    int rc = LSM_OK;
  2390    int i;
  2391    i64 nMap;
  2392  
  2393    i64 iFromOff = (i64)(iFrom-1) * pFS->nBlocksize;
  2394    i64 iToOff = (i64)(iTo-1) * pFS->nBlocksize;
  2395    
  2396    assert( iTo!=1 );
  2397    assert( iFrom>iTo );
  2398  
  2399    /* Grow the mapping as required. */
  2400    nMap = LSM_MIN(pFS->nMapLimit, (i64)iFrom * pFS->nBlocksize);
  2401    fsGrowMapping(pFS, nMap, &rc);
  2402  
  2403    if( rc==LSM_OK ){
  2404      const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
  2405      int nSz = pFS->nPagesize;
  2406      u8 *aBuf = 0;
  2407      u8 *aData = 0;
  2408  
  2409      for(i=0; rc==LSM_OK && i<nPagePerBlock; i++){
  2410        i64 iOff = iFromOff + i*nSz;
  2411  
  2412        /* Set aData to point to a buffer containing the from page */
  2413        if( (iOff+nSz)<=pFS->nMapLimit ){
  2414          u8 *aMap = (u8 *)(pFS->pMap);
  2415          aData = &aMap[iOff];
  2416        }else{
  2417          if( aBuf==0 ){
  2418            aBuf = (u8 *)lsmMallocRc(pFS->pEnv, nSz, &rc);
  2419            if( aBuf==0 ) break;
  2420          }
  2421          aData = aBuf;
  2422          rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aData, nSz);
  2423        }
  2424  
  2425        /* Copy aData to the to page */
  2426        if( rc==LSM_OK ){
  2427          iOff = iToOff + i*nSz;
  2428          if( (iOff+nSz)<=pFS->nMapLimit ){
  2429            u8 *aMap = (u8 *)(pFS->pMap);
  2430            memcpy(&aMap[iOff], aData, nSz);
  2431          }else{
  2432            rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iOff, aData, nSz);
  2433          }
  2434        }
  2435      }
  2436      lsmFree(pFS->pEnv, aBuf);
  2437      lsmFsPurgeCache(pFS);
  2438    }
  2439  
  2440    /* Update append-point list if necessary */
  2441    for(i=0; i<LSM_APPLIST_SZ; i++){
  2442      fsMovePage(pFS, iTo, iFrom, &p->aiAppend[i]);
  2443    }
  2444  
  2445    /* Update the Segment structure itself */
  2446    fsMovePage(pFS, iTo, iFrom, &pSeg->iFirst);
  2447    fsMovePage(pFS, iTo, iFrom, &pSeg->iLastPg);
  2448    fsMovePage(pFS, iTo, iFrom, &pSeg->iRoot);
  2449  
  2450    return rc;
  2451  }
  2452  
/*
** Append raw data to a segment. Return the database file offset that the
** data is written to (this may be used as the page number if the data
** being appended is a new page record).
**
** This function is only used in compressed database mode.
**
** *pRc is an IN/OUT error code. If it is not LSM_OK on entry, nothing is
** read or written, pSeg is left untouched and 0 is returned. Otherwise
** *pRc is set to the outcome of the operation before returning. Note that
** pSeg->iLastPg is assigned even if an error occurs part way through.
**
** If the nData bytes do not fit in the space that remains on the current
** block, the data is split: as much as fits is written at the append
** point and the remainder is written at the start of the next block.
*/
static Pgno fsAppendData(
  FileSystem *pFS,                /* File-system handle */
  Segment *pSeg,                  /* Segment to append to */
  const u8 *aData,                /* Buffer containing data to write */
  int nData,                      /* Size of buffer aData[] in bytes */
  int *pRc                        /* IN/OUT: Error code */
){
  Pgno iRet = 0;                  /* Return value: offset of first byte */
  int rc = *pRc;
  assert( pFS->pCompress );
  if( rc==LSM_OK ){
    int nRem = 0;                 /* Bytes left over for the next block */
    int nWrite = 0;               /* Bytes written to the current block */
    Pgno iLastOnBlock;            /* Last usable offset on iApp's block */
    Pgno iApp = pSeg->iLastPg+1;  /* Offset at which to append */

    /* If this is the first data written into the segment, find an append-point
    ** or allocate a new block.  */
    if( iApp==1 ){
      pSeg->iFirst = iApp = findAppendPoint(pFS, 0);
      if( iApp==0 ){
        int iBlk;
        /* NOTE(review): iBlk is used on the next line even if the allocation
        ** fails. Whether it holds a defined value in that case depends on
        ** lsmBlockAllocate() - confirm. */
        rc = lsmBlockAllocate(pFS->pDb, 0, &iBlk);
        pSeg->iFirst = iApp = fsFirstPageOnBlock(pFS, iBlk);
      }
    }
    iRet = iApp;

    /* Write as much data as is possible at iApp (usually all of it). */
    iLastOnBlock = fsLastPageOnPagesBlock(pFS, iApp);
    if( rc==LSM_OK ){
      int nSpace = (int)(iLastOnBlock - iApp + 1);
      nWrite = LSM_MIN(nData, nSpace);
      nRem = nData - nWrite;
      assert( nWrite>=0 );
      if( nWrite!=0 ){
        rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iApp, aData, nWrite);
      }
      iApp += nWrite;
    }

    /* If required, allocate a new block and write the rest of the data
    ** into it. Set the next and previous block pointers to link the new
    ** block to the old.  */
    assert( nRem<=0 || (iApp-1)==iLastOnBlock );
    if( rc==LSM_OK && (iApp-1)==iLastOnBlock ){
      u8 aPtr[4];                 /* Space to serialize a u32 */
      int iBlk;                   /* New block number */

      if( nWrite>0 ){
        /* Allocate a new block. */
        rc = lsmBlockAllocate(pFS->pDb, 0, &iBlk);

        /* Set the "next" pointer on the old block */
        if( rc==LSM_OK ){
          /* iApp now addresses the final 4 bytes of the old block, which is
          ** where the next-block number is written. */
          assert( iApp==(fsPageToBlock(pFS, iApp)*pFS->nBlocksize)-4 );
          lsmPutU32(aPtr, iBlk);
          rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iApp, aPtr, sizeof(aPtr));
        }

        /* Set the "prev" pointer on the new block */
        if( rc==LSM_OK ){
          Pgno iWrite;
          lsmPutU32(aPtr, fsPageToBlock(pFS, iApp));
          iWrite = fsFirstPageOnBlock(pFS, iBlk);
          /* The previous-block number occupies the 4 bytes immediately
          ** before the first usable offset of the new block. */
          rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iWrite-4, aPtr, sizeof(aPtr));
          /* Only move the append point if there is data left to write. If
          ** not, iApp stays put so that iLastPg ends up as the last offset
          ** of the old block. */
          if( nRem>0 ) iApp = iWrite;
        }
      }else{
        /* The next block is already allocated. */
        assert( nRem>0 );
        assert( pSeg->pRedirect==0 );
        /* NOTE(review): iBlk is used below even if fsBlockNext() fails.
        ** Whether it is defined in that case depends on fsBlockNext() -
        ** confirm. */
        rc = fsBlockNext(pFS, 0, fsPageToBlock(pFS, iApp), &iBlk);
        /* Nothing was written to the old block, so the value returned to
        ** the caller is the start of the next block. */
        iRet = iApp = fsFirstPageOnBlock(pFS, iBlk);
      }

      /* Write the remaining data into the new block */
      if( rc==LSM_OK && nRem>0 ){
        rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iApp, &aData[nWrite], nRem);
        iApp += nRem;
      }
    }

    /* Record the offset of the last byte written as the end of the segment
    ** and hand the error code back to the caller. */
    pSeg->iLastPg = iApp-1;
    *pRc = rc;
  }

  return iRet;
}
  2549  
  2550  /*
  2551  ** This function is only called in compressed database mode. It 
  2552  ** compresses the contents of page pPg and writes the result to the 
  2553  ** buffer at pFS->aOBuffer. The size of the compressed data is stored in
  2554  ** pPg->nCompress.
  2555  **
  2556  ** If buffer pFS->aOBuffer[] has not been allocated then this function
  2557  ** allocates it. If this fails, LSM_NOMEM is returned. Otherwise, LSM_OK.
  2558  */
  2559  static int fsCompressIntoBuffer(FileSystem *pFS, Page *pPg){
  2560    lsm_compress *p = pFS->pCompress;
  2561  
  2562    if( fsAllocateBuffer(pFS, 1) ) return LSM_NOMEM;
  2563    assert( pPg->nData==pFS->nPagesize );
  2564  
  2565    pPg->nCompress = pFS->nBuffer;
  2566    return p->xCompress(p->pCtx, 
  2567        (char *)pFS->aOBuffer, &pPg->nCompress, 
  2568        (const char *)pPg->aData, pPg->nData
  2569    );
  2570  }
  2571  
  2572  /*
  2573  ** Append a new page to segment pSeg. Set output variable *piNew to the
  2574  ** page number of the new page before returning.
  2575  **
  2576  ** If the new page is the last on its block, then the 'next' block that
  2577  ** will be used by the segment is allocated here too. In this case output
  2578  ** variable *piNext is set to the block number of the next block.
  2579  **
  2580  ** If the new page is the first on its block but not the first in the
  2581  ** entire segment, set output variable *piPrev to the block number of
  2582  ** the previous block in the segment.
  2583  **
  2584  ** LSM_OK is returned if successful, or an lsm error code otherwise. If
  2585  ** any value other than LSM_OK is returned, then the final value of all
  2586  ** output variables is undefined.
  2587  */
  2588  static int fsAppendPage(
  2589    FileSystem *pFS, 
  2590    Segment *pSeg,
  2591    Pgno *piNew,
  2592    int *piPrev,
  2593    int *piNext
  2594  ){
  2595    Pgno iPrev = pSeg->iLastPg;
  2596    int rc;
  2597    assert( iPrev!=0 );
  2598  
  2599    *piPrev = 0;
  2600    *piNext = 0;
  2601  
  2602    if( fsIsLast(pFS, iPrev) ){
  2603      /* Grab the first page on the next block (which has already be
  2604      ** allocated). In this case set *piPrev to tell the caller to set
  2605      ** the "previous block" pointer in the first 4 bytes of the page.
  2606      */
  2607      int iNext;
  2608      int iBlk = fsPageToBlock(pFS, iPrev);
  2609      assert( pSeg->pRedirect==0 );
  2610      rc = fsBlockNext(pFS, 0, iBlk, &iNext);
  2611      if( rc!=LSM_OK ) return rc;
  2612      *piNew = fsFirstPageOnBlock(pFS, iNext);
  2613      *piPrev = iBlk;
  2614    }else{
  2615      *piNew = iPrev+1;
  2616      if( fsIsLast(pFS, *piNew) ){
  2617        /* Allocate the next block here. */
  2618        int iBlk;
  2619        rc = lsmBlockAllocate(pFS->pDb, 0, &iBlk);
  2620        if( rc!=LSM_OK ) return rc;
  2621        *piNext = iBlk;
  2622      }
  2623    }
  2624  
  2625    pSeg->nSize++;
  2626    pSeg->iLastPg = *piNew;
  2627    return LSM_OK;
  2628  }
  2629  
  2630  /*
  2631  ** Flush all pages in the FileSystem.pWaiting list to disk.
  2632  */
  2633  void lsmFsFlushWaiting(FileSystem *pFS, int *pRc){
  2634    int rc = *pRc;
  2635    Page *pPg;
  2636  
  2637    pPg = pFS->pWaiting;
  2638    pFS->pWaiting = 0;
  2639  
  2640    while( pPg ){
  2641      Page *pNext = pPg->pWaitingNext;
  2642      if( rc==LSM_OK ) rc = lsmFsPagePersist(pPg);
  2643      assert( pPg->nRef==1 );
  2644      lsmFsPageRelease(pPg);
  2645      pPg = pNext;
  2646    }
  2647    *pRc = rc;
  2648  }
  2649  
  2650  /*
  2651  ** If there exists a hash-table entry associated with page iPg, remove it.
  2652  */
  2653  static void fsRemoveHashEntry(FileSystem *pFS, Pgno iPg){
  2654    Page *p;
  2655    int iHash = fsHashKey(pFS->nHash, iPg);
  2656  
  2657    for(p=pFS->apHash[iHash]; p && p->iPg!=iPg; p=p->pHashNext);
  2658  
  2659    if( p ){
  2660      assert( p->nRef==0 || (p->flags & PAGE_FREE)==0 );
  2661      fsPageRemoveFromHash(pFS, p);
  2662      p->iPg = 0;
  2663      iHash = fsHashKey(pFS->nHash, 0);
  2664      p->pHashNext = pFS->apHash[iHash];
  2665      pFS->apHash[iHash] = p;
  2666    }
  2667  }
  2668  
/*
** If the page passed as an argument is dirty, update the database file
** (or mapping of the database file) with its current contents and mark
** the page as clean.
**
** If pPg is NULL, or does not have the PAGE_DIRTY flag set, this function
** is a no-op. Otherwise, one of three things happens:
**
**   * Compressed mode: the page image is compressed and appended to the
**     page's segment as a (size, data, size) record. The offset returned
**     for the first append becomes the page number.
**
**   * Uncompressed mode, no page number assigned yet: a page number is
**     assigned using fsAppendPage(), the page is hashed under it and then
**     queued on the FileSystem.pWaiting list. Nothing is written to disk
**     and the page remains dirty.
**
**   * Uncompressed mode, page number already assigned: the page image is
**     written to the file, or copied into the memory mapping, and any
**     pages on the pWaiting list are flushed.
**
** Return LSM_OK if the operation is a success, or an LSM error code
** otherwise.
*/
int lsmFsPagePersist(Page *pPg){
  int rc = LSM_OK;
  if( pPg && (pPg->flags & PAGE_DIRTY) ){
    FileSystem *pFS = pPg->pFS;

    if( pFS->pCompress ){
      int iHash;                  /* Hash key of assigned page number */
      u8 aSz[3];                  /* pPg->nCompress as a 24-bit big-endian */
      assert( pPg->pSeg && pPg->iPg==0 && pPg->nCompress==0 );

      /* Compress the page image. */
      rc = fsCompressIntoBuffer(pFS, pPg);

      /* Serialize the compressed size into buffer aSz[] */
      putRecordSize(aSz, pPg->nCompress, 0);

      /* Write the serialized page record into the database file. The
      ** record is the 3 byte size, the compressed image, then the 3 byte
      ** size again. Each fsAppendData() call does nothing if rc is already
      ** an error code. */
      pPg->iPg = fsAppendData(pFS, pPg->pSeg, aSz, sizeof(aSz), &rc);
      fsAppendData(pFS, pPg->pSeg, pFS->aOBuffer, pPg->nCompress, &rc);
      fsAppendData(pFS, pPg->pSeg, aSz, sizeof(aSz), &rc);

      /* Now that it has a page number, insert the page into the hash table */
      /* NOTE(review): everything from here to the end of this branch runs
      ** whatever the value of rc, so after a failure the page is still
      ** hashed, counted in nSize and marked clean. Confirm that callers
      ** discard the page in that case. */
      iHash = fsHashKey(pFS->nHash, pPg->iPg);
      pPg->pHashNext = pFS->apHash[iHash];
      pFS->apHash[iHash] = pPg;

      /* In this mode nSize is increased by the size of the record in
      ** bytes: two size fields plus the compressed image. */
      pPg->pSeg->nSize += (sizeof(aSz) * 2) + pPg->nCompress;

      pPg->flags &= ~PAGE_DIRTY;
      pFS->nWrite++;
    }else{

      if( pPg->iPg==0 ){
        /* No page number has been assigned yet. This occurs with pages used
        ** in the b-tree hierarchy. They were not assigned page numbers when
        ** they were created as doing so would cause this call to
        ** lsmFsPagePersist() to write an out-of-order page. Instead a page 
        ** number is assigned here so that the page data will be appended
        ** to the current segment.
        */
        Page **pp;
        int iPrev = 0;
        int iNext = 0;
        int iHash;

        assert( pPg->pSeg->iFirst );
        assert( pPg->flags & PAGE_FREE );
        assert( (pPg->flags & PAGE_HASPREV)==0 );
        assert( pPg->nData==pFS->nPagesize-4 );

        rc = fsAppendPage(pFS, pPg->pSeg, &pPg->iPg, &iPrev, &iNext);
        if( rc!=LSM_OK ) return rc;

        /* Evict any page already hashed under the new page number, then
        ** put this page at the head of that bucket. */
        assert( pPg->flags & PAGE_FREE );
        iHash = fsHashKey(pFS->nHash, pPg->iPg);
        fsRemoveHashEntry(pFS, pPg->iPg);
        pPg->pHashNext = pFS->apHash[iHash];
        pFS->apHash[iHash] = pPg;
        assert( pPg->pHashNext==0 || pPg->pHashNext->iPg!=pPg->iPg );

        if( iPrev ){
          /* First page on a block: shift the content up by 4 bytes and
          ** store the previous block number in the space opened up. */
          assert( iNext==0 );
          memmove(&pPg->aData[4], pPg->aData, pPg->nData);
          lsmPutU32(pPg->aData, iPrev);
          pPg->flags |= PAGE_HASPREV;
          pPg->aData += 4;
        }else if( iNext ){
          /* Last page on a block: store the next block number in the 4
          ** bytes that follow the nData bytes of content. */
          assert( iPrev==0 );
          lsmPutU32(&pPg->aData[pPg->nData], iNext);
        }else{
          /* Neither first nor last on its block: no block pointer is
          ** needed, so the page gets 4 more usable bytes. */
          int nData = pPg->nData;
          pPg->nData += 4;
          lsmSortedExpandBtreePage(pPg, nData);
        }

        /* Take a reference and append the page to the tail of the pWaiting
        ** list. The page is not written to disk here. */
        pPg->nRef++;
        for(pp=&pFS->pWaiting; *pp; pp=&(*pp)->pWaitingNext);
        *pp = pPg;
        assert( pPg->pWaitingNext==0 );

      }else{
        i64 iOff;                   /* Offset to write within database file */

        iOff = (i64)pFS->nPagesize * (i64)(pPg->iPg-1);
        if( fsMmapPage(pFS, pPg->iPg)==0 ){
          /* The flag bits are used directly as a byte offset here.
          ** NOTE(review): this relies on PAGE_HASPREV having the value 4,
          ** matching the "aData += 4" performed when the flag is set -
          ** confirm against the definition. */
          u8 *aData = pPg->aData - (pPg->flags & PAGE_HASPREV);
          rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iOff, aData, pFS->nPagesize);
        }else if( pPg->flags & PAGE_FREE ){
          /* The page image currently lives in a separately allocated
          ** buffer. Copy it into the mapping, free the buffer and point
          ** the page at the mapped copy instead. */
          fsGrowMapping(pFS, iOff + pFS->nPagesize, &rc);
          if( rc==LSM_OK ){
            u8 *aTo = &((u8 *)(pFS->pMap))[iOff];
            u8 *aFrom = pPg->aData - (pPg->flags & PAGE_HASPREV);
            memcpy(aTo, aFrom, pFS->nPagesize);
            lsmFree(pFS->pEnv, aFrom);
            pFS->nCacheAlloc--;
            pPg->aData = aTo + (pPg->flags & PAGE_HASPREV);
            pPg->flags &= ~PAGE_FREE;
            fsPageRemoveFromHash(pFS, pPg);
            pPg->pMappedNext = pFS->pMapped;
            pFS->pMapped = pPg;
          }
        }

        /* Flush any pages queued on the pWaiting list. The page is marked
        ** clean and counted as written whatever the value of rc. */
        lsmFsFlushWaiting(pFS, &rc);
        pPg->flags &= ~PAGE_DIRTY;
        pFS->nWrite++;
      }
    }
  }

  return rc;
}
  2789  
/*
** For non-compressed databases, this function is a no-op. For compressed
** databases, it adds a padding record to the segment passed as the third
** argument.
**
** The size of the padding records is selected so that the last byte 
** written is the last byte of a disk sector. This means that if a 
** snapshot is taken and checkpointed, subsequent worker processes will
** not write to any sector that contains checkpointed data.
**
** Two forms of padding are written, depending on the number of bytes
** (nPad) required:
**
**   * 6 or more bytes: a 3 byte size field, (nPad-6) zero bytes and then
**     the same 3 byte size field again.
**
**   * 1 to 5 bytes: nPad bytes, of which the first and the last are set
**     to nPad and any in between are zero.
**
** Parameter pSnapshot is not used by this function.
**
** Return LSM_OK if successful (including when no padding is needed), or
** an LSM error code reported by fsAppendData() otherwise.
*/
int lsmFsSortedPadding(
  FileSystem *pFS, 
  Snapshot *pSnapshot,
  Segment *pSeg
){
  int rc = LSM_OK;
  if( pFS->pCompress ){
    Pgno iLast2;                    /* Last byte of the sector holding iLast */
    Pgno iLast = pSeg->iLastPg;     /* Current last page of segment */
    int nPad;                       /* Bytes of padding required */
    u8 aSz[3];                      /* Serialized size of padding record */

    iLast2 = (1 + iLast/pFS->szSector) * pFS->szSector - 1;
    assert( fsPageToBlock(pFS, iLast)==fsPageToBlock(pFS, iLast2) );
    nPad = (int)(iLast2 - iLast);

    /* If the end of the sector lies beyond the last offset usable for data
    ** on this block, pad by 4 bytes less. NOTE(review): presumably these
    ** are the 4 bytes of the next-block pointer - confirm. */
    if( iLast2>fsLastPageOnPagesBlock(pFS, iLast) ){
      nPad -= 4;
    }
    assert( nPad>=0 );

    if( nPad>=6 ){
      /* nSize is increased by the full padding, including the 6 bytes
      ** used by the two size fields. */
      pSeg->nSize += nPad;
      nPad -= 6;
      putRecordSize(aSz, nPad, 1);
      fsAppendData(pFS, pSeg, aSz, sizeof(aSz), &rc);
      /* NOTE(review): assumes pFS->aOBuffer has been allocated and is at
      ** least nPad bytes in size - confirm. */
      memset(pFS->aOBuffer, 0, nPad);
      fsAppendData(pFS, pSeg, pFS->aOBuffer, nPad, &rc);
      fsAppendData(pFS, pSeg, aSz, sizeof(aSz), &rc);
    }else if( nPad>0 ){
      /* Too small for a full padding record. pSeg->nSize is not updated
      ** in this case. */
      u8 aBuf[5] = {0,0,0,0,0};
      aBuf[0] = (u8)nPad;
      aBuf[nPad-1] = (u8)nPad;
      fsAppendData(pFS, pSeg, aBuf, nPad, &rc);
    }

    /* Unless an error occurred, the segment now ends either on the last
    ** usable offset of its block or on the last byte of a sector. */
    assert( rc!=LSM_OK 
        || pSeg->iLastPg==fsLastPageOnPagesBlock(pFS, pSeg->iLastPg)
        || ((pSeg->iLastPg + 1) % pFS->szSector)==0
    );
  }

  return rc;
}
  2844  
  2845  
  2846  /*
  2847  ** Increment the reference count on the page object passed as the first
  2848  ** argument.
  2849  */
  2850  void lsmFsPageRef(Page *pPg){
  2851    if( pPg ){
  2852      pPg->nRef++;
  2853    }
  2854  }
  2855  
  2856  /*
  2857  ** Release a page-reference obtained using fsPageGet().
  2858  */
  2859  int lsmFsPageRelease(Page *pPg){
  2860    int rc = LSM_OK;
  2861    if( pPg ){
  2862      assert( pPg->nRef>0 );
  2863      pPg->nRef--;
  2864      if( pPg->nRef==0 ){
  2865        FileSystem *pFS = pPg->pFS;
  2866        rc = lsmFsPagePersist(pPg);
  2867        pFS->nOut--;
  2868  
  2869        assert( pPg->pFS->pCompress 
  2870             || fsIsFirst(pPg->pFS, pPg->iPg)==0 
  2871             || (pPg->flags & PAGE_HASPREV)
  2872        );
  2873        pPg->aData -= (pPg->flags & PAGE_HASPREV);
  2874        pPg->flags &= ~PAGE_HASPREV;
  2875  
  2876        if( (pPg->flags & PAGE_FREE)==0 ){
  2877          /* Removed from mapped list */
  2878          Page **pp;
  2879          for(pp=&pFS->pMapped; (*pp)!=pPg; pp=&(*pp)->pMappedNext);
  2880          *pp = pPg->pMappedNext;
  2881          pPg->pMappedNext = 0;
  2882  
  2883          /* Add to free list */
  2884          pPg->pFreeNext = pFS->pFree;
  2885          pFS->pFree = pPg;
  2886        }else{
  2887          fsPageAddToLru(pFS, pPg);
  2888        }
  2889      }
  2890    }
  2891  
  2892    return rc;
  2893  }
  2894  
  2895  /*
  2896  ** Return the total number of pages read from the database file.
  2897  */
  2898  int lsmFsNRead(FileSystem *pFS){ return pFS->nRead; }
  2899  
  2900  /*
  2901  ** Return the total number of pages written to the database file.
  2902  */
  2903  int lsmFsNWrite(FileSystem *pFS){ return pFS->nWrite; }
  2904  
  2905  /*
  2906  ** Return a copy of the environment pointer used by the file-system object.
  2907  */
  2908  lsm_env *lsmFsEnv(FileSystem *pFS){ 
  2909    return pFS->pEnv; 
  2910  }
  2911  
  2912  /*
  2913  ** Return a copy of the environment pointer used by the file-system object
  2914  ** to which this page belongs.
  2915  */
  2916  lsm_env *lsmPageEnv(Page *pPg) { 
  2917    return pPg->pFS->pEnv; 
  2918  }
  2919  
  2920  /*
  2921  ** Return a pointer to the file-system object associated with the Page
  2922  ** passed as the only argument.
  2923  */
  2924  FileSystem *lsmPageFS(Page *pPg){
  2925    return pPg->pFS;
  2926  }
  2927  
  2928  /*
  2929  ** Return the sector-size as reported by the log file handle.
  2930  */
  2931  int lsmFsSectorSize(FileSystem *pFS){
  2932    return pFS->szSector;
  2933  }
  2934  
  2935  /*
  2936  ** Helper function for lsmInfoArrayStructure().
  2937  */
  2938  static Segment *startsWith(Segment *pRun, Pgno iFirst){
  2939    return (iFirst==pRun->iFirst) ? pRun : 0;
  2940  }
  2941  
  2942  /*
  2943  ** Return the segment that starts with page iFirst, if any. If no such segment
  2944  ** can be found, return NULL.
  2945  */
  2946  static Segment *findSegment(Snapshot *pWorker, Pgno iFirst){
  2947    Level *pLvl;                    /* Used to iterate through db levels */
  2948    Segment *pSeg = 0;              /* Pointer to segment to return */
  2949  
  2950    for(pLvl=lsmDbSnapshotLevel(pWorker); pLvl && pSeg==0; pLvl=pLvl->pNext){
  2951      if( 0==(pSeg = startsWith(&pLvl->lhs, iFirst)) ){
  2952        int i;
  2953        for(i=0; i<pLvl->nRight; i++){
  2954          if( (pSeg = startsWith(&pLvl->aRhs[i], iFirst)) ) break;
  2955        }
  2956      }
  2957    }
  2958  
  2959    return pSeg;
  2960  }
  2961  
  2962  /*
  2963  ** This function implements the lsm_info(LSM_INFO_ARRAY_STRUCTURE) request.
  2964  ** If successful, *pzOut is set to point to a nul-terminated string 
  2965  ** containing the array structure and LSM_OK is returned. The caller should
  2966  ** eventually free the string using lsmFree().
  2967  **
  2968  ** If an error occurs, *pzOut is set to NULL and an LSM error code returned.
  2969  */
  2970  int lsmInfoArrayStructure(
  2971    lsm_db *pDb, 
  2972    int bBlock,                     /* True for block numbers only */
  2973    Pgno iFirst,
  2974    char **pzOut
  2975  ){
  2976    int rc = LSM_OK;
  2977    Snapshot *pWorker;              /* Worker snapshot */
  2978    Segment *pArray = 0;            /* Array to report on */
  2979    int bUnlock = 0;
  2980  
  2981    *pzOut = 0;
  2982    if( iFirst==0 ) return LSM_ERROR;
  2983  
  2984    /* Obtain the worker snapshot */
  2985    pWorker = pDb->pWorker;
  2986    if( !pWorker ){
  2987      rc = lsmBeginWork(pDb);
  2988      if( rc!=LSM_OK ) return rc;
  2989      pWorker = pDb->pWorker;
  2990      bUnlock = 1;
  2991    }
  2992  
  2993    /* Search for the array that starts on page iFirst */
  2994    pArray = findSegment(pWorker, iFirst);
  2995  
  2996    if( pArray==0 ){
  2997      /* Could not find the requested array. This is an error. */
  2998      rc = LSM_ERROR;
  2999    }else{
  3000      FileSystem *pFS = pDb->pFS;
  3001      LsmString str;
  3002      int iBlk;
  3003      int iLastBlk;
  3004     
  3005      iBlk = fsPageToBlock(pFS, pArray->iFirst);
  3006      iLastBlk = fsPageToBlock(pFS, pArray->iLastPg);
  3007  
  3008      lsmStringInit(&str, pDb->pEnv);
  3009      if( bBlock ){
  3010        lsmStringAppendf(&str, "%d", iBlk);
  3011        while( iBlk!=iLastBlk ){
  3012          fsBlockNext(pFS, pArray, iBlk, &iBlk);
  3013          lsmStringAppendf(&str, " %d", iBlk);
  3014        }
  3015      }else{
  3016        lsmStringAppendf(&str, "%d", pArray->iFirst);
  3017        while( iBlk!=iLastBlk ){
  3018          lsmStringAppendf(&str, " %d", fsLastPageOnBlock(pFS, iBlk));
  3019          fsBlockNext(pFS, pArray, iBlk, &iBlk);
  3020          lsmStringAppendf(&str, " %d", fsFirstPageOnBlock(pFS, iBlk));
  3021        }
  3022        lsmStringAppendf(&str, " %d", pArray->iLastPg);
  3023      }
  3024  
  3025      *pzOut = str.z;
  3026    }
  3027  
  3028    if( bUnlock ){
  3029      int rcwork = LSM_BUSY;
  3030      lsmFinishWork(pDb, 0, &rcwork);
  3031    }
  3032    return rc;
  3033  }
  3034  
  3035  int lsmFsSegmentContainsPg(
  3036    FileSystem *pFS, 
  3037    Segment *pSeg, 
  3038    Pgno iPg, 
  3039    int *pbRes
  3040  ){
  3041    Redirect *pRedir = pSeg->pRedirect;
  3042    int rc = LSM_OK;
  3043    int iBlk;
  3044    int iLastBlk;
  3045    int iPgBlock;                   /* Block containing page iPg */
  3046  
  3047    iPgBlock = fsPageToBlock(pFS, pSeg->iFirst);
  3048    iBlk = fsRedirectBlock(pRedir, fsPageToBlock(pFS, pSeg->iFirst));
  3049    iLastBlk = fsRedirectBlock(pRedir, fsPageToBlock(pFS, pSeg->iLastPg));
  3050  
  3051    while( iBlk!=iLastBlk && iBlk!=iPgBlock && rc==LSM_OK ){
  3052      rc = fsBlockNext(pFS, pSeg, iBlk, &iBlk);
  3053    }
  3054  
  3055    *pbRes = (iBlk==iPgBlock);
  3056    return rc;
  3057  }
  3058  
  3059  /*
  3060  ** This function implements the lsm_info(LSM_INFO_ARRAY_PAGES) request.
  3061  ** If successful, *pzOut is set to point to a nul-terminated string 
  3062  ** containing the array structure and LSM_OK is returned. The caller should
  3063  ** eventually free the string using lsmFree().
  3064  **
  3065  ** If an error occurs, *pzOut is set to NULL and an LSM error code returned.
  3066  */
  3067  int lsmInfoArrayPages(lsm_db *pDb, Pgno iFirst, char **pzOut){
  3068    int rc = LSM_OK;
  3069    Snapshot *pWorker;              /* Worker snapshot */
  3070    Segment *pSeg = 0;              /* Array to report on */
  3071    int bUnlock = 0;
  3072  
  3073    *pzOut = 0;
  3074    if( iFirst==0 ) return LSM_ERROR;
  3075  
  3076    /* Obtain the worker snapshot */
  3077    pWorker = pDb->pWorker;
  3078    if( !pWorker ){
  3079      rc = lsmBeginWork(pDb);
  3080      if( rc!=LSM_OK ) return rc;
  3081      pWorker = pDb->pWorker;
  3082      bUnlock = 1;
  3083    }
  3084  
  3085    /* Search for the array that starts on page iFirst */
  3086    pSeg = findSegment(pWorker, iFirst);
  3087  
  3088    if( pSeg==0 ){
  3089      /* Could not find the requested array. This is an error. */
  3090      rc = LSM_ERROR;
  3091    }else{
  3092      Page *pPg = 0;
  3093      FileSystem *pFS = pDb->pFS;
  3094      LsmString str;
  3095  
  3096      lsmStringInit(&str, pDb->pEnv);
  3097      rc = lsmFsDbPageGet(pFS, pSeg, iFirst, &pPg);
  3098      while( rc==LSM_OK && pPg ){
  3099        Page *pNext = 0;
  3100        lsmStringAppendf(&str, " %lld", lsmFsPageNumber(pPg));
  3101        rc = lsmFsDbPageNext(pSeg, pPg, 1, &pNext);
  3102        lsmFsPageRelease(pPg);
  3103        pPg = pNext;
  3104      }
  3105  
  3106      if( rc!=LSM_OK ){
  3107        lsmFree(pDb->pEnv, str.z);
  3108      }else{
  3109        *pzOut = str.z;
  3110      }
  3111    }
  3112  
  3113    if( bUnlock ){
  3114      int rcwork = LSM_BUSY;
  3115      lsmFinishWork(pDb, 0, &rcwork);
  3116    }
  3117    return rc;
  3118  }
  3119  
/*
** The following macros are used by the integrity-check code. Associated with
** each block in the database is an 8-bit bit mask (the entry in the aUsed[]
** array). As the integrity-check meanders through the database, it sets the
** following bits to indicate how each block is used. Block number N
** corresponds to array entry aUsed[N-1].
**
** INTEGRITY_CHECK_FIRST_PG:
**   First page of block is in use by sorted run.
**
** INTEGRITY_CHECK_LAST_PG:
**   Last page of block is in use by sorted run.
**
** INTEGRITY_CHECK_USED:
**   At least one page of the block is in use by a sorted run.
**
** INTEGRITY_CHECK_FREE:
**   The free block list contains an entry corresponding to this block.
**
** The first three flags are set by checkBlocks(). INTEGRITY_CHECK_FREE is
** set by checkFreelistCb(), which also asserts that no other flag is set
** for the block.
*/
#define INTEGRITY_CHECK_FIRST_PG 0x01
#define INTEGRITY_CHECK_LAST_PG  0x02
#define INTEGRITY_CHECK_USED     0x04
#define INTEGRITY_CHECK_FREE     0x08
  3142  
  3143  /*
  3144  ** Helper function for lsmFsIntegrityCheck()
  3145  */
  3146  static void checkBlocks(
  3147    FileSystem *pFS, 
  3148    Segment *pSeg,
  3149    int bExtra,                     /* If true, count the "next" block if any */
  3150    int nUsed,
  3151    u8 *aUsed
  3152  ){
  3153    if( pSeg ){
  3154      if( pSeg && pSeg->nSize>0 ){
  3155        int rc;
  3156        int iBlk;                   /* Current block (during iteration) */
  3157        int iLastBlk;               /* Last block of segment */
  3158        int iFirstBlk;              /* First block of segment */
  3159        int bLastIsLastOnBlock;     /* True iLast is the last on its block */
  3160  
  3161        assert( 0==fsSegmentRedirects(pFS, pSeg) );
  3162        iBlk = iFirstBlk = fsPageToBlock(pFS, pSeg->iFirst);
  3163        iLastBlk = fsPageToBlock(pFS, pSeg->iLastPg);
  3164  
  3165        bLastIsLastOnBlock = (fsLastPageOnBlock(pFS, iLastBlk)==pSeg->iLastPg);
  3166        assert( iBlk>0 );
  3167  
  3168        do {
  3169          /* iBlk is a part of this sorted run. */
  3170          aUsed[iBlk-1] |= INTEGRITY_CHECK_USED;
  3171  
  3172          /* If the first page of this block is also part of the segment,
  3173          ** set the flag to indicate that the first page of iBlk is in use.  
  3174          */
  3175          if( fsFirstPageOnBlock(pFS, iBlk)==pSeg->iFirst || iBlk!=iFirstBlk ){
  3176            assert( (aUsed[iBlk-1] & INTEGRITY_CHECK_FIRST_PG)==0 );
  3177            aUsed[iBlk-1] |= INTEGRITY_CHECK_FIRST_PG;
  3178          }
  3179  
  3180          /* Unless the sorted run finishes before the last page on this block, 
  3181          ** the last page of this block is also in use.  */
  3182          if( iBlk!=iLastBlk || bLastIsLastOnBlock ){
  3183            assert( (aUsed[iBlk-1] & INTEGRITY_CHECK_LAST_PG)==0 );
  3184            aUsed[iBlk-1] |= INTEGRITY_CHECK_LAST_PG;
  3185          }
  3186  
  3187          /* Special case. The sorted run being scanned is the output run of
  3188          ** a level currently undergoing an incremental merge. The sorted
  3189          ** run ends on the last page of iBlk, but the next block has already
  3190          ** been allocated. So mark it as in use as well.  */
  3191          if( iBlk==iLastBlk && bLastIsLastOnBlock && bExtra ){
  3192            int iExtra = 0;
  3193            rc = fsBlockNext(pFS, pSeg, iBlk, &iExtra);
  3194            assert( rc==LSM_OK );
  3195  
  3196            assert( aUsed[iExtra-1]==0 );
  3197            aUsed[iExtra-1] |= INTEGRITY_CHECK_USED;
  3198            aUsed[iExtra-1] |= INTEGRITY_CHECK_FIRST_PG;
  3199            aUsed[iExtra-1] |= INTEGRITY_CHECK_LAST_PG;
  3200          }
  3201  
  3202          /* Move on to the next block in the sorted run. Or set iBlk to zero
  3203          ** in order to break out of the loop if this was the last block in
  3204          ** the run.  */
  3205          if( iBlk==iLastBlk ){
  3206            iBlk = 0;
  3207          }else{
  3208            rc = fsBlockNext(pFS, pSeg, iBlk, &iBlk);
  3209            assert( rc==LSM_OK );
  3210          }
  3211        }while( iBlk );
  3212      }
  3213    }
  3214  }
  3215  
/*
** Context object passed (as the void* argument) to checkFreelistCb() by
** lsmFsIntegrityCheck().
*/
typedef struct CheckFreelistCtx CheckFreelistCtx;
struct CheckFreelistCtx {
  u8 *aUsed;                      /* Per-block INTEGRITY_CHECK_XXX masks */
  int nBlock;                     /* Number of entries in aUsed[] */
};
  3221  static int checkFreelistCb(void *pCtx, int iBlk, i64 iSnapshot){
  3222    CheckFreelistCtx *p = (CheckFreelistCtx *)pCtx;
  3223  
  3224    assert( iBlk>=1 );
  3225    assert( iBlk<=p->nBlock );
  3226    assert( p->aUsed[iBlk-1]==0 );
  3227    p->aUsed[iBlk-1] = INTEGRITY_CHECK_FREE;
  3228    return 0;
  3229  }
  3230  
  3231  /*
  3232  ** This function checks that all blocks in the database file are accounted
  3233  ** for. For each block, exactly one of the following must be true:
  3234  **
  3235  **   + the block is part of a sorted run, or
  3236  **   + the block is on the free-block list
  3237  **
  3238  ** This function also checks that there are no references to blocks with
  3239  ** out-of-range block numbers.
  3240  **
  3241  ** If no errors are found, non-zero is returned. If an error is found, an
  3242  ** assert() fails.
  3243  */
  3244  int lsmFsIntegrityCheck(lsm_db *pDb){
  3245    CheckFreelistCtx ctx;
  3246    FileSystem *pFS = pDb->pFS;
  3247    int i;
  3248    int rc;
  3249    Freelist freelist = {0, 0, 0};
  3250    u8 *aUsed;
  3251    Level *pLevel;
  3252    Snapshot *pWorker = pDb->pWorker;
  3253    int nBlock = pWorker->nBlock;
  3254  
  3255  #if 0 
  3256    static int nCall = 0;
  3257    nCall++;
  3258    printf("%d calls\n", nCall);
  3259  #endif
  3260  
  3261    aUsed = lsmMallocZero(pDb->pEnv, nBlock);
  3262    if( aUsed==0 ){
  3263      /* Malloc has failed. Since this function is only called within debug
  3264      ** builds, this probably means the user is running an OOM injection test.
  3265      ** Regardless, it will not be possible to run the integrity-check at this
  3266      ** time, so assume the database is Ok and return non-zero. */
  3267      return 1;
  3268    }
  3269  
  3270    for(pLevel=pWorker->pLevel; pLevel; pLevel=pLevel->pNext){
  3271      int j;
  3272      checkBlocks(pFS, &pLevel->lhs, (pLevel->nRight!=0), nBlock, aUsed);
  3273      for(j=0; j<pLevel->nRight; j++){
  3274        checkBlocks(pFS, &pLevel->aRhs[j], 0, nBlock, aUsed);
  3275      }
  3276    }
  3277  
  3278    /* Mark all blocks in the free-list as used */
  3279    ctx.aUsed = aUsed;
  3280    ctx.nBlock = nBlock;
  3281    rc = lsmWalkFreelist(pDb, 0, checkFreelistCb, (void *)&ctx);
  3282  
  3283    if( rc==LSM_OK ){
  3284      for(i=0; i<nBlock; i++) assert( aUsed[i]!=0 );
  3285    }
  3286  
  3287    lsmFree(pDb->pEnv, aUsed);
  3288    lsmFree(pDb->pEnv, freelist.aEntry);
  3289  
  3290    return 1;
  3291  }
  3292  
  3293  #ifndef NDEBUG
  3294  /*
  3295  ** Return true if pPg happens to be the last page in segment pSeg. Or false
  3296  ** otherwise. This function is only invoked as part of assert() conditions.
  3297  */
  3298  int lsmFsDbPageIsLast(Segment *pSeg, Page *pPg){
  3299    if( pPg->pFS->pCompress ){
  3300      Pgno iNext = 0;
  3301      int rc;
  3302      rc = fsNextPageOffset(pPg->pFS, pSeg, pPg->iPg, pPg->nCompress+6, &iNext);
  3303      return (rc!=LSM_OK || iNext==0);
  3304    }
  3305    return (pPg->iPg==pSeg->iLastPg);
  3306  }
  3307  #endif