modernc.org/cc@v1.0.1/v2/testdata/_sqlite/ext/lsm1/lsm_shared.c (about)

     1  /*
     2  ** 2012-01-23
     3  **
     4  ** The author disclaims copyright to this source code.  In place of
     5  ** a legal notice, here is a blessing:
     6  **
     7  **    May you do good and not evil.
     8  **    May you find forgiveness for yourself and forgive others.
     9  **    May you share freely, never taking more than you give.
    10  **
    11  *************************************************************************
    12  **
    13  ** Utilities used to help multiple LSM clients to coexist within the
    14  ** same process space.
    15  */
    16  #include "lsmInt.h"
    17  
    18  /*
    19  ** Global data. All global variables used by code in this file are grouped
    20  ** into the following structure instance.
    21  **
    22  ** pDatabase:
    23  **   Linked list of all Database objects allocated within this process.
    24  **   This list may not be traversed without holding the global mutex (see
    25  **   functions enterGlobalMutex() and leaveGlobalMutex()).
    26  */
static struct SharedData {
  Database *pDatabase;            /* Linked list of all Database objects */
} gShared;                        /* The single process-wide instance */
    30  
    31  /*
    32  ** Database structure. There is one such structure for each distinct 
    33  ** database accessed by this process. They are stored in the singly linked 
    34  ** list starting at global variable gShared.pDatabase. Database objects are 
    35  ** reference counted. Once the number of connections to the associated
    36  ** database drops to zero, they are removed from the linked list and deleted.
    37  **
    38  ** pFile:
    39  **   In multi-process mode, this file descriptor is used to obtain locks 
    40  **   and to access shared-memory. In single process mode, its only job is
    41  **   to hold the exclusive lock on the file.
    42  **   
    43  */
struct Database {
  /* Protected by the global mutex (enterGlobalMutex/leaveGlobalMutex): */
  char *zName;                    /* Canonical path to database file */
  int nName;                      /* strlen(zName) */
  int nDbRef;                     /* Number of associated lsm_db handles */
  Database *pDbNext;              /* Next Database structure in global list */

  /* Protected by the local mutex (pClientMutex) */
  int bReadonly;                  /* True if Database.pFile is read-only */
  int bMultiProc;                 /* True if running in multi-process mode */
  lsm_file *pFile;                /* Used for locks/shm in multi-proc mode */
  LsmFile *pLsmFile;              /* List of deferred closes */
  lsm_mutex *pClientMutex;        /* Protects the apShmChunk[] and pConn */
  int nShmChunk;                  /* Number of entries in apShmChunk[] array */
  void **apShmChunk;              /* Array of "shared" memory regions */
  lsm_db *pConn;                  /* List of connections to this db. */
};
    61  
    62  /*
    63  ** Functions to enter and leave the global mutex. This mutex is used
    64  ** to protect the global linked-list headed at gShared.pDatabase.
    65  */
    66  static int enterGlobalMutex(lsm_env *pEnv){
    67    lsm_mutex *p;
    68    int rc = lsmMutexStatic(pEnv, LSM_MUTEX_GLOBAL, &p);
    69    if( rc==LSM_OK ) lsmMutexEnter(pEnv, p);
    70    return rc;
    71  }
    72  static void leaveGlobalMutex(lsm_env *pEnv){
    73    lsm_mutex *p;
    74    lsmMutexStatic(pEnv, LSM_MUTEX_GLOBAL, &p);
    75    lsmMutexLeave(pEnv, p);
    76  }
    77  
    78  #ifdef LSM_DEBUG
    79  static int holdingGlobalMutex(lsm_env *pEnv){
    80    lsm_mutex *p;
    81    lsmMutexStatic(pEnv, LSM_MUTEX_GLOBAL, &p);
    82    return lsmMutexHeld(pEnv, p);
    83  }
    84  #endif
    85  
#if 0
/* Debug helper: assert that block iBlk does not appear anywhere in the
** free-list p. Currently compiled out; the macro below makes call sites
** no-ops. */
static void assertNotInFreelist(Freelist *p, int iBlk){
  int i; 
  for(i=0; i<p->nEntry; i++){
    assert( p->aEntry[i].iBlk!=iBlk );
  }
}
#else
# define assertNotInFreelist(x,y)
#endif
    96  
/*
** Append an entry to the free-list. If (iId==-1), this is a delete.
**
** The target list is either db->pFreelist (if db->bUseFreelist is set) or
** the worker snapshot's free-list. Entries are kept sorted in ascending
** order of block number; if an entry for iBlk already exists its iId is
** simply overwritten, otherwise a new entry is inserted in place.
**
** Returns LSM_OK on success, or LSM_NOMEM_BKPT if growing the entry
** array fails.
*/
int freelistAppend(lsm_db *db, u32 iBlk, i64 iId){
  lsm_env *pEnv = db->pEnv;
  Freelist *p;
  int i; 

  assert( iId==-1 || iId>=0 );
  p = db->bUseFreelist ? db->pFreelist : &db->pWorker->freelist;

  /* Extend the space allocated for the freelist, if required */
  assert( p->nAlloc>=p->nEntry );
  if( p->nAlloc==p->nEntry ){
    int nNew; 
    int nByte; 
    FreelistEntry *aNew;

    /* Double the allocation (minimum 4 entries). */
    nNew = (p->nAlloc==0 ? 4 : p->nAlloc*2);
    nByte = sizeof(FreelistEntry) * nNew;
    aNew = (FreelistEntry *)lsmRealloc(pEnv, p->aEntry, nByte);
    if( !aNew ) return LSM_NOMEM_BKPT;
    p->nAlloc = nNew;
    p->aEntry = aNew;
  }

  /* Find the insertion point. The assert() verifies the list is sorted. */
  for(i=0; i<p->nEntry; i++){
    assert( i==0 || p->aEntry[i].iBlk > p->aEntry[i-1].iBlk );
    if( p->aEntry[i].iBlk>=iBlk ) break;
  }

  if( i<p->nEntry && p->aEntry[i].iBlk==iBlk ){
    /* Clobber an existing entry */
    p->aEntry[i].iId = iId;
  }else{
    /* Insert a new entry into the list. memmove() shifts entries i..nEntry-1
    ** up one slot; the space is guaranteed by the reallocation above. */
    int nByte = sizeof(FreelistEntry)*(p->nEntry-i);
    memmove(&p->aEntry[i+1], &p->aEntry[i], nByte);
    p->aEntry[i].iBlk = iBlk;
    p->aEntry[i].iId = iId;
    p->nEntry++;
  }

  return LSM_OK;
}
   142  
   143  /*
   144  ** This function frees all resources held by the Database structure passed
   145  ** as the only argument.
   146  */
   147  static void freeDatabase(lsm_env *pEnv, Database *p){
   148    assert( holdingGlobalMutex(pEnv) );
   149    if( p ){
   150      /* Free the mutexes */
   151      lsmMutexDel(pEnv, p->pClientMutex);
   152  
   153      if( p->pFile ){
   154        lsmEnvClose(pEnv, p->pFile);
   155      }
   156  
   157      /* Free the array of shm pointers */
   158      lsmFree(pEnv, p->apShmChunk);
   159  
   160      /* Free the memory allocated for the Database struct itself */
   161      lsmFree(pEnv, p);
   162    }
   163  }
   164  
/* Context object used by dbTruncateCb() when scanning the free-list to
** determine how far the database file may be truncated. */
typedef struct DbTruncateCtx DbTruncateCtx;
struct DbTruncateCtx {
  int nBlock;                     /* Candidate new last-block-in-db */
  i64 iInUse;                     /* Snapshot id still in use (or -1) */
};
   170  
   171  static int dbTruncateCb(void *pCtx, int iBlk, i64 iSnapshot){
   172    DbTruncateCtx *p = (DbTruncateCtx *)pCtx;
   173    if( iBlk!=p->nBlock || (p->iInUse>=0 && iSnapshot>=p->iInUse) ) return 1;
   174    p->nBlock--;
   175    return 0;
   176  }
   177  
/*
** Move any blocks that can be truncated off the end of the database onto
** the free-list and shrink Snapshot.nBlock. NOTE(review): the entire
** implementation is currently compiled out (#if 0), so this function is
** a no-op that always returns LSM_OK; both parameters are unused.
*/
static int dbTruncate(lsm_db *pDb, i64 iInUse){
  int rc = LSM_OK;
#if 0
  int i;
  DbTruncateCtx ctx;

  assert( pDb->pWorker );
  ctx.nBlock = pDb->pWorker->nBlock;
  ctx.iInUse = iInUse;

  rc = lsmWalkFreelist(pDb, 1, dbTruncateCb, (void *)&ctx);
  for(i=ctx.nBlock+1; rc==LSM_OK && i<=pDb->pWorker->nBlock; i++){
    rc = freelistAppend(pDb, i, -1);
  }

  if( rc==LSM_OK ){
#ifdef LSM_LOG_FREELIST
    if( ctx.nBlock!=pDb->pWorker->nBlock ){
      lsmLogMessage(pDb, 0, 
          "dbTruncate(): truncated db to %d blocks",ctx.nBlock
      );
    }
#endif
    pDb->pWorker->nBlock = ctx.nBlock;
  }
#endif
  return rc;
}
   206  
   207  
/*
** This function is called during database shutdown (when the number of
** connections drops from one to zero). It truncates the database file
** to as small a size as possible without truncating away any blocks that
** contain data.
**
** The caller must hold the exclusive DMS1 lock and must not already have
** a worker snapshot loaded (one is loaded and freed locally here).
*/
static int dbTruncateFile(lsm_db *pDb){
  int rc;

  assert( pDb->pWorker==0 );
  assert( lsmShmAssertLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL) );
  rc = lsmCheckpointLoadWorker(pDb);

  if( rc==LSM_OK ){
    DbTruncateCtx ctx;

    /* Walk the database free-block-list in reverse order. Set ctx.nBlock
    ** to the block number of the last block in the database that actually
    ** contains data. (iInUse==-1 disables the snapshot-in-use check.) */
    ctx.nBlock = pDb->pWorker->nBlock;
    ctx.iInUse = -1;
    rc = lsmWalkFreelist(pDb, 1, dbTruncateCb, (void *)&ctx);

    /* If the last block that contains data is not already the last block in
    ** the database file, truncate the database file so that it is. */
    if( rc==LSM_OK ){
      rc = lsmFsTruncateDb(
          pDb->pFS, (i64)ctx.nBlock*lsmFsBlockSize(pDb->pFS)
      );
    }
  }

  /* Discard the worker snapshot loaded above, even on error. */
  lsmFreeSnapshot(pDb->pEnv, pDb->pWorker);
  pDb->pWorker = 0;
  return rc;
}
   244  
/*
** Disconnect handle pDb from the shared database system. For a read-only
** handle this simply drops the shared DMS3 lock. For a read-write handle,
** if this turns out to be the last read-write connection, the in-memory
** tree is flushed, a checkpoint is written and - when no read-only
** clients exist - the log file is deleted and the db file truncated.
** In all cases pDb->pShmhdr is zeroed before returning.
*/
static void doDbDisconnect(lsm_db *pDb){
  int rc;

  if( pDb->bReadonly ){
    lsmShmLock(pDb, LSM_LOCK_DMS3, LSM_LOCK_UNLOCK, 0);
  }else{
    /* Block for an exclusive lock on DMS1. This lock serializes all calls
    ** to doDbConnect() and doDbDisconnect() across all processes.  */
    rc = lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL, 1);
    if( rc==LSM_OK ){

      lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_UNLOCK, 0);

      /* Try an exclusive lock on DMS2. If successful, this is the last
      ** connection to the database. In this case flush the contents of the
      ** in-memory tree to disk and write a checkpoint.  */
      rc = lsmShmTestLock(pDb, LSM_LOCK_DMS2, 1, LSM_LOCK_EXCL);
      if( rc==LSM_OK ){
        rc = lsmShmTestLock(pDb, LSM_LOCK_CHECKPOINTER, 1, LSM_LOCK_EXCL);
      }
      if( rc==LSM_OK ){
        int bReadonly = 0;        /* True if there exist read-only conns. */

        /* Flush the in-memory tree, if required. If there is data to flush,
        ** this will create a new client snapshot in Database.pClient. The
        ** checkpoint (serialization) of this snapshot may be written to disk
        ** by the following block.  
        **
        ** There is no need to take a WRITER lock here. That there are no 
        ** other locks on DMS2 guarantees that there are no other read-write
        ** connections at this time (and the lock on DMS1 guarantees that
        ** no new ones may appear).
        */
        rc = lsmTreeLoadHeader(pDb, 0);
        if( rc==LSM_OK && (lsmTreeHasOld(pDb) || lsmTreeSize(pDb)>0) ){
          rc = lsmFlushTreeToDisk(pDb);
        }

        /* Now check if there are any read-only connections. If there are,
        ** then do not truncate the db file or unlink the shared-memory 
        ** region.  */
        if( rc==LSM_OK ){
          rc = lsmShmTestLock(pDb, LSM_LOCK_DMS3, 1, LSM_LOCK_EXCL);
          if( rc==LSM_BUSY ){
            bReadonly = 1;
            rc = LSM_OK;
          }
        }

        /* Write a checkpoint to disk. */
        if( rc==LSM_OK ){
          rc = lsmCheckpointWrite(pDb, 0);
        }

        /* If the checkpoint was written successfully, delete the log file
        ** and, if possible, truncate the database file.  */
        if( rc==LSM_OK ){
          int bRotrans = 0;
          Database *p = pDb->pDatabase;

          /* The log file may only be deleted if there are no read-only
          ** clients running rotrans transactions.  */
          rc = lsmDetectRoTrans(pDb, &bRotrans);
          if( rc==LSM_OK && bRotrans==0 ){
            lsmFsCloseAndDeleteLog(pDb->pFS);
          }

          /* The database may only be truncated if there exist no read-only
          ** clients - either connected or running rotrans transactions. */
          if( bReadonly==0 && bRotrans==0 ){
            lsmFsUnmap(pDb->pFS);
            dbTruncateFile(pDb);
            if( p->pFile && p->bMultiProc ){
              lsmEnvShmUnmap(pDb->pEnv, p->pFile, 1);
            }
          }
        }
      }
    }

    /* Release this connection's RWCLIENT slot lock, if it holds one. */
    if( pDb->iRwclient>=0 ){
      lsmShmLock(pDb, LSM_LOCK_RWCLIENT(pDb->iRwclient), LSM_LOCK_UNLOCK, 0);
      pDb->iRwclient = -1;
    }

    lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0);
  }
  pDb->pShmhdr = 0;
}
   334  
/*
** Connect the read-write handle pDb to the shared database system: map the
** shared-memory header, run recovery if this is the very first connection,
** take the shared DMS2 lock and attempt to claim one of the RWCLIENT slot
** locks. Returns LSM_OK on success or an LSM error code; on failure
** pDb->pShmhdr is left zeroed.
*/
static int doDbConnect(lsm_db *pDb){
  const int nUsMax = 100000;      /* Max value for nUs */
  int nUs = 1000;                 /* us to wait between DMS1 attempts */
  int rc;

  /* Obtain a pointer to the shared-memory header */
  assert( pDb->pShmhdr==0 );
  assert( pDb->bReadonly==0 );

  /* Block for an exclusive lock on DMS1. This lock serializes all calls
  ** to doDbConnect() and doDbDisconnect() across all processes. The wait
  ** between attempts backs off exponentially, capped at nUsMax. */
  while( 1 ){
    rc = lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL, 1);
    if( rc!=LSM_BUSY ) break;
    lsmEnvSleep(pDb->pEnv, nUs);
    nUs = nUs * 2;
    if( nUs>nUsMax ) nUs = nUsMax;
  }
  if( rc==LSM_OK ){
    rc = lsmShmCacheChunks(pDb, 1);
  }
  if( rc!=LSM_OK ) return rc;
  pDb->pShmhdr = (ShmHeader *)pDb->apShm[0];

  /* Try an exclusive lock on DMS2/DMS3. If successful, this is the first 
  ** and only connection to the database. In this case initialize the 
  ** shared-memory and run log file recovery.  */
  assert( LSM_LOCK_DMS3==1+LSM_LOCK_DMS2 );
  rc = lsmShmTestLock(pDb, LSM_LOCK_DMS2, 2, LSM_LOCK_EXCL);
  if( rc==LSM_OK ){
    memset(pDb->pShmhdr, 0, sizeof(ShmHeader));
    rc = lsmCheckpointRecover(pDb);
    if( rc==LSM_OK ){
      rc = lsmLogRecover(pDb);
    }
    if( rc==LSM_OK ){
      ShmHeader *pShm = pDb->pShmhdr;
      pShm->aReader[0].iLsmId = lsmCheckpointId(pShm->aSnap1, 0);
      pShm->aReader[0].iTreeId = pDb->treehdr.iUsedShmid;
    }
  }else if( rc==LSM_BUSY ){
    /* Other connections exist - not an error. */
    rc = LSM_OK;
  }

  /* Take a shared lock on DMS2. In multi-process mode this lock "cannot" 
  ** fail, as connections may only hold an exclusive lock on DMS2 if they 
  ** first hold an exclusive lock on DMS1. And this connection is currently 
  ** holding the exclusive lock on DSM1. 
  **
  ** However, if some other connection has the database open in single-process
  ** mode, this operation will fail. In this case, return the error to the
  ** caller - the attempt to connect to the db has failed.
  */
  if( rc==LSM_OK ){
    rc = lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_SHARED, 0);
  }

  /* If anything went wrong, unlock DMS2. Otherwise, try to take an exclusive
  ** lock on one of the LSM_LOCK_RWCLIENT() locks. Unlock DMS1 in any case. */
  if( rc!=LSM_OK ){
    pDb->pShmhdr = 0;
  }else{
    int i;
    for(i=0; i<LSM_LOCK_NRWCLIENT; i++){
      int rc2 = lsmShmLock(pDb, LSM_LOCK_RWCLIENT(i), LSM_LOCK_EXCL, 0);
      if( rc2==LSM_OK ) pDb->iRwclient = i;
      if( rc2!=LSM_BUSY ){
        /* Either a slot was claimed (LSM_OK) or a real error occurred. */
        rc = rc2;
        break;
      }
    }
  }
  lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0);

  return rc;
}
   411  
   412  static int dbOpenSharedFd(lsm_env *pEnv, Database *p, int bRoOk){
   413    int rc;
   414  
   415    rc = lsmEnvOpen(pEnv, p->zName, 0, &p->pFile);
   416    if( rc==LSM_IOERR && bRoOk ){
   417      rc = lsmEnvOpen(pEnv, p->zName, LSM_OPEN_READONLY, &p->pFile);
   418      p->bReadonly = 1;
   419    }
   420  
   421    return rc;
   422  }
   423  
/*
** Return a reference to the shared Database handle for the database 
** identified by canonical path zName. If this is the first connection to
** the named database, a new Database object is allocated. Otherwise, a
** pointer to an existing object is returned.
**
** If successful, *ppDatabase is set to point to the shared Database 
** structure and LSM_OK returned. Otherwise, *ppDatabase is set to NULL
** and an LSM error code returned.
**
** Each successful call to this function should be (eventually) matched
** by a call to lsmDbDatabaseRelease().
*/
int lsmDbDatabaseConnect(
  lsm_db *pDb,                    /* Database handle */
  const char *zName               /* Full-path to db file */
){
  lsm_env *pEnv = pDb->pEnv;
  int rc;                         /* Return code */
  Database *p = 0;                /* Pointer returned via *ppDatabase */
  int nName = lsmStrlen(zName);

  assert( pDb->pDatabase==0 );
  rc = enterGlobalMutex(pEnv);
  if( rc==LSM_OK ){

    /* Search the global list for an existing object. TODO: Need something
    ** better than the memcmp() below to figure out if a given Database
    ** object represents the requested file.  */
    for(p=gShared.pDatabase; p; p=p->pDbNext){
      if( nName==p->nName && 0==memcmp(zName, p->zName, nName) ) break;
    }

    /* If no suitable Database object was found, allocate a new one. */
    if( p==0 ){
      /* zName is stored in the same allocation, directly after the struct. */
      p = (Database *)lsmMallocZeroRc(pEnv, sizeof(Database)+nName+1, &rc);

      /* If the allocation was successful, fill in other fields and
      ** allocate the client mutex. */ 
      if( rc==LSM_OK ){
        p->bMultiProc = pDb->bMultiProc;
        p->zName = (char *)&p[1];
        p->nName = nName;
        memcpy((void *)p->zName, zName, nName+1);
        rc = lsmMutexNew(pEnv, &p->pClientMutex);
      }

      /* If nothing has gone wrong so far, open the shared fd. And if that
      ** succeeds and this connection requested single-process mode, 
      ** attempt to take the exclusive lock on DMS2.  */
      if( rc==LSM_OK ){
        int bReadonly = (pDb->bReadonly && pDb->bMultiProc);
        rc = dbOpenSharedFd(pDb->pEnv, p, bReadonly);
      }

      if( rc==LSM_OK && p->bMultiProc==0 ){
        /* Hold an exclusive lock DMS1 while grabbing DMS2. This ensures
        ** that any ongoing call to doDbDisconnect() (even one in another
        ** process) is finished before proceeding.  */
        assert( p->bReadonly==0 );
        rc = lsmEnvLock(pDb->pEnv, p->pFile, LSM_LOCK_DMS1, LSM_LOCK_EXCL);
        if( rc==LSM_OK ){
          rc = lsmEnvLock(pDb->pEnv, p->pFile, LSM_LOCK_DMS2, LSM_LOCK_EXCL);
          lsmEnvLock(pDb->pEnv, p->pFile, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK);
        }
      }

      /* On success link the new object into the global list; otherwise
      ** discard it entirely. */
      if( rc==LSM_OK ){
        p->pDbNext = gShared.pDatabase;
        gShared.pDatabase = p;
      }else{
        freeDatabase(pEnv, p);
        p = 0;
      }
    }

    if( p ){
      p->nDbRef++;
    }
    leaveGlobalMutex(pEnv);

    /* Add pDb to the object's list of connections (local mutex). */
    if( p ){
      lsmMutexEnter(pDb->pEnv, p->pClientMutex);
      pDb->pNext = p->pConn;
      p->pConn = pDb;
      lsmMutexLeave(pDb->pEnv, p->pClientMutex);
    }
  }

  pDb->pDatabase = p;
  if( rc==LSM_OK ){
    assert( p );
    rc = lsmFsOpen(pDb, zName, p->bReadonly);
  }

  /* If the db handle is read-write, then connect to the system now. Run
  ** recovery as necessary. Or, if this is a read-only database handle,
  ** defer attempting to connect to the system until a read-transaction
  ** is opened.  */
  if( pDb->bReadonly==0 ){
    if( rc==LSM_OK ){
      rc = lsmFsConfigure(pDb);
    }
    if( rc==LSM_OK ){
      rc = doDbConnect(pDb);
    }
  }

  return rc;
}
   534  
   535  static void dbDeferClose(lsm_db *pDb){
   536    if( pDb->pFS ){
   537      LsmFile *pLsmFile;
   538      Database *p = pDb->pDatabase;
   539      pLsmFile = lsmFsDeferClose(pDb->pFS);
   540      pLsmFile->pNext = p->pLsmFile;
   541      p->pLsmFile = pLsmFile;
   542    }
   543  }
   544  
   545  LsmFile *lsmDbRecycleFd(lsm_db *db){
   546    LsmFile *pRet;
   547    Database *p = db->pDatabase;
   548    lsmMutexEnter(db->pEnv, p->pClientMutex);
   549    if( (pRet = p->pLsmFile)!=0 ){
   550      p->pLsmFile = pRet->pNext;
   551    }
   552    lsmMutexLeave(db->pEnv, p->pClientMutex);
   553    return pRet;
   554  }
   555  
/*
** Release a reference to a Database object obtained from 
** lsmDbDatabaseConnect(). There should be exactly one call to this function 
** for each successful call to Find().
**
** When the last reference is released the Database object is unlinked from
** the global list, all deferred file descriptors are closed, heap-backed
** shm chunks are freed and the object itself is destroyed.
*/
void lsmDbDatabaseRelease(lsm_db *pDb){
  Database *p = pDb->pDatabase;
  if( p ){
    lsm_db **ppDb;

    /* Disconnect from the shared system first, if still connected. */
    if( pDb->pShmhdr ){
      doDbDisconnect(pDb);
    }

    lsmFsUnmap(pDb->pFS);
    lsmMutexEnter(pDb->pEnv, p->pClientMutex);
    /* Unlink pDb from the connection list (pDb is assumed present). */
    for(ppDb=&p->pConn; *ppDb!=pDb; ppDb=&((*ppDb)->pNext));
    *ppDb = pDb->pNext;
    dbDeferClose(pDb);
    lsmMutexLeave(pDb->pEnv, p->pClientMutex);

    enterGlobalMutex(pDb->pEnv);
    p->nDbRef--;
    if( p->nDbRef==0 ){
      LsmFile *pIter;
      LsmFile *pNext;
      Database **pp;

      /* Remove the Database structure from the linked list. */
      for(pp=&gShared.pDatabase; *pp!=p; pp=&((*pp)->pDbNext));
      *pp = p->pDbNext;

      /* If they were allocated from the heap, free the shared memory chunks */
      if( p->bMultiProc==0 ){
        int i;
        for(i=0; i<p->nShmChunk; i++){
          lsmFree(pDb->pEnv, p->apShmChunk[i]);
        }
      }

      /* Close any outstanding file descriptors */
      for(pIter=p->pLsmFile; pIter; pIter=pNext){
        pNext = pIter->pNext;
        lsmEnvClose(pDb->pEnv, pIter->pFile);
        lsmFree(pDb->pEnv, pIter);
      }
      freeDatabase(pDb->pEnv, p);
    }
    leaveGlobalMutex(pDb->pEnv);
  }
}
   607  
/* Return the head of the Level list stored in snapshot pSnapshot. */
Level *lsmDbSnapshotLevel(Snapshot *pSnapshot){
  return pSnapshot->pLevel;
}
   611  
/* Set the head of the Level list of snapshot pSnap to pLevel. */
void lsmDbSnapshotSetLevel(Snapshot *pSnap, Level *pLevel){
  pSnap->pLevel = pLevel;
}
   615  
   616  /* TODO: Shuffle things around to get rid of this */
   617  static int firstSnapshotInUse(lsm_db *, i64 *);
   618  
/* 
** Context object used by the lsmWalkFreelist() utility. 
**
** Two of these are chained together by lsmWalkFreelist() so that each
** stage merges one in-memory free-list (pFreelist) into the stream of
** entries passed to its xUsr callback.
*/
typedef struct WalkFreelistCtx WalkFreelistCtx;
struct WalkFreelistCtx {
  lsm_db *pDb;                    /* Database handle */
  int bReverse;                   /* True to iterate in descending order */
  Freelist *pFreelist;            /* In-memory list merged by this stage */
  int iFree;                      /* Next index in pFreelist->aEntry[] */
  int (*xUsr)(void *, int, i64);  /* User callback function */
  void *pUsrctx;                  /* User callback context */
  int bDone;                      /* Set to true after xUsr() returns true */
};
   632  
/* 
** Callback used by lsmWalkFreelist().
**
** Merges the entries of p->pFreelist into the stream of (iBlk, iSnapshot)
** pairs being visited, preserving the overall block-number ordering, and
** forwards each entry to the user callback p->xUsr. An in-memory entry
** for the same block number shadows the incoming one. Returns 1 (and sets
** p->bDone) as soon as the user callback returns non-zero.
*/
static int walkFreelistCb(void *pCtx, int iBlk, i64 iSnapshot){
  WalkFreelistCtx *p = (WalkFreelistCtx *)pCtx;
  const int iDir = (p->bReverse ? -1 : 1);
  Freelist *pFree = p->pFreelist;

  assert( p->bDone==0 );
  assert( iBlk>=0 );
  if( pFree ){
    /* Emit all in-memory entries that sort before (or equal to) iBlk. */
    while( (p->iFree < pFree->nEntry) && p->iFree>=0 ){
      FreelistEntry *pEntry = &pFree->aEntry[p->iFree];
      if( (p->bReverse==0 && pEntry->iBlk>(u32)iBlk)
       || (p->bReverse!=0 && pEntry->iBlk<(u32)iBlk)
      ){
        break;
      }else{
        p->iFree += iDir;
        /* Entries with iId<0 are deletes - skip them. */
        if( pEntry->iId>=0 
            && p->xUsr(p->pUsrctx, pEntry->iBlk, pEntry->iId) 
          ){
          p->bDone = 1;
          return 1;
        }
        /* The in-memory entry shadows the incoming one for this block. */
        if( pEntry->iBlk==(u32)iBlk ) return 0;
      }
    }
  }

  if( p->xUsr(p->pUsrctx, iBlk, iSnapshot) ){
    p->bDone = 1;
    return 1;
  }
  return 0;
}
   669  
/*
** The database handle passed as the first argument must be the worker
** connection. This function iterates through the contents of the current
** free block list, invoking the supplied callback once for each list
** element.
**
** The difference between this function and lsmSortedWalkFreelist() is
** that lsmSortedWalkFreelist() only considers those free-list elements
** stored within the LSM. This function also merges in any in-memory 
** elements.
**
** Implementation: two WalkFreelistCtx stages are chained so that
** ctx[0] merges the worker snapshot's free-list into the LSM stream and
** forwards to ctx[1], which merges pDb->pFreelist and finally calls x().
*/
int lsmWalkFreelist(
  lsm_db *pDb,                    /* Database handle (must be worker) */
  int bReverse,                   /* True to iterate from largest to smallest */
  int (*x)(void *, int, i64),     /* Callback function */
  void *pCtx                      /* First argument to pass to callback */
){
  const int iDir = (bReverse ? -1 : 1);
  int rc;
  int iCtx;

  WalkFreelistCtx ctx[2];

  /* Stage 0: merge the worker snapshot's in-memory free-list, forwarding
  ** into stage 1 via walkFreelistCb. */
  ctx[0].pDb = pDb;
  ctx[0].bReverse = bReverse;
  ctx[0].pFreelist = &pDb->pWorker->freelist;
  if( ctx[0].pFreelist && bReverse ){
    ctx[0].iFree = ctx[0].pFreelist->nEntry-1;
  }else{
    ctx[0].iFree = 0;
  }
  ctx[0].xUsr = walkFreelistCb;
  ctx[0].pUsrctx = (void *)&ctx[1];
  ctx[0].bDone = 0;

  /* Stage 1: merge pDb->pFreelist, forwarding to the user callback. */
  ctx[1].pDb = pDb;
  ctx[1].bReverse = bReverse;
  ctx[1].pFreelist = pDb->pFreelist;
  if( ctx[1].pFreelist && bReverse ){
    ctx[1].iFree = ctx[1].pFreelist->nEntry-1;
  }else{
    ctx[1].iFree = 0;
  }
  ctx[1].xUsr = x;
  ctx[1].pUsrctx = pCtx;
  ctx[1].bDone = 0;

  rc = lsmSortedWalkFreelist(pDb, bReverse, walkFreelistCb, (void *)&ctx[0]);

  /* Flush any in-memory entries not consumed during the LSM walk. */
  if( ctx[0].bDone==0 ){
    for(iCtx=0; iCtx<2; iCtx++){
      int i;
      WalkFreelistCtx *p = &ctx[iCtx];
      for(i=p->iFree; 
          p->pFreelist && rc==LSM_OK && i<p->pFreelist->nEntry && i>=0;
          i += iDir
         ){
        FreelistEntry *pEntry = &p->pFreelist->aEntry[i];
        if( pEntry->iId>=0 && p->xUsr(p->pUsrctx, pEntry->iBlk, pEntry->iId) ){
          return LSM_OK;
        }
      }
    }
  }

  return rc;
}
   737  
   738  
/* Context object used by findFreeblockCb() to locate a reusable block. */
typedef struct FindFreeblockCtx FindFreeblockCtx;
struct FindFreeblockCtx {
  i64 iInUse;                     /* Oldest snapshot id still in use */
  int iRet;                       /* Output: block number found (0 if none) */
  int bNotOne;                    /* True to never return block number 1 */
};
   745  
   746  static int findFreeblockCb(void *pCtx, int iBlk, i64 iSnapshot){
   747    FindFreeblockCtx *p = (FindFreeblockCtx *)pCtx;
   748    if( iSnapshot<p->iInUse && (iBlk!=1 || p->bNotOne==0) ){
   749      p->iRet = iBlk;
   750      return 1;
   751    }
   752    return 0;
   753  }
   754  
   755  static int findFreeblock(lsm_db *pDb, i64 iInUse, int bNotOne, int *piRet){
   756    int rc;                         /* Return code */
   757    FindFreeblockCtx ctx;           /* Context object */
   758  
   759    ctx.iInUse = iInUse;
   760    ctx.iRet = 0;
   761    ctx.bNotOne = bNotOne;
   762    rc = lsmWalkFreelist(pDb, 0, findFreeblockCb, (void *)&ctx);
   763    *piRet = ctx.iRet;
   764  
   765    return rc;
   766  }
   767  
/*
** Allocate a new database file block to write data to, either by extending
** the database file or by recycling a free-list entry. The worker snapshot 
** must be held in order to call this function.
**
** If iBefore is greater than zero, only blocks numbered strictly less than
** iBefore may be returned; *piBlk is set to 0 if no such block is found.
**
** If successful, *piBlk is set to the block number allocated and LSM_OK is
** returned. Otherwise, *piBlk is zeroed and an lsm error code returned.
*/
int lsmBlockAllocate(lsm_db *pDb, int iBefore, int *piBlk){
  Snapshot *p = pDb->pWorker;
  int iRet = 0;                   /* Block number of allocated block */
  int rc = LSM_OK;
  i64 iInUse = 0;                 /* Snapshot id still in use */
  i64 iSynced = 0;                /* Snapshot id synced to disk */

  assert( p );

#ifdef LSM_LOG_FREELIST
  {
    static int nCall = 0;
    char *zFree = 0;
    nCall++;
    rc = lsmInfoFreelist(pDb, &zFree);
    if( rc!=LSM_OK ) return rc;
    lsmLogMessage(pDb, 0, "lsmBlockAllocate(): %d freelist: %s", nCall, zFree);
    lsmFree(pDb->pEnv, zFree);
  }
#endif

  /* Set iInUse to the smallest snapshot id that is either:
  **
  **   * Currently in use by a database client,
  **   * May be used by a database client in the future, or
  **   * Is the most recently checkpointed snapshot (i.e. the one that will
  **     be used following recovery if a failure occurs at this point).
  */
  rc = lsmCheckpointSynced(pDb, &iSynced, 0, 0);
  if( rc==LSM_OK && iSynced==0 ) iSynced = p->iId;
  iInUse = iSynced;
  if( rc==LSM_OK && pDb->iReader>=0 ){
    assert( pDb->pClient );
    iInUse = LSM_MIN(iInUse, pDb->pClient->iId);
  }
  if( rc==LSM_OK ) rc = firstSnapshotInUse(pDb, &iInUse);

#ifdef LSM_LOG_FREELIST
  {
    lsmLogMessage(pDb, 0, "lsmBlockAllocate(): "
        "snapshot-in-use: %lld (iSynced=%lld) (client-id=%lld)", 
        iInUse, iSynced, (pDb->iReader>=0 ? pDb->pClient->iId : 0)
    );
  }
#endif


  /* Unless there exists a read-only transaction (which prevents us from
  ** recycling any blocks regardless), query the free block list for a 
  ** suitable block to reuse. 
  **
  ** It might seem more natural to check for a read-only transaction at
  ** the start of this function. However, it is better do wait until after
  ** the call to lsmCheckpointSynced() to do so.
  */
  if( rc==LSM_OK ){
    int bRotrans;
    rc = lsmDetectRoTrans(pDb, &bRotrans);

    if( rc==LSM_OK && bRotrans==0 ){
      rc = findFreeblock(pDb, iInUse, (iBefore>0), &iRet);
    }
  }

  if( iBefore>0 && (iRet<=0 || iRet>=iBefore) ){
    /* No recyclable block satisfies the iBefore constraint. */
    iRet = 0;

  }else if( rc==LSM_OK ){
    /* If a block was found in the free block list, use it and remove it from 
    ** the list. Otherwise, if no suitable block was found, allocate one from
    ** the end of the file.  */
    if( iRet>0 ){
#ifdef LSM_LOG_FREELIST
      lsmLogMessage(pDb, 0, 
          "reusing block %d (snapshot-in-use=%lld)", iRet, iInUse);
#endif
      /* A free-list entry with iId==-1 marks the block as removed. */
      rc = freelistAppend(pDb, iRet, -1);
      if( rc==LSM_OK ){
        rc = dbTruncate(pDb, iInUse);
      }
    }else{
      iRet = ++(p->nBlock);
#ifdef LSM_LOG_FREELIST
      lsmLogMessage(pDb, 0, "extending file to %d blocks", iRet);
#endif
    }
  }

  assert( iBefore>0 || iRet>0 || rc!=LSM_OK );
  *piBlk = iRet;
  return rc;
}
   868  
   869  /*
   870  ** Free a database block. The worker snapshot must be held in order to call 
   871  ** this function.
   872  **
   873  ** If successful, LSM_OK is returned. Otherwise, an lsm error code (e.g. 
   874  ** LSM_NOMEM).
   875  */
   876  int lsmBlockFree(lsm_db *pDb, int iBlk){
   877    Snapshot *p = pDb->pWorker;
   878    assert( lsmShmAssertWorker(pDb) );
   879  
   880  #ifdef LSM_LOG_FREELIST
   881    lsmLogMessage(pDb, LSM_OK, "lsmBlockFree(): Free block %d", iBlk);
   882  #endif
   883  
   884    return freelistAppend(pDb, iBlk, p->iId);
   885  }
   886  
   887  /*
   888  ** Refree a database block. The worker snapshot must be held in order to call 
   889  ** this function.
   890  **
   891  ** Refreeing is required when a block is allocated using lsmBlockAllocate()
   892  ** but then not used. This function is used to push the block back onto
** the freelist. Refreeing a block is different from freeing it, as a refreed
   894  ** block may be reused immediately. Whereas a freed block can not be reused 
   895  ** until (at least) after the next checkpoint.
   896  */
   897  int lsmBlockRefree(lsm_db *pDb, int iBlk){
   898    int rc = LSM_OK;                /* Return code */
   899  
   900  #ifdef LSM_LOG_FREELIST
   901    lsmLogMessage(pDb, LSM_OK, "lsmBlockRefree(): Refree block %d", iBlk);
   902  #endif
   903  
   904    rc = freelistAppend(pDb, iBlk, 0);
   905    return rc;
   906  }
   907  
   908  /*
   909  ** If required, copy a database checkpoint from shared memory into the
   910  ** database itself.
   911  **
   912  ** The WORKER lock must not be held when this is called. This is because
   913  ** this function may indirectly call fsync(). And the WORKER lock should
   914  ** not be held that long (in case it is required by a client flushing an
   915  ** in-memory tree to disk).
   916  */
int lsmCheckpointWrite(lsm_db *pDb, u32 *pnWrite){
  int rc;                         /* Return Code */
  u32 nWrite = 0;                 /* Pages written since the stored checkpoint */

  assert( pDb->pWorker==0 );
  assert( 1 || pDb->pClient==0 );
  assert( lsmShmAssertLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_UNLOCK) );

  /* Serialize checkpointers. If the lock cannot be obtained, return the
  ** error (no checkpoint is written in that case). */
  rc = lsmShmLock(pDb, LSM_LOCK_CHECKPOINTER, LSM_LOCK_EXCL, 0);
  if( rc!=LSM_OK ) return rc;

  rc = lsmCheckpointLoad(pDb, 0);
  if( rc==LSM_OK ){
    int nBlock = lsmCheckpointNBlock(pDb->aSnapshot);
    ShmHeader *pShm = pDb->pShmhdr;
    int bDone = 0;                /* True if checkpoint is already stored */

    /* Check if this checkpoint has already been written to the database
    ** file. If so, set variable bDone to true.  */
    if( pShm->iMetaPage ){
      MetaPage *pPg;              /* Meta page */
      u8 *aData;                  /* Meta-page data buffer */
      int nData;                  /* Size of aData[] in bytes */
      i64 iCkpt;                  /* Id of checkpoint just loaded */
      i64 iDisk = 0;              /* Id of checkpoint already stored in db */
      iCkpt = lsmCheckpointId(pDb->aSnapshot, 0);
      rc = lsmFsMetaPageGet(pDb->pFS, 0, pShm->iMetaPage, &pPg);
      if( rc==LSM_OK ){
        aData = lsmFsMetaPageData(pPg, &nData);
        iDisk = lsmCheckpointId((u32 *)aData, 1);
        nWrite = lsmCheckpointNWrite((u32 *)aData, 1);
        lsmFsMetaPageRelease(pPg);
      }
      /* Already stored if the on-disk checkpoint id is as new as ours. */
      bDone = (iDisk>=iCkpt);
    }

    if( rc==LSM_OK && bDone==0 ){
      /* Alternate between meta pages 1 and 2 on successive checkpoints. */
      int iMeta = (pShm->iMetaPage % 2) + 1;
      /* Sync database content before the checkpoint is stored, and the
      ** checkpoint itself afterwards - unless safety is set to OFF. */
      if( pDb->eSafety!=LSM_SAFETY_OFF ){
        rc = lsmFsSyncDb(pDb->pFS, nBlock);
      }
      if( rc==LSM_OK ) rc = lsmCheckpointStore(pDb, iMeta);
      if( rc==LSM_OK && pDb->eSafety!=LSM_SAFETY_OFF){
        rc = lsmFsSyncDb(pDb->pFS, 0);
      }
      if( rc==LSM_OK ){
        /* Publish the new meta page and compute the number of pages
        ** written since the previously stored checkpoint. */
        pShm->iMetaPage = iMeta;
        nWrite = lsmCheckpointNWrite(pDb->aSnapshot, 0) - nWrite;
      }
#ifdef LSM_LOG_WORK
      lsmLogMessage(pDb, 0, "finish checkpoint %d", 
          (int)lsmCheckpointId(pDb->aSnapshot, 0)
      );
#endif
    }
  }

  lsmShmLock(pDb, LSM_LOCK_CHECKPOINTER, LSM_LOCK_UNLOCK, 0);
  if( pnWrite && rc==LSM_OK ) *pnWrite = nWrite;
  return rc;
}
   978  
   979  int lsmBeginWork(lsm_db *pDb){
   980    int rc;
   981  
   982    /* Attempt to take the WORKER lock */
   983    rc = lsmShmLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL, 0);
   984  
   985    /* Deserialize the current worker snapshot */
   986    if( rc==LSM_OK ){
   987      rc = lsmCheckpointLoadWorker(pDb);
   988    }
   989    return rc;
   990  }
   991  
   992  void lsmFreeSnapshot(lsm_env *pEnv, Snapshot *p){
   993    if( p ){
   994      lsmSortedFreeLevel(pEnv, p->pLevel);
   995      lsmFree(pEnv, p->freelist.aEntry);
   996      lsmFree(pEnv, p->redirect.a);
   997      lsmFree(pEnv, p);
   998    }
   999  }
  1000  
  1001  /*
  1002  ** Attempt to populate one of the read-lock slots to contain lock values
  1003  ** iLsm/iShm. Or, if such a slot exists already, this function is a no-op.
  1004  **
  1005  ** It is not an error if no slot can be populated because the write-lock
  1006  ** cannot be obtained. If any other error occurs, return an LSM error code.
  1007  ** Otherwise, LSM_OK.
  1008  **
  1009  ** This function is called at various points to try to ensure that there
  1010  ** always exists at least one read-lock slot that can be used by a read-only
  1011  ** client. And so that, in the usual case, there is an "exact match" available
  1012  ** whenever a read transaction is opened by any client. At present this
  1013  ** function is called when:
  1014  **
  1015  **    * A write transaction that called lsmTreeDiscardOld() is committed, and
  1016  **    * Whenever the working snapshot is updated (i.e. lsmFinishWork()).
  1017  */
static int dbSetReadLock(lsm_db *db, i64 iLsm, u32 iShm){
  int rc = LSM_OK;
  ShmHeader *pShm = db->pShmhdr;
  int i;

  /* Check if there is already a slot containing the required values. */
  for(i=0; i<LSM_LOCK_NREADER; i++){
    ShmReader *p = &pShm->aReader[i];
    if( p->iLsmId==iLsm && p->iTreeId==iShm ) return LSM_OK;
  }

  /* Iterate through all read-lock slots, attempting to take a write-lock
  ** on each of them. If a write-lock succeeds, populate the locked slot
  ** with the required values and break out of the loop.  */
  for(i=0; rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
    rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_EXCL, 0);
    if( rc==LSM_BUSY ){
      /* Slot is held by some reader - not an error. Try the next one. */
      rc = LSM_OK;
    }else{
      /* Exclusive lock obtained, so no reader can be relying on this
      ** slot's current values. Overwrite them. (Any error code other
      ** than LSM_BUSY falls through here and is returned below.) */
      ShmReader *p = &pShm->aReader[i];
      p->iLsmId = iLsm;
      p->iTreeId = iShm;
      lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_UNLOCK, 0);
      break;
    }
  }

  return rc;
}
  1047  
  1048  /*
  1049  ** Release the read-lock currently held by connection db.
  1050  */
int dbReleaseReadlock(lsm_db *db){
  int rc = LSM_OK;
  if( db->iReader>=0 ){
    /* Drop the shared lock on the read-lock slot held by this connection,
    ** then mark the connection as having no open read transaction. */
    rc = lsmShmLock(db, LSM_LOCK_READER(db->iReader), LSM_LOCK_UNLOCK, 0);
    db->iReader = -1;
  }
  /* Clear the read-only transaction flag in either case. */
  db->bRoTrans = 0;
  return rc;
}
  1060  
  1061  
  1062  /*
  1063  ** Argument bFlush is true if the contents of the in-memory tree has just
  1064  ** been flushed to disk. The significance of this is that once the snapshot
  1065  ** created to hold the updated state of the database is synced to disk, log
  1066  ** file space can be recycled.
  1067  */
void lsmFinishWork(lsm_db *pDb, int bFlush, int *pRc){
  int rc = *pRc;
  /* Unless an error has already occurred, a worker snapshot must be open. */
  assert( rc!=0 || pDb->pWorker );
  if( pDb->pWorker ){
    /* If no error has occurred, serialize the worker snapshot and write
    ** it to shared memory.  */
    if( rc==LSM_OK ){
      rc = lsmSaveWorker(pDb, bFlush);
    }

    /* Assuming no error has occurred, update a read lock slot with the
    ** new snapshot id (see comments above function dbSetReadLock()).  */
    if( rc==LSM_OK ){
      if( pDb->iReader<0 ){
        /* No read transaction is open - load the tree header so that
        ** treehdr.iUsedShmid (used below) is current. */
        rc = lsmTreeLoadHeader(pDb, 0);
      }
      if( rc==LSM_OK ){
        rc = dbSetReadLock(pDb, pDb->pWorker->iId, pDb->treehdr.iUsedShmid);
      }
    }

    /* Free the snapshot object. */
    lsmFreeSnapshot(pDb->pEnv, pDb->pWorker);
    pDb->pWorker = 0;
  }

  /* Relinquish the WORKER lock taken by lsmBeginWork(). */
  lsmShmLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_UNLOCK, 0);
  *pRc = rc;
}
  1097  
  1098  /*
  1099  ** Called when recovery is finished.
  1100  */
int lsmFinishRecovery(lsm_db *pDb){
  /* Commit (second argument == 1) the tree transaction used by recovery. */
  lsmTreeEndTransaction(pDb, 1);
  return LSM_OK;
}
  1105  
  1106  /*
  1107  ** Check if the currently configured compression functions
  1108  ** (LSM_CONFIG_SET_COMPRESSION) are compatible with a database that has its
  1109  ** compression id set to iReq. Compression routines are compatible if iReq
  1110  ** is zero (indicating the database is empty), or if it is equal to the 
  1111  ** compression id of the configured compression routines.
  1112  **
  1113  ** If the check shows that the current compression are incompatible and there
  1114  ** is a compression factory registered, give it a chance to install new
  1115  ** compression routines.
  1116  **
  1117  ** If, after any registered factory is invoked, the compression functions
  1118  ** are still incompatible, return LSM_MISMATCH. Otherwise, LSM_OK.
  1119  */
  1120  int lsmCheckCompressionId(lsm_db *pDb, u32 iReq){
  1121    if( iReq!=LSM_COMPRESSION_EMPTY && pDb->compress.iId!=iReq ){
  1122      if( pDb->factory.xFactory ){
  1123        pDb->bInFactory = 1;
  1124        pDb->factory.xFactory(pDb->factory.pCtx, pDb, iReq);
  1125        pDb->bInFactory = 0;
  1126      }
  1127      if( pDb->compress.iId!=iReq ){
  1128        /* Incompatible */
  1129        return LSM_MISMATCH;
  1130      }
  1131    }
  1132    /* Compatible */
  1133    return LSM_OK;
  1134  }
  1135  
  1136  /*
  1137  ** Begin a read transaction. This function is a no-op if the connection
  1138  ** passed as the only argument already has an open read transaction.
  1139  */
int lsmBeginReadTrans(lsm_db *pDb){
  const int MAX_READLOCK_ATTEMPTS = 10;
  /* A disconnected read-only transaction gets a single attempt only. */
  const int nMaxAttempt = (pDb->bRoTrans ? 1 : MAX_READLOCK_ATTEMPTS);

  int rc = LSM_OK;                /* Return code */
  int iAttempt = 0;

  assert( pDb->pWorker==0 );

  while( rc==LSM_OK && pDb->iReader<0 && (iAttempt++)<nMaxAttempt ){
    int iTreehdr = 0;
    int iSnap = 0;
    assert( pDb->pCsr==0 && pDb->nTransOpen==0 );

    /* Load the in-memory tree header. */
    rc = lsmTreeLoadHeader(pDb, &iTreehdr);

    /* Load the database snapshot */
    if( rc==LSM_OK ){
      if( lsmCheckpointClientCacheOk(pDb)==0 ){
        /* Cached client snapshot is out of date. Discard it, along with
        ** any cached cursors and file-system pages, then reload. */
        lsmFreeSnapshot(pDb->pEnv, pDb->pClient);
        pDb->pClient = 0;
        lsmMCursorFreeCache(pDb);
        lsmFsPurgeCache(pDb->pFS);
        rc = lsmCheckpointLoad(pDb, &iSnap);
      }else{
        iSnap = 1;
      }
    }

    /* Take a read-lock on the tree and snapshot just loaded. Then check
    ** that the shared-memory still contains the same values. If so, proceed.
    ** Otherwise, relinquish the read-lock and retry the whole procedure
    ** (starting with loading the in-memory tree header).  */
    if( rc==LSM_OK ){
      u32 iShmMax = pDb->treehdr.iUsedShmid;
      u32 iShmMin = pDb->treehdr.iNextShmid+1-LSM_MAX_SHMCHUNKS;
      rc = lsmReadlock(
          pDb, lsmCheckpointId(pDb->aSnapshot, 0), iShmMin, iShmMax
      );
      if( rc==LSM_OK ){
        if( lsmTreeLoadHeaderOk(pDb, iTreehdr)
         && lsmCheckpointLoadOk(pDb, iSnap)
        ){
          /* Read lock has been successfully obtained. Deserialize the 
          ** checkpoint just loaded. TODO: This will be removed after 
          ** lsm_sorted.c is changed to work directly from the serialized
          ** version of the snapshot.  */
          if( pDb->pClient==0 ){
            rc = lsmCheckpointDeserialize(pDb, 0, pDb->aSnapshot,&pDb->pClient);
          }
          assert( (rc==LSM_OK)==(pDb->pClient!=0) );
          assert( pDb->iReader>=0 );

          /* Check that the client has the right compression hooks loaded.
          ** If not, set rc to LSM_MISMATCH.  */
          if( rc==LSM_OK ){
            rc = lsmCheckCompressionId(pDb, pDb->pClient->iCmpId);
          }
        }else{
          /* Shared memory changed between load and lock - retry. */
          rc = dbReleaseReadlock(pDb);
        }
      }

      if( rc==LSM_BUSY ){
        /* Lock contention is a soft failure here - retry the loop. */
        rc = LSM_OK;
      }
    }
#if 0
if( rc==LSM_OK && pDb->pClient ){
  fprintf(stderr, 
      "reading %p: snapshot:%d used-shmid:%d trans-id:%d iOldShmid=%d\n",
      (void *)pDb,
      (int)pDb->pClient->iId, (int)pDb->treehdr.iUsedShmid, 
      (int)pDb->treehdr.root.iTransId,
      (int)pDb->treehdr.iOldShmid
  );
}
#endif
  }

  if( rc==LSM_OK ){
    /* Make sure local pointers to all shm chunks used by the tree exist. */
    rc = lsmShmCacheChunks(pDb, pDb->treehdr.nChunk);
  }
  if( rc!=LSM_OK ){
    dbReleaseReadlock(pDb);
  }
  /* All attempts exhausted without obtaining a snapshot. */
  if( pDb->pClient==0 && rc==LSM_OK ) rc = LSM_BUSY;
  return rc;
}
  1230  
  1231  /*
  1232  ** This function is used by a read-write connection to determine if there
  1233  ** are currently one or more read-only transactions open on the database
  1234  ** (in this context a read-only transaction is one opened by a read-only
  1235  ** connection on a non-live database).
  1236  **
  1237  ** If no error occurs, LSM_OK is returned and *pbExists is set to true if
  1238  ** some other connection has a read-only transaction open, or false 
  1239  ** otherwise. If an error occurs an LSM error code is returned and the final
  1240  ** value of *pbExist is undefined.
  1241  */
  1242  int lsmDetectRoTrans(lsm_db *db, int *pbExist){
  1243    int rc;
  1244  
  1245    /* Only a read-write connection may use this function. */
  1246    assert( db->bReadonly==0 );
  1247  
  1248    rc = lsmShmTestLock(db, LSM_LOCK_ROTRANS, 1, LSM_LOCK_EXCL);
  1249    if( rc==LSM_BUSY ){
  1250      *pbExist = 1;
  1251      rc = LSM_OK;
  1252    }else{
  1253      *pbExist = 0;
  1254    }
  1255  
  1256    return rc;
  1257  }
  1258  
  1259  /*
  1260  ** db is a read-only database handle in the disconnected state. This function
  1261  ** attempts to open a read-transaction on the database. This may involve
  1262  ** connecting to the database system (opening shared memory etc.).
  1263  */
int lsmBeginRoTrans(lsm_db *db){
  int rc = LSM_OK;

  assert( db->bReadonly && db->pShmhdr==0 );
  assert( db->iReader<0 );

  if( db->bRoTrans==0 ){

    /* Attempt a shared-lock on DMS1. */
    rc = lsmShmLock(db, LSM_LOCK_DMS1, LSM_LOCK_SHARED, 0);
    if( rc!=LSM_OK ) return rc;

    /* Test whether any read-write client is connected (LSM_BUSY if so). */
    rc = lsmShmTestLock(
        db, LSM_LOCK_RWCLIENT(0), LSM_LOCK_NREADER, LSM_LOCK_SHARED
    );
    if( rc==LSM_OK ){
      /* System is not live. Take a SHARED lock on the ROTRANS byte and
      ** release DMS1. Locking ROTRANS tells all read-write clients that they
      ** may not recycle any disk space from within the database or log files,
      ** as a read-only client may be using it.  */
      rc = lsmShmLock(db, LSM_LOCK_ROTRANS, LSM_LOCK_SHARED, 0);
      lsmShmLock(db, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0);

      if( rc==LSM_OK ){
        /* bRoTrans is set before lsmShmCacheChunks() so that the chunk is
        ** allocated from the heap rather than mapped (see that function).
        ** Then run checkpoint and log recovery privately. */
        db->bRoTrans = 1;
        rc = lsmShmCacheChunks(db, 1);
        if( rc==LSM_OK ){
          db->pShmhdr = (ShmHeader *)db->apShm[0];
          memset(db->pShmhdr, 0, sizeof(ShmHeader));
          rc = lsmCheckpointRecover(db);
          if( rc==LSM_OK ){
            rc = lsmLogRecover(db);
          }
        }
      }
    }else if( rc==LSM_BUSY ){
      /* System is live! */
      rc = lsmShmLock(db, LSM_LOCK_DMS3, LSM_LOCK_SHARED, 0);
      lsmShmLock(db, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0);
      if( rc==LSM_OK ){
        /* Map the first shared-memory chunk to access the live header. */
        rc = lsmShmCacheChunks(db, 1);
        if( rc==LSM_OK ){
          db->pShmhdr = (ShmHeader *)db->apShm[0];
        }
      }
    }

    if( rc==LSM_OK ){
      rc = lsmBeginReadTrans(db);
    }
  }

  return rc;
}
  1318  
  1319  /*
  1320  ** Close the currently open read transaction.
  1321  */
void lsmFinishReadTrans(lsm_db *pDb){

  /* Worker connections should not be closing read transactions. And
  ** read transactions should only be closed after all cursors and write
  ** transactions have been closed. Finally pClient should be non-NULL
  ** only iff pDb->iReader>=0.  */
  assert( pDb->pWorker==0 );
  assert( pDb->pCsr==0 && pDb->nTransOpen==0 );

  if( pDb->bRoTrans ){
    /* Disconnected read-only transaction: free the heap-allocated
    ** shared-memory chunks and release the ROTRANS lock taken by
    ** lsmBeginRoTrans(). */
    int i;
    for(i=0; i<pDb->nShm; i++){
      lsmFree(pDb->pEnv, pDb->apShm[i]);
    }
    lsmFree(pDb->pEnv, pDb->apShm);
    pDb->apShm = 0;
    pDb->nShm = 0;
    pDb->pShmhdr = 0;

    lsmShmLock(pDb, LSM_LOCK_ROTRANS, LSM_LOCK_UNLOCK, 0);
  }
  /* Release the read-lock slot (also clears bRoTrans). */
  dbReleaseReadlock(pDb);
}
  1345  
  1346  /*
  1347  ** Open a write transaction.
  1348  */
int lsmBeginWriteTrans(lsm_db *pDb){
  int rc = LSM_OK;                /* Return code */
  ShmHeader *pShm = pDb->pShmhdr; /* Shared memory header */

  assert( pDb->nTransOpen==0 );
  assert( pDb->bDiscardOld==0 );
  assert( pDb->bReadonly==0 );

  /* If there is no read-transaction open, open one now. */
  if( pDb->iReader<0 ){
    rc = lsmBeginReadTrans(pDb);
  }

  /* Attempt to take the WRITER lock */
  if( rc==LSM_OK ){
    rc = lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_EXCL, 0);
  }

  /* If the previous writer failed mid-transaction, run emergency rollback. */
  if( rc==LSM_OK && pShm->bWriter ){
    rc = lsmTreeRepair(pDb);
    if( rc==LSM_OK ) pShm->bWriter = 0;
  }

  /* Check that this connection is currently reading from the most recent
  ** version of the database. If not, return LSM_BUSY.  */
  if( rc==LSM_OK && memcmp(&pShm->hdr1, &pDb->treehdr, sizeof(TreeHeader)) ){
    rc = LSM_BUSY;
  }

  if( rc==LSM_OK ){
    rc = lsmLogBegin(pDb);
  }

  /* If everything was successful, set the "transaction-in-progress" flag
  ** and return LSM_OK. Otherwise, if some error occurred, relinquish the 
  ** WRITER lock and return an error code.  */
  if( rc==LSM_OK ){
    TreeHeader *p = &pDb->treehdr;
    pShm->bWriter = 1;
    p->root.iTransId++;
    /* If an old in-memory tree exists and its log offset matches the one
    ** recorded in the client snapshot, discard it now - presumably its
    ** contents are covered by the snapshot. NOTE(review): confirm. */
    if( lsmTreeHasOld(pDb) && p->iOldLog==pDb->pClient->iLogOff ){
      lsmTreeDiscardOld(pDb);
      pDb->bDiscardOld = 1;
    }
  }else{
    lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_UNLOCK, 0);
    if( pDb->pCsr==0 ) lsmFinishReadTrans(pDb);
  }
  return rc;
}
  1400  
  1401  /*
  1402  ** End the current write transaction. The connection is left with an open
  1403  ** read transaction. It is an error to call this if there is no open write 
  1404  ** transaction.
  1405  **
  1406  ** If the transaction was committed, then a commit record has already been
  1407  ** written into the log file when this function is called. Or, if the
  1408  ** transaction was rolled back, both the log file and in-memory tree 
  1409  ** structure have already been restored. In either case, this function 
  1410  ** merely releases locks and other resources held by the write-transaction.
  1411  **
  1412  ** LSM_OK is returned if successful, or an LSM error code otherwise.
  1413  */
int lsmFinishWriteTrans(lsm_db *pDb, int bCommit){
  int rc = LSM_OK;
  int bFlush = 0;                 /* True if the in-memory tree was made old */

  /* Finish the log-file portion of the transaction. */
  lsmLogEnd(pDb, bCommit);
  /* If committing and the in-memory tree has exceeded the configured size
  ** limit, mark it as "old" so that it gets flushed to disk. */
  if( rc==LSM_OK && bCommit && lsmTreeSize(pDb)>pDb->nTreeLimit ){
    bFlush = 1;
    lsmTreeMakeOld(pDb);
  }
  lsmTreeEndTransaction(pDb, bCommit);

  if( rc==LSM_OK ){
    if( bFlush && pDb->bAutowork ){
      /* Auto-work is enabled - do some work before returning. */
      rc = lsmSortedAutoWork(pDb, 1);
    }else if( bCommit && pDb->bDiscardOld ){
      /* The old tree was discarded by this transaction. Populate a
      ** read-lock slot for the new state (see dbSetReadLock()). */
      rc = dbSetReadLock(pDb, pDb->pClient->iId, pDb->treehdr.iUsedShmid);
    }
  }
  pDb->bDiscardOld = 0;
  /* Relinquish the WRITER lock taken by lsmBeginWriteTrans(). */
  lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_UNLOCK, 0);

  /* If auto-work is disabled, invoke the registered work-hook instead. */
  if( bFlush && pDb->bAutowork==0 && pDb->xWork ){
    pDb->xWork(pDb, pDb->pWorkCtx);
  }
  return rc;
}
  1440  
  1441  
  1442  /*
  1443  ** Return non-zero if the caller is holding the client mutex.
  1444  */
#ifdef LSM_DEBUG
int lsmHoldingClientMutex(lsm_db *pDb){
  /* Debug builds only: query the environment for mutex-held status. */
  return lsmMutexHeld(pDb->pEnv, pDb->pDatabase->pClientMutex);
}
#endif
  1450  
  1451  static int slotIsUsable(ShmReader *p, i64 iLsm, u32 iShmMin, u32 iShmMax){
  1452    return( 
  1453        p->iLsmId && p->iLsmId<=iLsm 
  1454        && shm_sequence_ge(iShmMax, p->iTreeId)
  1455        && shm_sequence_ge(p->iTreeId, iShmMin)
  1456    );
  1457  }
  1458  
  1459  /*
  1460  ** Obtain a read-lock on database version identified by the combination
  1461  ** of snapshot iLsm and tree iTree. Return LSM_OK if successful, or
  1462  ** an LSM error code otherwise.
  1463  */
int lsmReadlock(lsm_db *db, i64 iLsm, u32 iShmMin, u32 iShmMax){
  int rc = LSM_OK;
  ShmHeader *pShm = db->pShmhdr;
  int i;

  assert( db->iReader<0 );
  assert( shm_sequence_ge(iShmMax, iShmMin) );

  /* This is a no-op if the read-only transaction flag is set. */
  if( db->bRoTrans ){
    db->iReader = 0;
    return LSM_OK;
  }

  /* Search for an exact match. */
  for(i=0; db->iReader<0 && rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
    ShmReader *p = &pShm->aReader[i];
    if( p->iLsmId==iLsm && p->iTreeId==iShmMax ){
      rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0);
      /* Re-check the slot values after the lock is held - they may have
      ** been changed between the test above and obtaining the lock. */
      if( rc==LSM_OK && p->iLsmId==iLsm && p->iTreeId==iShmMax ){
        db->iReader = i;
      }else if( rc==LSM_BUSY ){
        rc = LSM_OK;
      }
    }
  }

  /* Try to obtain a write-lock on each slot, in order. If successful, set
  ** the slot values to iLsm/iTree.  */
  for(i=0; db->iReader<0 && rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
    rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_EXCL, 0);
    if( rc==LSM_BUSY ){
      rc = LSM_OK;
    }else{
      ShmReader *p = &pShm->aReader[i];
      p->iLsmId = iLsm;
      p->iTreeId = iShmMax;
      /* Downgrade from the exclusive lock to a shared lock. */
      rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0);
      assert( rc!=LSM_BUSY );
      if( rc==LSM_OK ) db->iReader = i;
    }
  }

  /* Search for any usable slot */
  for(i=0; db->iReader<0 && rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
    ShmReader *p = &pShm->aReader[i];
    if( slotIsUsable(p, iLsm, iShmMin, iShmMax) ){
      rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0);
      /* As above, re-check usability once the lock is held. */
      if( rc==LSM_OK && slotIsUsable(p, iLsm, iShmMin, iShmMax) ){
        db->iReader = i;
      }else if( rc==LSM_BUSY ){
        rc = LSM_OK;
      }
    }
  }

  /* All three strategies failed without error: report LSM_BUSY. */
  if( rc==LSM_OK && db->iReader<0 ){
    rc = LSM_BUSY;
  }
  return rc;
}
  1525  
  1526  /*
  1527  ** This is used to check if there exists a read-lock locking a particular
  1528  ** version of either the in-memory tree or database file. 
  1529  **
  1530  ** If iLsmId is non-zero, then it is a snapshot id. If there exists a 
  1531  ** read-lock using this snapshot or newer, set *pbInUse to true. Or,
  1532  ** if there is no such read-lock, set it to false.
  1533  **
  1534  ** Or, if iLsmId is zero, then iShmid is a shared-memory sequence id.
  1535  ** Search for a read-lock using this sequence id or newer. etc.
  1536  */
static int isInUse(lsm_db *db, i64 iLsmId, u32 iShmid, int *pbInUse){
  ShmHeader *pShm = db->pShmhdr;
  int i;
  int rc = LSM_OK;

  for(i=0; rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
    ShmReader *p = &pShm->aReader[i];
    if( p->iLsmId ){
      /* The slot matches the queried snapshot (or shm sequence) range.
      ** Attempt an exclusive lock on it: success proves no reader holds
      ** the slot, so it can be cleared. LSM_BUSY proves some client is
      ** actively using the slot - and hence the queried version. */
      if( (iLsmId!=0 && p->iLsmId!=0 && iLsmId>=p->iLsmId) 
       || (iLsmId==0 && shm_sequence_ge(p->iTreeId, iShmid))
      ){
        rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_EXCL, 0);
        if( rc==LSM_OK ){
          p->iLsmId = 0;
          lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_UNLOCK, 0);
        }
      }
    }
  }

  /* LSM_BUSY from the probe above means "in use", not an error. */
  if( rc==LSM_BUSY ){
    *pbInUse = 1;
    return LSM_OK;
  }
  *pbInUse = 0;
  return rc;
}
  1564  
  1565  /*
  1566  ** This function is called by worker connections to determine the smallest
  1567  ** snapshot id that is currently in use by a database client. The worker
  1568  ** connection uses this result to determine whether or not it is safe to
  1569  ** recycle a database block.
  1570  */
static int firstSnapshotInUse(
  lsm_db *db,                     /* Database handle */
  i64 *piInUse                    /* IN/OUT: Smallest snapshot id in use */
){
  ShmHeader *pShm = db->pShmhdr;
  i64 iInUse = *piInUse;
  int i;

  assert( iInUse>0 );
  for(i=0; i<LSM_LOCK_NREADER; i++){
    ShmReader *p = &pShm->aReader[i];
    if( p->iLsmId ){
      i64 iThis = p->iLsmId;
      if( iThis!=0 && iInUse>iThis ){
        /* Slot advertises a snapshot older than the current minimum. Try
        ** an exclusive lock: success proves the slot is unused and it is
        ** cleared; LSM_BUSY proves a reader is using snapshot iThis. */
        int rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_EXCL, 0);
        if( rc==LSM_OK ){
          p->iLsmId = 0;
          lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_UNLOCK, 0);
        }else if( rc==LSM_BUSY ){
          iInUse = iThis;
        }else{
          /* Some error other than LSM_BUSY. Return the error code to
          ** the caller in this case.  */
          return rc;
        }
      }
    }
  }

  *piInUse = iInUse;
  return LSM_OK;
}
  1603  
  1604  int lsmTreeInUse(lsm_db *db, u32 iShmid, int *pbInUse){
  1605    if( db->treehdr.iUsedShmid==iShmid ){
  1606      *pbInUse = 1;
  1607      return LSM_OK;
  1608    }
  1609    return isInUse(db, 0, iShmid, pbInUse);
  1610  }
  1611  
  1612  int lsmLsmInUse(lsm_db *db, i64 iLsmId, int *pbInUse){
  1613    if( db->pClient && db->pClient->iId<=iLsmId ){
  1614      *pbInUse = 1;
  1615      return LSM_OK;
  1616    }
  1617    return isInUse(db, iLsmId, 0, pbInUse);
  1618  }
  1619  
  1620  /*
  1621  ** This function may only be called after a successful call to
  1622  ** lsmDbDatabaseConnect(). It returns true if the connection is in
  1623  ** multi-process mode, or false otherwise.
  1624  */
  1625  int lsmDbMultiProc(lsm_db *pDb){
  1626    return pDb->pDatabase && pDb->pDatabase->bMultiProc;
  1627  }
  1628  
  1629  
  1630  /*************************************************************************
  1631  **************************************************************************
  1632  **************************************************************************
  1633  **************************************************************************
  1634  **************************************************************************
  1635  *************************************************************************/
  1636  
  1637  /*
  1638  ** Ensure that database connection db has cached pointers to at least the 
  1639  ** first nChunk chunks of shared memory.
  1640  */
int lsmShmCacheChunks(lsm_db *db, int nChunk){
  int rc = LSM_OK;
  if( nChunk>db->nShm ){
    static const int NINCR = 16;
    Database *p = db->pDatabase;
    lsm_env *pEnv = db->pEnv;
    int nAlloc;
    int i;

    /* Ensure that the db->apShm[] array is large enough. If an attempt to
    ** allocate memory fails, return LSM_NOMEM immediately. The apShm[] array
    ** is always extended in multiples of 16 entries - so the actual allocated
    ** size can be inferred from nShm.  */ 
    nAlloc = ((db->nShm + NINCR - 1) / NINCR) * NINCR;
    while( nChunk>=nAlloc ){
      void **apShm;
      nAlloc += NINCR;
      apShm = lsmRealloc(pEnv, db->apShm, sizeof(void*)*nAlloc);
      if( !apShm ) return LSM_NOMEM_BKPT;
      db->apShm = apShm;
    }

    if( db->bRoTrans ){
      /* Disconnected read-only transaction: chunks are private heap
      ** allocations, not shared memory. Freed in lsmFinishReadTrans(). */
      for(i=db->nShm; rc==LSM_OK && i<nChunk; i++){
        db->apShm[i] = lsmMallocZeroRc(pEnv, LSM_SHM_CHUNK_SIZE, &rc);
        db->nShm++;
      }

    }else{

      /* Enter the client mutex */
      lsmMutexEnter(pEnv, p->pClientMutex);

      /* Extend the Database objects apShmChunk[] array if necessary. Using the
       ** same pattern as for the lsm_db.apShm[] array above.  */
      nAlloc = ((p->nShmChunk + NINCR - 1) / NINCR) * NINCR;
      while( nChunk>=nAlloc ){
        void **apShm;
        nAlloc +=  NINCR;
        apShm = lsmRealloc(pEnv, p->apShmChunk, sizeof(void*)*nAlloc);
        if( !apShm ){
          rc = LSM_NOMEM_BKPT;
          break;
        }
        p->apShmChunk = apShm;
      }

      /* Create any chunks the Database object does not yet have, then copy
      ** the shared chunk pointers into this connection's apShm[] cache. */
      for(i=db->nShm; rc==LSM_OK && i<nChunk; i++){
        if( i>=p->nShmChunk ){
          void *pChunk = 0;
          if( p->bMultiProc==0 ){
            /* Single process mode */
            pChunk = lsmMallocZeroRc(pEnv, LSM_SHM_CHUNK_SIZE, &rc);
          }else{
            /* Multi-process mode */
            rc = lsmEnvShmMap(pEnv, p->pFile, i, LSM_SHM_CHUNK_SIZE, &pChunk);
          }
          if( rc==LSM_OK ){
            p->apShmChunk[i] = pChunk;
            p->nShmChunk++;
          }
        }
        if( rc==LSM_OK ){
          db->apShm[i] = p->apShmChunk[i];
          db->nShm++;
        }
      }

      /* Release the client mutex */
      lsmMutexLeave(pEnv, p->pClientMutex);
    }
  }

  return rc;
}
  1716  
  1717  static int lockSharedFile(lsm_env *pEnv, Database *p, int iLock, int eOp){
  1718    int rc = LSM_OK;
  1719    if( p->bMultiProc ){
  1720      rc = lsmEnvLock(pEnv, p->pFile, iLock, eOp);
  1721    }
  1722    return rc;
  1723  }
  1724  
  1725  /*
  1726  ** Test if it would be possible for connection db to obtain a lock of type
  1727  ** eType on the nLock locks starting at iLock. If so, return LSM_OK. If it
  1728  ** would not be possible to obtain the lock due to a lock held by another
  1729  ** connection, return LSM_BUSY. If an IO or other error occurs (i.e. in the 
  1730  ** lsm_env.xTestLock function), return some other LSM error code.
  1731  **
  1732  ** Note that this function never actually locks the database - it merely
  1733  ** queries the system to see if there exists a lock that would prevent
  1734  ** it from doing so.
  1735  */
  1736  int lsmShmTestLock(
  1737    lsm_db *db,
  1738    int iLock,
  1739    int nLock,
  1740    int eOp
  1741  ){
  1742    int rc = LSM_OK;
  1743    lsm_db *pIter;
  1744    Database *p = db->pDatabase;
  1745    int i;
  1746    u64 mask = 0;
  1747  
  1748    for(i=iLock; i<(iLock+nLock); i++){
  1749      mask |= ((u64)1 << (iLock-1));
  1750      if( eOp==LSM_LOCK_EXCL ) mask |= ((u64)1 << (iLock+32-1));
  1751    }
  1752  
  1753    lsmMutexEnter(db->pEnv, p->pClientMutex);
  1754    for(pIter=p->pConn; pIter; pIter=pIter->pNext){
  1755      if( pIter!=db && (pIter->mLock & mask) ){
  1756        assert( pIter!=db );
  1757        break;
  1758      }
  1759    }
  1760  
  1761    if( pIter ){
  1762      rc = LSM_BUSY;
  1763    }else if( p->bMultiProc ){
  1764      rc = lsmEnvTestLock(db->pEnv, p->pFile, iLock, nLock, eOp);
  1765    }
  1766  
  1767    lsmMutexLeave(db->pEnv, p->pClientMutex);
  1768    return rc;
  1769  }
  1770  
  1771  /*
  1772  ** Attempt to obtain the lock identified by the iLock and bExcl parameters.
  1773  ** If successful, return LSM_OK. If the lock cannot be obtained because 
  1774  ** there exists some other conflicting lock, return LSM_BUSY. If some other
  1775  ** error occurs, return an LSM error code.
  1776  **
  1777  ** Parameter iLock must be one of LSM_LOCK_WRITER, WORKER or CHECKPOINTER,
  1778  ** or else a value returned by the LSM_LOCK_READER macro.
  1779  */
int lsmShmLock(
  lsm_db *db, 
  int iLock,
  int eOp,                        /* One of LSM_LOCK_UNLOCK, SHARED or EXCL */
  int bBlock                      /* True for a blocking lock (not referenced in this body) */
){
  lsm_db *pIter;
  /* In lsm_db.mLock, bit (iLock-1) records an EXCLUSIVE lock on slot iLock
  ** and bit (iLock+32-1) records a SHARED lock on the same slot.  */
  const u64 me = ((u64)1 << (iLock-1));       /* EXCLUSIVE bit for iLock */
  const u64 ms = ((u64)1 << (iLock+32-1));    /* SHARED bit for iLock */
  int rc = LSM_OK;
  Database *p = db->pDatabase;

  assert( eOp!=LSM_LOCK_EXCL || p->bReadonly==0 );
  assert( iLock>=1 && iLock<=LSM_LOCK_RWCLIENT(LSM_LOCK_NRWCLIENT-1) );
  assert( LSM_LOCK_RWCLIENT(LSM_LOCK_NRWCLIENT-1)<=32 );
  assert( eOp==LSM_LOCK_UNLOCK || eOp==LSM_LOCK_SHARED || eOp==LSM_LOCK_EXCL );

  /* Check for a no-op. Proceed only if this is not one of those. A request
  ** is a no-op if the connection already holds exactly the requested lock
  ** state (e.g. UNLOCK when no lock is held, or SHARED when a SHARED lock
  ** and nothing stronger is held).  */
  if( (eOp==LSM_LOCK_UNLOCK && (db->mLock & (me|ms))!=0)
   || (eOp==LSM_LOCK_SHARED && (db->mLock & (me|ms))!=ms)
   || (eOp==LSM_LOCK_EXCL   && (db->mLock & me)==0)
  ){
    int nExcl = 0;                /* Number of connections holding EXCLUSIVE */
    int nShared = 0;              /* Number of connections holding SHARED */
    lsmMutexEnter(db->pEnv, p->pClientMutex);

    /* Figure out the locks currently held by this process on iLock, not
    ** including any held by connection db.  */
    for(pIter=p->pConn; pIter; pIter=pIter->pNext){
      assert( (pIter->mLock & me)==0 || (pIter->mLock & ms)!=0 );
      if( pIter!=db ){
        if( pIter->mLock & me ){
          nExcl++;
        }else if( pIter->mLock & ms ){
          nShared++;
        }
      }
    }
    assert( nExcl==0 || nExcl==1 );
    assert( nExcl==0 || nShared==0 );
    assert( nExcl==0 || (db->mLock & (me|ms))==0 );

    switch( eOp ){
      case LSM_LOCK_UNLOCK:
        /* Only drop the file-level lock once no other connection in this
        ** process still holds a SHARED lock on the slot.  */
        if( nShared==0 ){
          lockSharedFile(db->pEnv, p, iLock, LSM_LOCK_UNLOCK);
        }
        db->mLock &= ~(me|ms);
        break;

      case LSM_LOCK_SHARED:
        if( nExcl ){
          rc = LSM_BUSY;
        }else{
          /* The first SHARED holder in this process takes the file lock;
          ** subsequent holders piggy-back on it.  */
          if( nShared==0 ){
            rc = lockSharedFile(db->pEnv, p, iLock, LSM_LOCK_SHARED);
          }
          if( rc==LSM_OK ){
            db->mLock |= ms;
            db->mLock &= ~me;
          }
        }
        break;

      default:
        assert( eOp==LSM_LOCK_EXCL );
        /* EXCLUSIVE conflicts with any other in-process holder. On success
        ** both the EXCLUSIVE and SHARED bits are set for this connection.  */
        if( nExcl || nShared ){
          rc = LSM_BUSY;
        }else{
          rc = lockSharedFile(db->pEnv, p, iLock, LSM_LOCK_EXCL);
          if( rc==LSM_OK ){
            db->mLock |= (me|ms);
          }
        }
        break;
    }

    lsmMutexLeave(db->pEnv, p->pClientMutex);
  }

  return rc;
}
  1862  
  1863  #ifdef LSM_DEBUG
  1864  
  1865  int shmLockType(lsm_db *db, int iLock){
  1866    const u64 me = ((u64)1 << (iLock-1));
  1867    const u64 ms = ((u64)1 << (iLock+32-1));
  1868  
  1869    if( db->mLock & me ) return LSM_LOCK_EXCL;
  1870    if( db->mLock & ms ) return LSM_LOCK_SHARED;
  1871    return LSM_LOCK_UNLOCK;
  1872  }
  1873  
  1874  /*
  1875  ** The arguments passed to this function are similar to those passed to
  1876  ** the lsmShmLock() function. However, instead of obtaining a new lock 
  1877  ** this function returns true if the specified connection already holds 
  1878  ** (or does not hold) such a lock, depending on the value of eOp. As
  1879  ** follows:
  1880  **
  1881  **   (eOp==LSM_LOCK_UNLOCK) -> true if db has no lock on iLock
  1882  **   (eOp==LSM_LOCK_SHARED) -> true if db has at least a SHARED lock on iLock.
  1883  **   (eOp==LSM_LOCK_EXCL)   -> true if db has an EXCLUSIVE lock on iLock.
  1884  */
  1885  int lsmShmAssertLock(lsm_db *db, int iLock, int eOp){
  1886    int ret = 0;
  1887    int eHave;
  1888  
  1889    assert( iLock>=1 && iLock<=LSM_LOCK_READER(LSM_LOCK_NREADER-1) );
  1890    assert( iLock<=16 );
  1891    assert( eOp==LSM_LOCK_UNLOCK || eOp==LSM_LOCK_SHARED || eOp==LSM_LOCK_EXCL );
  1892  
  1893    eHave = shmLockType(db, iLock);
  1894  
  1895    switch( eOp ){
  1896      case LSM_LOCK_UNLOCK:
  1897        ret = (eHave==LSM_LOCK_UNLOCK);
  1898        break;
  1899      case LSM_LOCK_SHARED:
  1900        ret = (eHave!=LSM_LOCK_UNLOCK);
  1901        break;
  1902      case LSM_LOCK_EXCL:
  1903        ret = (eHave==LSM_LOCK_EXCL);
  1904        break;
  1905      default:
  1906        assert( !"bad eOp value passed to lsmShmAssertLock()" );
  1907        break;
  1908    }
  1909  
  1910    return ret;
  1911  }
  1912  
  1913  int lsmShmAssertWorker(lsm_db *db){
  1914    return lsmShmAssertLock(db, LSM_LOCK_WORKER, LSM_LOCK_EXCL) && db->pWorker;
  1915  }
  1916  
  1917  /*
  1918  ** This function does not contribute to library functionality, and is not
  1919  ** included in release builds. It is intended to be called from within
  1920  ** an interactive debugger.
  1921  **
  1922  ** When called, this function prints a single line of human readable output
  1923  ** to stdout describing the locks currently held by the connection. For 
  1924  ** example:
  1925  **
  1926  **     (gdb) call print_db_locks(pDb)
  1927  **     (shared on dms2) (exclusive on writer) 
  1928  */
  1929  void print_db_locks(lsm_db *db){
  1930    int iLock;
  1931    for(iLock=0; iLock<16; iLock++){
  1932      int bOne = 0;
  1933      const char *azLock[] = {0, "shared", "exclusive"};
  1934      const char *azName[] = {
  1935        0, "dms1", "dms2", "writer", "worker", "checkpointer",
  1936        "reader0", "reader1", "reader2", "reader3", "reader4", "reader5"
  1937      };
  1938      int eHave = shmLockType(db, iLock);
  1939      if( azLock[eHave] ){
  1940        printf("%s(%s on %s)", (bOne?" ":""), azLock[eHave], azName[iLock]);
  1941        bOne = 1;
  1942      }
  1943    }
  1944    printf("\n");
  1945  }
  1946  void print_all_db_locks(lsm_db *db){
  1947    lsm_db *p;
  1948    for(p=db->pDatabase->pConn; p; p=p->pNext){
  1949      printf("%s connection %p ", ((p==db)?"*":""), p);
  1950      print_db_locks(p);
  1951    }
  1952  }
  1953  #endif
  1954  
  1955  void lsmShmBarrier(lsm_db *db){
  1956    lsmEnvShmBarrier(db->pEnv);
  1957  }
  1958  
  1959  int lsm_checkpoint(lsm_db *pDb, int *pnKB){
  1960    int rc;                         /* Return code */
  1961    u32 nWrite = 0;                 /* Number of pages checkpointed */
  1962  
  1963    /* Attempt the checkpoint. If successful, nWrite is set to the number of
  1964    ** pages written between this and the previous checkpoint.  */
  1965    rc = lsmCheckpointWrite(pDb, &nWrite);
  1966  
  1967    /* If required, calculate the output variable (KB of data checkpointed). 
  1968    ** Set it to zero if an error occured.  */
  1969    if( pnKB ){
  1970      int nKB = 0;
  1971      if( rc==LSM_OK && nWrite ){
  1972        nKB = (((i64)nWrite * lsmFsPageSize(pDb->pFS)) + 1023) / 1024;
  1973      }
  1974      *pnKB = nKB;
  1975    }
  1976  
  1977    return rc;
  1978  }