modernc.org/cc@v1.0.1/v2/testdata/_sqlite/src/btree.c (about)

     1  /*
     2  ** 2004 April 6
     3  **
     4  ** The author disclaims copyright to this source code.  In place of
     5  ** a legal notice, here is a blessing:
     6  **
     7  **    May you do good and not evil.
     8  **    May you find forgiveness for yourself and forgive others.
     9  **    May you share freely, never taking more than you give.
    10  **
    11  *************************************************************************
    12  ** This file implements an external (disk-based) database using BTrees.
    13  ** See the header comment on "btreeInt.h" for additional information.
    14  ** Including a description of file format and an overview of operation.
    15  */
    16  #include "btreeInt.h"
    17  
/*
** The header string that appears at the beginning of every
** SQLite database.  The text itself comes from the SQLITE_FILE_HEADER
** macro, which is defined outside of this file.  The array is declared
** without an explicit size so that its length is taken from that
** initializer.
*/
static const char zMagicHeader[] = SQLITE_FILE_HEADER;
    23  
    24  /*
    25  ** Set this global variable to 1 to enable tracing using the TRACE
    26  ** macro.
    27  */
    28  #if 0
    29  int sqlite3BtreeTrace=1;  /* True to enable tracing */
    30  # define TRACE(X)  if(sqlite3BtreeTrace){printf X;fflush(stdout);}
    31  #else
    32  # define TRACE(X)
    33  #endif
    34  
    35  /*
    36  ** Extract a 2-byte big-endian integer from an array of unsigned bytes.
    37  ** But if the value is zero, make it 65536.
    38  **
    39  ** This routine is used to extract the "offset to cell content area" value
    40  ** from the header of a btree page.  If the page size is 65536 and the page
    41  ** is empty, the offset should be 65536, but the 2-byte value stores zero.
    42  ** This routine makes the necessary adjustment to 65536.
    43  */
    44  #define get2byteNotZero(X)  (((((int)get2byte(X))-1)&0xffff)+1)
    45  
/*
** Values passed as the 5th argument to allocateBtreePage().  That
** function is defined outside of the portion of the file shown here;
** the per-value descriptions below are the contract it is expected
** to honor.
*/
#define BTALLOC_ANY   0           /* Allocate any page */
#define BTALLOC_EXACT 1           /* Allocate exact page if possible */
#define BTALLOC_LE    2           /* Allocate any page <= the parameter */
    52  
    53  /*
    54  ** Macro IfNotOmitAV(x) returns (x) if SQLITE_OMIT_AUTOVACUUM is not 
    55  ** defined, or 0 if it is. For example:
    56  **
    57  **   bIncrVacuum = IfNotOmitAV(pBtShared->incrVacuum);
    58  */
    59  #ifndef SQLITE_OMIT_AUTOVACUUM
    60  #define IfNotOmitAV(expr) (expr)
    61  #else
    62  #define IfNotOmitAV(expr) 0
    63  #endif
    64  
    65  #ifndef SQLITE_OMIT_SHARED_CACHE
    66  /*
    67  ** A list of BtShared objects that are eligible for participation
    68  ** in shared cache.  This variable has file scope during normal builds,
    69  ** but the test harness needs to access it so we make it global for 
    70  ** test builds.
    71  **
    72  ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER.
    73  */
    74  #ifdef SQLITE_TEST
    75  BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
    76  #else
    77  static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
    78  #endif
    79  #endif /* SQLITE_OMIT_SHARED_CACHE */
    80  
    81  #ifndef SQLITE_OMIT_SHARED_CACHE
/*
** Enable or disable the shared pager and schema features by storing
** the value of "enable" in sqlite3GlobalConfig.sharedCacheEnabled.
**
** This routine has no effect on existing database connections.
** The shared cache setting affects only future calls to
** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
**
** This routine always returns SQLITE_OK.
*/
int sqlite3_enable_shared_cache(int enable){
  sqlite3GlobalConfig.sharedCacheEnabled = enable;
  return SQLITE_OK;
}
    93  #endif
    94  
    95  
    96  
    97  #ifdef SQLITE_OMIT_SHARED_CACHE
  /*
  ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(),
  ** clearAllSharedCacheTableLocks() and downgradeAllSharedCacheTableLocks()
  ** manipulate entries in the BtShared.pLock linked list used to store
  ** shared-cache table level locks. If the library is compiled with the
  ** shared-cache feature disabled, then there is only ever one user
  ** of each BtShared structure and so this locking is not necessary. 
  ** So define the lock related functions as no-ops.
  **
  ** The query and set macros evaluate to SQLITE_OK.  The clear and
  ** downgrade macros expand to nothing.  The two assert()-only helpers
  ** are stubbed out as well: hasSharedCacheTableLock() is always 1
  ** (the lock is considered held) and hasReadConflicts() is always 0
  ** (there are never any conflicts).  None of the macros evaluate
  ** their arguments.
  */
  #define querySharedCacheTableLock(a,b,c) SQLITE_OK
  #define setSharedCacheTableLock(a,b,c) SQLITE_OK
  #define clearAllSharedCacheTableLocks(a)
  #define downgradeAllSharedCacheTableLocks(a)
  #define hasSharedCacheTableLock(a,b,c,d) 1
  #define hasReadConflicts(a, b) 0
   113  #endif
   114  
   115  #ifndef SQLITE_OMIT_SHARED_CACHE
   116  
   117  #ifdef SQLITE_DEBUG
   118  /*
   119  **** This function is only used as part of an assert() statement. ***
   120  **
   121  ** Check to see if pBtree holds the required locks to read or write to the 
   122  ** table with root page iRoot.   Return 1 if it does and 0 if not.
   123  **
   124  ** For example, when writing to a table with root-page iRoot via 
   125  ** Btree connection pBtree:
   126  **
   127  **    assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) );
   128  **
   129  ** When writing to an index that resides in a sharable database, the 
   130  ** caller should have first obtained a lock specifying the root page of
   131  ** the corresponding table. This makes things a bit more complicated,
   132  ** as this module treats each table as a separate structure. To determine
   133  ** the table corresponding to the index being written, this
   134  ** function has to search through the database schema.
   135  **
   136  ** Instead of a lock on the table/index rooted at page iRoot, the caller may
   137  ** hold a write-lock on the schema table (root page 1). This is also
   138  ** acceptable.
   139  */
   140  static int hasSharedCacheTableLock(
   141    Btree *pBtree,         /* Handle that must hold lock */
   142    Pgno iRoot,            /* Root page of b-tree */
   143    int isIndex,           /* True if iRoot is the root of an index b-tree */
   144    int eLockType          /* Required lock type (READ_LOCK or WRITE_LOCK) */
   145  ){
   146    Schema *pSchema = (Schema *)pBtree->pBt->pSchema;
   147    Pgno iTab = 0;
   148    BtLock *pLock;
   149  
   150    /* If this database is not shareable, or if the client is reading
   151    ** and has the read-uncommitted flag set, then no lock is required. 
   152    ** Return true immediately.
   153    */
   154    if( (pBtree->sharable==0)
   155     || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommit))
   156    ){
   157      return 1;
   158    }
   159  
   160    /* If the client is reading  or writing an index and the schema is
   161    ** not loaded, then it is too difficult to actually check to see if
   162    ** the correct locks are held.  So do not bother - just return true.
   163    ** This case does not come up very often anyhow.
   164    */
   165    if( isIndex && (!pSchema || (pSchema->schemaFlags&DB_SchemaLoaded)==0) ){
   166      return 1;
   167    }
   168  
   169    /* Figure out the root-page that the lock should be held on. For table
   170    ** b-trees, this is just the root page of the b-tree being read or
   171    ** written. For index b-trees, it is the root page of the associated
   172    ** table.  */
   173    if( isIndex ){
   174      HashElem *p;
   175      for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){
   176        Index *pIdx = (Index *)sqliteHashData(p);
   177        if( pIdx->tnum==(int)iRoot ){
   178          if( iTab ){
   179            /* Two or more indexes share the same root page.  There must
   180            ** be imposter tables.  So just return true.  The assert is not
   181            ** useful in that case. */
   182            return 1;
   183          }
   184          iTab = pIdx->pTable->tnum;
   185        }
   186      }
   187    }else{
   188      iTab = iRoot;
   189    }
   190  
   191    /* Search for the required lock. Either a write-lock on root-page iTab, a 
   192    ** write-lock on the schema table, or (if the client is reading) a
   193    ** read-lock on iTab will suffice. Return 1 if any of these are found.  */
   194    for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){
   195      if( pLock->pBtree==pBtree 
   196       && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1))
   197       && pLock->eLock>=eLockType 
   198      ){
   199        return 1;
   200      }
   201    }
   202  
   203    /* Failed to find the required lock. */
   204    return 0;
   205  }
   206  #endif /* SQLITE_DEBUG */
   207  
   208  #ifdef SQLITE_DEBUG
   209  /*
   210  **** This function may be used as part of assert() statements only. ****
   211  **
   212  ** Return true if it would be illegal for pBtree to write into the
   213  ** table or index rooted at iRoot because other shared connections are
   214  ** simultaneously reading that same table or index.
   215  **
   216  ** It is illegal for pBtree to write if some other Btree object that
   217  ** shares the same BtShared object is currently reading or writing
   218  ** the iRoot table.  Except, if the other Btree object has the
   219  ** read-uncommitted flag set, then it is OK for the other object to
   220  ** have a read cursor.
   221  **
   222  ** For example, before writing to any part of the table or index
   223  ** rooted at page iRoot, one should call:
   224  **
   225  **    assert( !hasReadConflicts(pBtree, iRoot) );
   226  */
   227  static int hasReadConflicts(Btree *pBtree, Pgno iRoot){
   228    BtCursor *p;
   229    for(p=pBtree->pBt->pCursor; p; p=p->pNext){
   230      if( p->pgnoRoot==iRoot 
   231       && p->pBtree!=pBtree
   232       && 0==(p->pBtree->db->flags & SQLITE_ReadUncommit)
   233      ){
   234        return 1;
   235      }
   236    }
   237    return 0;
   238  }
   239  #endif    /* #ifdef SQLITE_DEBUG */
   240  
   241  /*
   242  ** Query to see if Btree handle p may obtain a lock of type eLock 
   243  ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
   244  ** SQLITE_OK if the lock may be obtained (by calling
   245  ** setSharedCacheTableLock()), or SQLITE_LOCKED if not.
   246  */
   247  static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){
   248    BtShared *pBt = p->pBt;
   249    BtLock *pIter;
   250  
   251    assert( sqlite3BtreeHoldsMutex(p) );
   252    assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
   253    assert( p->db!=0 );
   254    assert( !(p->db->flags&SQLITE_ReadUncommit)||eLock==WRITE_LOCK||iTab==1 );
   255    
   256    /* If requesting a write-lock, then the Btree must have an open write
   257    ** transaction on this file. And, obviously, for this to be so there 
   258    ** must be an open write transaction on the file itself.
   259    */
   260    assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) );
   261    assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE );
   262    
   263    /* This routine is a no-op if the shared-cache is not enabled */
   264    if( !p->sharable ){
   265      return SQLITE_OK;
   266    }
   267  
   268    /* If some other connection is holding an exclusive lock, the
   269    ** requested lock may not be obtained.
   270    */
   271    if( pBt->pWriter!=p && (pBt->btsFlags & BTS_EXCLUSIVE)!=0 ){
   272      sqlite3ConnectionBlocked(p->db, pBt->pWriter->db);
   273      return SQLITE_LOCKED_SHAREDCACHE;
   274    }
   275  
   276    for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
   277      /* The condition (pIter->eLock!=eLock) in the following if(...) 
   278      ** statement is a simplification of:
   279      **
   280      **   (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK)
   281      **
   282      ** since we know that if eLock==WRITE_LOCK, then no other connection
   283      ** may hold a WRITE_LOCK on any table in this file (since there can
   284      ** only be a single writer).
   285      */
   286      assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK );
   287      assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK);
   288      if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){
   289        sqlite3ConnectionBlocked(p->db, pIter->pBtree->db);
   290        if( eLock==WRITE_LOCK ){
   291          assert( p==pBt->pWriter );
   292          pBt->btsFlags |= BTS_PENDING;
   293        }
   294        return SQLITE_LOCKED_SHAREDCACHE;
   295      }
   296    }
   297    return SQLITE_OK;
   298  }
   299  #endif /* !SQLITE_OMIT_SHARED_CACHE */
   300  
   301  #ifndef SQLITE_OMIT_SHARED_CACHE
   302  /*
   303  ** Add a lock on the table with root-page iTable to the shared-btree used
   304  ** by Btree handle p. Parameter eLock must be either READ_LOCK or 
   305  ** WRITE_LOCK.
   306  **
   307  ** This function assumes the following:
   308  **
   309  **   (a) The specified Btree object p is connected to a sharable
   310  **       database (one with the BtShared.sharable flag set), and
   311  **
   312  **   (b) No other Btree objects hold a lock that conflicts
   313  **       with the requested lock (i.e. querySharedCacheTableLock() has
   314  **       already been called and returned SQLITE_OK).
   315  **
   316  ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM 
   317  ** is returned if a malloc attempt fails.
   318  */
   319  static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){
   320    BtShared *pBt = p->pBt;
   321    BtLock *pLock = 0;
   322    BtLock *pIter;
   323  
   324    assert( sqlite3BtreeHoldsMutex(p) );
   325    assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
   326    assert( p->db!=0 );
   327  
   328    /* A connection with the read-uncommitted flag set will never try to
   329    ** obtain a read-lock using this function. The only read-lock obtained
   330    ** by a connection in read-uncommitted mode is on the sqlite_master 
   331    ** table, and that lock is obtained in BtreeBeginTrans().  */
   332    assert( 0==(p->db->flags&SQLITE_ReadUncommit) || eLock==WRITE_LOCK );
   333  
   334    /* This function should only be called on a sharable b-tree after it 
   335    ** has been determined that no other b-tree holds a conflicting lock.  */
   336    assert( p->sharable );
   337    assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) );
   338  
   339    /* First search the list for an existing lock on this table. */
   340    for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
   341      if( pIter->iTable==iTable && pIter->pBtree==p ){
   342        pLock = pIter;
   343        break;
   344      }
   345    }
   346  
   347    /* If the above search did not find a BtLock struct associating Btree p
   348    ** with table iTable, allocate one and link it into the list.
   349    */
   350    if( !pLock ){
   351      pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
   352      if( !pLock ){
   353        return SQLITE_NOMEM_BKPT;
   354      }
   355      pLock->iTable = iTable;
   356      pLock->pBtree = p;
   357      pLock->pNext = pBt->pLock;
   358      pBt->pLock = pLock;
   359    }
   360  
   361    /* Set the BtLock.eLock variable to the maximum of the current lock
   362    ** and the requested lock. This means if a write-lock was already held
   363    ** and a read-lock requested, we don't incorrectly downgrade the lock.
   364    */
   365    assert( WRITE_LOCK>READ_LOCK );
   366    if( eLock>pLock->eLock ){
   367      pLock->eLock = eLock;
   368    }
   369  
   370    return SQLITE_OK;
   371  }
   372  #endif /* !SQLITE_OMIT_SHARED_CACHE */
   373  
   374  #ifndef SQLITE_OMIT_SHARED_CACHE
   375  /*
   376  ** Release all the table locks (locks obtained via calls to
   377  ** the setSharedCacheTableLock() procedure) held by Btree object p.
   378  **
   379  ** This function assumes that Btree p has an open read or write 
   380  ** transaction. If it does not, then the BTS_PENDING flag
   381  ** may be incorrectly cleared.
   382  */
   383  static void clearAllSharedCacheTableLocks(Btree *p){
   384    BtShared *pBt = p->pBt;
   385    BtLock **ppIter = &pBt->pLock;
   386  
   387    assert( sqlite3BtreeHoldsMutex(p) );
   388    assert( p->sharable || 0==*ppIter );
   389    assert( p->inTrans>0 );
   390  
   391    while( *ppIter ){
   392      BtLock *pLock = *ppIter;
   393      assert( (pBt->btsFlags & BTS_EXCLUSIVE)==0 || pBt->pWriter==pLock->pBtree );
   394      assert( pLock->pBtree->inTrans>=pLock->eLock );
   395      if( pLock->pBtree==p ){
   396        *ppIter = pLock->pNext;
   397        assert( pLock->iTable!=1 || pLock==&p->lock );
   398        if( pLock->iTable!=1 ){
   399          sqlite3_free(pLock);
   400        }
   401      }else{
   402        ppIter = &pLock->pNext;
   403      }
   404    }
   405  
   406    assert( (pBt->btsFlags & BTS_PENDING)==0 || pBt->pWriter );
   407    if( pBt->pWriter==p ){
   408      pBt->pWriter = 0;
   409      pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
   410    }else if( pBt->nTransaction==2 ){
   411      /* This function is called when Btree p is concluding its 
   412      ** transaction. If there currently exists a writer, and p is not
   413      ** that writer, then the number of locks held by connections other
   414      ** than the writer must be about to drop to zero. In this case
   415      ** set the BTS_PENDING flag to 0.
   416      **
   417      ** If there is not currently a writer, then BTS_PENDING must
   418      ** be zero already. So this next line is harmless in that case.
   419      */
   420      pBt->btsFlags &= ~BTS_PENDING;
   421    }
   422  }
   423  
   424  /*
   425  ** This function changes all write-locks held by Btree p into read-locks.
   426  */
   427  static void downgradeAllSharedCacheTableLocks(Btree *p){
   428    BtShared *pBt = p->pBt;
   429    if( pBt->pWriter==p ){
   430      BtLock *pLock;
   431      pBt->pWriter = 0;
   432      pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
   433      for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){
   434        assert( pLock->eLock==READ_LOCK || pLock->pBtree==p );
   435        pLock->eLock = READ_LOCK;
   436      }
   437    }
   438  }
   439  
   440  #endif /* SQLITE_OMIT_SHARED_CACHE */
   441  
/* Prototypes for page-release helpers whose definitions appear later in
** the file.  They are declared here so that earlier code can call them;
** btreeReleaseAllCursorPages(), for instance, uses releasePageNotNull(). */
static void releasePage(MemPage *pPage);         /* Forward reference */
static void releasePageOne(MemPage *pPage);      /* Forward reference */
static void releasePageNotNull(MemPage *pPage);  /* Forward reference */
   445  
   446  /*
   447  ***** This routine is used inside of assert() only ****
   448  **
   449  ** Verify that the cursor holds the mutex on its BtShared
   450  */
   451  #ifdef SQLITE_DEBUG
static int cursorHoldsMutex(BtCursor *p){
  /* Ask the mutex subsystem whether the mutex of the BtShared that this
  ** cursor is attached to is currently held. */
  return sqlite3_mutex_held(p->pBt->mutex);
}
   455  
   456  /* Verify that the cursor and the BtShared agree about what is the current
   457  ** database connetion. This is important in shared-cache mode. If the database 
   458  ** connection pointers get out-of-sync, it is possible for routines like
   459  ** btreeInitPage() to reference an stale connection pointer that references a
   460  ** a connection that has already closed.  This routine is used inside assert()
   461  ** statements only and for the purpose of double-checking that the btree code
   462  ** does keep the database connection pointers up-to-date.
   463  */
   464  static int cursorOwnsBtShared(BtCursor *p){
   465    assert( cursorHoldsMutex(p) );
   466    return (p->pBtree->db==p->pBt->db);
   467  }
   468  #endif
   469  
   470  /*
   471  ** Invalidate the overflow cache of the cursor passed as the first argument.
   472  ** on the shared btree structure pBt.
   473  */
   474  #define invalidateOverflowCache(pCur) (pCur->curFlags &= ~BTCF_ValidOvfl)
   475  
   476  /*
   477  ** Invalidate the overflow page-list cache for all cursors opened
   478  ** on the shared btree structure pBt.
   479  */
   480  static void invalidateAllOverflowCache(BtShared *pBt){
   481    BtCursor *p;
   482    assert( sqlite3_mutex_held(pBt->mutex) );
   483    for(p=pBt->pCursor; p; p=p->pNext){
   484      invalidateOverflowCache(p);
   485    }
   486  }
   487  
   488  #ifndef SQLITE_OMIT_INCRBLOB
   489  /*
   490  ** This function is called before modifying the contents of a table
   491  ** to invalidate any incrblob cursors that are open on the
   492  ** row or one of the rows being modified.
   493  **
   494  ** If argument isClearTable is true, then the entire contents of the
   495  ** table is about to be deleted. In this case invalidate all incrblob
   496  ** cursors open on any row within the table with root-page pgnoRoot.
   497  **
   498  ** Otherwise, if argument isClearTable is false, then the row with
   499  ** rowid iRow is being replaced or deleted. In this case invalidate
   500  ** only those incrblob cursors open on that specific row.
   501  */
   502  static void invalidateIncrblobCursors(
   503    Btree *pBtree,          /* The database file to check */
   504    Pgno pgnoRoot,          /* The table that might be changing */
   505    i64 iRow,               /* The rowid that might be changing */
   506    int isClearTable        /* True if all rows are being deleted */
   507  ){
   508    BtCursor *p;
   509    if( pBtree->hasIncrblobCur==0 ) return;
   510    assert( sqlite3BtreeHoldsMutex(pBtree) );
   511    pBtree->hasIncrblobCur = 0;
   512    for(p=pBtree->pBt->pCursor; p; p=p->pNext){
   513      if( (p->curFlags & BTCF_Incrblob)!=0 ){
   514        pBtree->hasIncrblobCur = 1;
   515        if( p->pgnoRoot==pgnoRoot && (isClearTable || p->info.nKey==iRow) ){
   516          p->eState = CURSOR_INVALID;
   517        }
   518      }
   519    }
   520  }
   521  
   522  #else
  /* Stub used when SQLITE_OMIT_INCRBLOB is defined.  The call expands to
  ** nothing, so none of its four arguments are evaluated. */
  #define invalidateIncrblobCursors(w,x,y,z)
   525  #endif /* SQLITE_OMIT_INCRBLOB */
   526  
   527  /*
   528  ** Set bit pgno of the BtShared.pHasContent bitvec. This is called 
   529  ** when a page that previously contained data becomes a free-list leaf 
   530  ** page.
   531  **
   532  ** The BtShared.pHasContent bitvec exists to work around an obscure
   533  ** bug caused by the interaction of two useful IO optimizations surrounding
   534  ** free-list leaf pages:
   535  **
   536  **   1) When all data is deleted from a page and the page becomes
   537  **      a free-list leaf page, the page is not written to the database
   538  **      (as free-list leaf pages contain no meaningful data). Sometimes
   539  **      such a page is not even journalled (as it will not be modified,
   540  **      why bother journalling it?).
   541  **
   542  **   2) When a free-list leaf page is reused, its content is not read
   543  **      from the database or written to the journal file (why should it
   544  **      be, if it is not at all meaningful?).
   545  **
   546  ** By themselves, these optimizations work fine and provide a handy
   547  ** performance boost to bulk delete or insert operations. However, if
   548  ** a page is moved to the free-list and then reused within the same
   549  ** transaction, a problem comes up. If the page is not journalled when
   550  ** it is moved to the free-list and it is also not journalled when it
   551  ** is extracted from the free-list and reused, then the original data
   552  ** may be lost. In the event of a rollback, it may not be possible
   553  ** to restore the database to its original configuration.
   554  **
   555  ** The solution is the BtShared.pHasContent bitvec. Whenever a page is 
   556  ** moved to become a free-list leaf page, the corresponding bit is
   557  ** set in the bitvec. Whenever a leaf page is extracted from the free-list,
   558  ** optimization 2 above is omitted if the corresponding bit is already
   559  ** set in BtShared.pHasContent. The contents of the bitvec are cleared
   560  ** at the end of every transaction.
   561  */
   562  static int btreeSetHasContent(BtShared *pBt, Pgno pgno){
   563    int rc = SQLITE_OK;
   564    if( !pBt->pHasContent ){
   565      assert( pgno<=pBt->nPage );
   566      pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage);
   567      if( !pBt->pHasContent ){
   568        rc = SQLITE_NOMEM_BKPT;
   569      }
   570    }
   571    if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){
   572      rc = sqlite3BitvecSet(pBt->pHasContent, pgno);
   573    }
   574    return rc;
   575  }
   576  
   577  /*
   578  ** Query the BtShared.pHasContent vector.
   579  **
   580  ** This function is called when a free-list leaf page is removed from the
   581  ** free-list for reuse. It returns false if it is safe to retrieve the
   582  ** page from the pager layer with the 'no-content' flag set. True otherwise.
   583  */
   584  static int btreeGetHasContent(BtShared *pBt, Pgno pgno){
   585    Bitvec *p = pBt->pHasContent;
   586    return (p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTest(p, pgno)));
   587  }
   588  
   589  /*
   590  ** Clear (destroy) the BtShared.pHasContent bitvec. This should be
   591  ** invoked at the conclusion of each write-transaction.
   592  */
   593  static void btreeClearHasContent(BtShared *pBt){
   594    sqlite3BitvecDestroy(pBt->pHasContent);
   595    pBt->pHasContent = 0;
   596  }
   597  
   598  /*
   599  ** Release all of the apPage[] pages for a cursor.
   600  */
   601  static void btreeReleaseAllCursorPages(BtCursor *pCur){
   602    int i;
   603    if( pCur->iPage>=0 ){
   604      for(i=0; i<pCur->iPage; i++){
   605        releasePageNotNull(pCur->apPage[i]);
   606      }
   607      releasePageNotNull(pCur->pPage);
   608      pCur->iPage = -1;
   609    }
   610  }
   611  
   612  /*
   613  ** The cursor passed as the only argument must point to a valid entry
   614  ** when this function is called (i.e. have eState==CURSOR_VALID). This
   615  ** function saves the current cursor key in variables pCur->nKey and
   616  ** pCur->pKey. SQLITE_OK is returned if successful or an SQLite error 
   617  ** code otherwise.
   618  **
   619  ** If the cursor is open on an intkey table, then the integer key
   620  ** (the rowid) is stored in pCur->nKey and pCur->pKey is left set to
   621  ** NULL. If the cursor is open on a non-intkey table, then pCur->pKey is 
   622  ** set to point to a malloced buffer pCur->nKey bytes in size containing 
   623  ** the key.
   624  */
   625  static int saveCursorKey(BtCursor *pCur){
   626    int rc = SQLITE_OK;
   627    assert( CURSOR_VALID==pCur->eState );
   628    assert( 0==pCur->pKey );
   629    assert( cursorHoldsMutex(pCur) );
   630  
   631    if( pCur->curIntKey ){
   632      /* Only the rowid is required for a table btree */
   633      pCur->nKey = sqlite3BtreeIntegerKey(pCur);
   634    }else{
   635      /* For an index btree, save the complete key content */
   636      void *pKey;
   637      pCur->nKey = sqlite3BtreePayloadSize(pCur);
   638      pKey = sqlite3Malloc( pCur->nKey );
   639      if( pKey ){
   640        rc = sqlite3BtreePayload(pCur, 0, (int)pCur->nKey, pKey);
   641        if( rc==SQLITE_OK ){
   642          pCur->pKey = pKey;
   643        }else{
   644          sqlite3_free(pKey);
   645        }
   646      }else{
   647        rc = SQLITE_NOMEM_BKPT;
   648      }
   649    }
   650    assert( !pCur->curIntKey || !pCur->pKey );
   651    return rc;
   652  }
   653  
   654  /*
   655  ** Save the current cursor position in the variables BtCursor.nKey 
   656  ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
   657  **
   658  ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID)
   659  ** prior to calling this routine.  
   660  */
   661  static int saveCursorPosition(BtCursor *pCur){
   662    int rc;
   663  
   664    assert( CURSOR_VALID==pCur->eState || CURSOR_SKIPNEXT==pCur->eState );
   665    assert( 0==pCur->pKey );
   666    assert( cursorHoldsMutex(pCur) );
   667  
   668    if( pCur->eState==CURSOR_SKIPNEXT ){
   669      pCur->eState = CURSOR_VALID;
   670    }else{
   671      pCur->skipNext = 0;
   672    }
   673  
   674    rc = saveCursorKey(pCur);
   675    if( rc==SQLITE_OK ){
   676      btreeReleaseAllCursorPages(pCur);
   677      pCur->eState = CURSOR_REQUIRESEEK;
   678    }
   679  
   680    pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl|BTCF_AtLast);
   681    return rc;
   682  }
   683  
/* Forward reference.  saveAllCursors() calls this helper, whose
** definition follows it in the file. */
static int SQLITE_NOINLINE saveCursorsOnList(BtCursor*,Pgno,BtCursor*);
   686  
   687  /*
   688  ** Save the positions of all cursors (except pExcept) that are open on
   689  ** the table with root-page iRoot.  "Saving the cursor position" means that
   690  ** the location in the btree is remembered in such a way that it can be
   691  ** moved back to the same spot after the btree has been modified.  This
   692  ** routine is called just before cursor pExcept is used to modify the
   693  ** table, for example in BtreeDelete() or BtreeInsert().
   694  **
   695  ** If there are two or more cursors on the same btree, then all such 
   696  ** cursors should have their BTCF_Multiple flag set.  The btreeCursor()
   697  ** routine enforces that rule.  This routine only needs to be called in
   698  ** the uncommon case when pExpect has the BTCF_Multiple flag set.
   699  **
   700  ** If pExpect!=NULL and if no other cursors are found on the same root-page,
   701  ** then the BTCF_Multiple flag on pExpect is cleared, to avoid another
   702  ** pointless call to this routine.
   703  **
   704  ** Implementation note:  This routine merely checks to see if any cursors
   705  ** need to be saved.  It calls out to saveCursorsOnList() in the (unusual)
   706  ** event that cursors are in need to being saved.
   707  */
   708  static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
   709    BtCursor *p;
   710    assert( sqlite3_mutex_held(pBt->mutex) );
   711    assert( pExcept==0 || pExcept->pBt==pBt );
   712    for(p=pBt->pCursor; p; p=p->pNext){
   713      if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ) break;
   714    }
   715    if( p ) return saveCursorsOnList(p, iRoot, pExcept);
   716    if( pExcept ) pExcept->curFlags &= ~BTCF_Multiple;
   717    return SQLITE_OK;
   718  }
   719  
   720  /* This helper routine to saveAllCursors does the actual work of saving
   721  ** the cursors if and when a cursor is found that actually requires saving.
   722  ** The common case is that no cursors need to be saved, so this routine is
   723  ** broken out from its caller to avoid unnecessary stack pointer movement.
   724  */
   725  static int SQLITE_NOINLINE saveCursorsOnList(
   726    BtCursor *p,         /* The first cursor that needs saving */
   727    Pgno iRoot,          /* Only save cursor with this iRoot. Save all if zero */
   728    BtCursor *pExcept    /* Do not save this cursor */
   729  ){
   730    do{
   731      if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ){
   732        if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){
   733          int rc = saveCursorPosition(p);
   734          if( SQLITE_OK!=rc ){
   735            return rc;
   736          }
   737        }else{
   738          testcase( p->iPage>=0 );
   739          btreeReleaseAllCursorPages(p);
   740        }
   741      }
   742      p = p->pNext;
   743    }while( p );
   744    return SQLITE_OK;
   745  }
   746  
   747  /*
   748  ** Clear the current cursor position.
   749  */
   750  void sqlite3BtreeClearCursor(BtCursor *pCur){
   751    assert( cursorHoldsMutex(pCur) );
   752    sqlite3_free(pCur->pKey);
   753    pCur->pKey = 0;
   754    pCur->eState = CURSOR_INVALID;
   755  }
   756  
   757  /*
   758  ** In this version of BtreeMoveto, pKey is a packed index record
   759  ** such as is generated by the OP_MakeRecord opcode.  Unpack the
   760  ** record and then call BtreeMovetoUnpacked() to do the work.
   761  */
   762  static int btreeMoveto(
   763    BtCursor *pCur,     /* Cursor open on the btree to be searched */
   764    const void *pKey,   /* Packed key if the btree is an index */
   765    i64 nKey,           /* Integer key for tables.  Size of pKey for indices */
   766    int bias,           /* Bias search to the high end */
   767    int *pRes           /* Write search results here */
   768  ){
   769    int rc;                    /* Status code */
   770    UnpackedRecord *pIdxKey;   /* Unpacked index key */
   771  
   772    if( pKey ){
   773      assert( nKey==(i64)(int)nKey );
   774      pIdxKey = sqlite3VdbeAllocUnpackedRecord(pCur->pKeyInfo);
   775      if( pIdxKey==0 ) return SQLITE_NOMEM_BKPT;
   776      sqlite3VdbeRecordUnpack(pCur->pKeyInfo, (int)nKey, pKey, pIdxKey);
   777      if( pIdxKey->nField==0 ){
   778        rc = SQLITE_CORRUPT_BKPT;
   779        goto moveto_done;
   780      }
   781    }else{
   782      pIdxKey = 0;
   783    }
   784    rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
   785  moveto_done:
   786    if( pIdxKey ){
   787      sqlite3DbFree(pCur->pKeyInfo->db, pIdxKey);
   788    }
   789    return rc;
   790  }
   791  
   792  /*
   793  ** Restore the cursor to the position it was in (or as close to as possible)
   794  ** when saveCursorPosition() was called. Note that this call deletes the 
   795  ** saved position info stored by saveCursorPosition(), so there can be
   796  ** at most one effective restoreCursorPosition() call after each 
   797  ** saveCursorPosition().
   798  */
   799  static int btreeRestoreCursorPosition(BtCursor *pCur){
   800    int rc;
   801    int skipNext;
   802    assert( cursorOwnsBtShared(pCur) );
   803    assert( pCur->eState>=CURSOR_REQUIRESEEK );
   804    if( pCur->eState==CURSOR_FAULT ){
   805      return pCur->skipNext;
   806    }
   807    pCur->eState = CURSOR_INVALID;
   808    rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &skipNext);
   809    if( rc==SQLITE_OK ){
   810      sqlite3_free(pCur->pKey);
   811      pCur->pKey = 0;
   812      assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
   813      pCur->skipNext |= skipNext;
   814      if( pCur->skipNext && pCur->eState==CURSOR_VALID ){
   815        pCur->eState = CURSOR_SKIPNEXT;
   816      }
   817    }
   818    return rc;
   819  }
   820  
   821  #define restoreCursorPosition(p) \
   822    (p->eState>=CURSOR_REQUIRESEEK ? \
   823           btreeRestoreCursorPosition(p) : \
   824           SQLITE_OK)
   825  
   826  /*
   827  ** Determine whether or not a cursor has moved from the position where
   828  ** it was last placed, or has been invalidated for any other reason.
   829  ** Cursors can move when the row they are pointing at is deleted out
   830  ** from under them, for example.  Cursor might also move if a btree
   831  ** is rebalanced.
   832  **
   833  ** Calling this routine with a NULL cursor pointer returns false.
   834  **
   835  ** Use the separate sqlite3BtreeCursorRestore() routine to restore a cursor
   836  ** back to where it ought to be if this routine returns true.
   837  */
   838  int sqlite3BtreeCursorHasMoved(BtCursor *pCur){
   839    return pCur->eState!=CURSOR_VALID;
   840  }
   841  
   842  /*
   843  ** Return a pointer to a fake BtCursor object that will always answer
   844  ** false to the sqlite3BtreeCursorHasMoved() routine above.  The fake
   845  ** cursor returned must not be used with any other Btree interface.
   846  */
   847  BtCursor *sqlite3BtreeFakeValidCursor(void){
   848    static u8 fakeCursor = CURSOR_VALID;
   849    assert( offsetof(BtCursor, eState)==0 );
   850    return (BtCursor*)&fakeCursor;
   851  }
   852  
   853  /*
   854  ** This routine restores a cursor back to its original position after it
   855  ** has been moved by some outside activity (such as a btree rebalance or
   856  ** a row having been deleted out from under the cursor).  
   857  **
   858  ** On success, the *pDifferentRow parameter is false if the cursor is left
   859  ** pointing at exactly the same row.  *pDifferntRow is the row the cursor
   860  ** was pointing to has been deleted, forcing the cursor to point to some
   861  ** nearby row.
   862  **
   863  ** This routine should only be called for a cursor that just returned
   864  ** TRUE from sqlite3BtreeCursorHasMoved().
   865  */
   866  int sqlite3BtreeCursorRestore(BtCursor *pCur, int *pDifferentRow){
   867    int rc;
   868  
   869    assert( pCur!=0 );
   870    assert( pCur->eState!=CURSOR_VALID );
   871    rc = restoreCursorPosition(pCur);
   872    if( rc ){
   873      *pDifferentRow = 1;
   874      return rc;
   875    }
   876    if( pCur->eState!=CURSOR_VALID ){
   877      *pDifferentRow = 1;
   878    }else{
   879      assert( pCur->skipNext==0 );
   880      *pDifferentRow = 0;
   881    }
   882    return SQLITE_OK;
   883  }
   884  
   885  #ifdef SQLITE_ENABLE_CURSOR_HINTS
/*
** Provide hints to the cursor.  The particular hint given (and the type
** and number of the varargs parameters) is determined by the eHintType
** parameter.  See the definitions of the BTREE_HINT_* macros for details.
**
** This implementation has an empty body: every hint is accepted and
** ignored.
*/
void sqlite3BtreeCursorHint(BtCursor *pCur, int eHintType, ...){
  /* Used only by systems that substitute their own storage engine */
}
   894  #endif
   895  
   896  /*
   897  ** Provide flag hints to the cursor.
   898  */
   899  void sqlite3BtreeCursorHintFlags(BtCursor *pCur, unsigned x){
   900    assert( x==BTREE_SEEK_EQ || x==BTREE_BULKLOAD || x==0 );
   901    pCur->hints = x;
   902  }
   903  
   904  
   905  #ifndef SQLITE_OMIT_AUTOVACUUM
   906  /*
   907  ** Given a page number of a regular database page, return the page
   908  ** number for the pointer-map page that contains the entry for the
   909  ** input page number.
   910  **
   911  ** Return 0 (not a valid page) for pgno==1 since there is
   912  ** no pointer map associated with page 1.  The integrity_check logic
   913  ** requires that ptrmapPageno(*,1)!=1.
   914  */
   915  static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
   916    int nPagesPerMapPage;
   917    Pgno iPtrMap, ret;
   918    assert( sqlite3_mutex_held(pBt->mutex) );
   919    if( pgno<2 ) return 0;
   920    nPagesPerMapPage = (pBt->usableSize/5)+1;
   921    iPtrMap = (pgno-2)/nPagesPerMapPage;
   922    ret = (iPtrMap*nPagesPerMapPage) + 2; 
   923    if( ret==PENDING_BYTE_PAGE(pBt) ){
   924      ret++;
   925    }
   926    return ret;
   927  }
   928  
   929  /*
   930  ** Write an entry into the pointer map.
   931  **
   932  ** This routine updates the pointer map entry for page number 'key'
   933  ** so that it maps to type 'eType' and parent page number 'pgno'.
   934  **
   935  ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is
   936  ** a no-op.  If an error occurs, the appropriate error code is written
   937  ** into *pRC.
   938  */
   939  static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){
   940    DbPage *pDbPage;  /* The pointer map page */
   941    u8 *pPtrmap;      /* The pointer map data */
   942    Pgno iPtrmap;     /* The pointer map page number */
   943    int offset;       /* Offset in pointer map page */
   944    int rc;           /* Return code from subfunctions */
   945  
   946    if( *pRC ) return;
   947  
   948    assert( sqlite3_mutex_held(pBt->mutex) );
   949    /* The master-journal page number must never be used as a pointer map page */
   950    assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
   951  
   952    assert( pBt->autoVacuum );
   953    if( key==0 ){
   954      *pRC = SQLITE_CORRUPT_BKPT;
   955      return;
   956    }
   957    iPtrmap = PTRMAP_PAGENO(pBt, key);
   958    rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);
   959    if( rc!=SQLITE_OK ){
   960      *pRC = rc;
   961      return;
   962    }
   963    offset = PTRMAP_PTROFFSET(iPtrmap, key);
   964    if( offset<0 ){
   965      *pRC = SQLITE_CORRUPT_BKPT;
   966      goto ptrmap_exit;
   967    }
   968    assert( offset <= (int)pBt->usableSize-5 );
   969    pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
   970  
   971    if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
   972      TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
   973      *pRC= rc = sqlite3PagerWrite(pDbPage);
   974      if( rc==SQLITE_OK ){
   975        pPtrmap[offset] = eType;
   976        put4byte(&pPtrmap[offset+1], parent);
   977      }
   978    }
   979  
   980  ptrmap_exit:
   981    sqlite3PagerUnref(pDbPage);
   982  }
   983  
   984  /*
   985  ** Read an entry from the pointer map.
   986  **
   987  ** This routine retrieves the pointer map entry for page 'key', writing
   988  ** the type and parent page number to *pEType and *pPgno respectively.
   989  ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
   990  */
   991  static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
   992    DbPage *pDbPage;   /* The pointer map page */
   993    int iPtrmap;       /* Pointer map page index */
   994    u8 *pPtrmap;       /* Pointer map page data */
   995    int offset;        /* Offset of entry in pointer map */
   996    int rc;
   997  
   998    assert( sqlite3_mutex_held(pBt->mutex) );
   999  
  1000    iPtrmap = PTRMAP_PAGENO(pBt, key);
  1001    rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);
  1002    if( rc!=0 ){
  1003      return rc;
  1004    }
  1005    pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
  1006  
  1007    offset = PTRMAP_PTROFFSET(iPtrmap, key);
  1008    if( offset<0 ){
  1009      sqlite3PagerUnref(pDbPage);
  1010      return SQLITE_CORRUPT_BKPT;
  1011    }
  1012    assert( offset <= (int)pBt->usableSize-5 );
  1013    assert( pEType!=0 );
  1014    *pEType = pPtrmap[offset];
  1015    if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
  1016  
  1017    sqlite3PagerUnref(pDbPage);
  1018    if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_PGNO(iPtrmap);
  1019    return SQLITE_OK;
  1020  }
  1021  
  1022  #else /* if defined SQLITE_OMIT_AUTOVACUUM */
  /* No-op stand-ins used when the real pointer-map routines are compiled
  ** out: ptrmapPut() and ptrmapPutOvflPtr() expand to nothing, and
  ** ptrmapGet() always evaluates to SQLITE_OK without touching its
  ** arguments. */
  #define ptrmapPut(w,x,y,z,rc)
  #define ptrmapGet(w,x,y,z) SQLITE_OK
  #define ptrmapPutOvflPtr(x, y, rc)
  1026  #endif
  1027  
/*
** Given a btree page and a cell index (0 means the first cell on
** the page, 1 means the second cell, and so forth) return a pointer
** to the cell content.
**
** findCellPastPtr() does the same except it skips past the initial
** 4-byte child pointer found on interior pages, if there is one.
**
** Both macros read the 2-byte entry for cell I from the cell pointer
** array (P)->aCellIdx[], AND it with (P)->maskPage, and add the result
** to a base pointer: (P)->aData for findCell() and (P)->aDataOfst for
** findCellPastPtr().  P is evaluated more than once, so it must not have
** side effects.
**
** This routine works only for pages that do not contain overflow cells.
*/
#define findCell(P,I) \
  ((P)->aData + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)])))
#define findCellPastPtr(P,I) \
  ((P)->aDataOfst + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)])))
  1042  
  1043  
  1044  /*
  1045  ** This is common tail processing for btreeParseCellPtr() and
  1046  ** btreeParseCellPtrIndex() for the case when the cell does not fit entirely
  1047  ** on a single B-tree page.  Make necessary adjustments to the CellInfo
  1048  ** structure.
  1049  */
  1050  static SQLITE_NOINLINE void btreeParseCellAdjustSizeForOverflow(
  1051    MemPage *pPage,         /* Page containing the cell */
  1052    u8 *pCell,              /* Pointer to the cell text. */
  1053    CellInfo *pInfo         /* Fill in this structure */
  1054  ){
  1055    /* If the payload will not fit completely on the local page, we have
  1056    ** to decide how much to store locally and how much to spill onto
  1057    ** overflow pages.  The strategy is to minimize the amount of unused
  1058    ** space on overflow pages while keeping the amount of local storage
  1059    ** in between minLocal and maxLocal.
  1060    **
  1061    ** Warning:  changing the way overflow payload is distributed in any
  1062    ** way will result in an incompatible file format.
  1063    */
  1064    int minLocal;  /* Minimum amount of payload held locally */
  1065    int maxLocal;  /* Maximum amount of payload held locally */
  1066    int surplus;   /* Overflow payload available for local storage */
  1067  
  1068    minLocal = pPage->minLocal;
  1069    maxLocal = pPage->maxLocal;
  1070    surplus = minLocal + (pInfo->nPayload - minLocal)%(pPage->pBt->usableSize-4);
  1071    testcase( surplus==maxLocal );
  1072    testcase( surplus==maxLocal+1 );
  1073    if( surplus <= maxLocal ){
  1074      pInfo->nLocal = (u16)surplus;
  1075    }else{
  1076      pInfo->nLocal = (u16)minLocal;
  1077    }
  1078    pInfo->nSize = (u16)(&pInfo->pPayload[pInfo->nLocal] - pCell) + 4;
  1079  }
  1080  
  1081  /*
  1082  ** The following routines are implementations of the MemPage.xParseCell()
  1083  ** method.
  1084  **
  1085  ** Parse a cell content block and fill in the CellInfo structure.
  1086  **
  1087  ** btreeParseCellPtr()        =>   table btree leaf nodes
  1088  ** btreeParseCellNoPayload()  =>   table btree internal nodes
  1089  ** btreeParseCellPtrIndex()   =>   index btree nodes
  1090  **
  1091  ** There is also a wrapper function btreeParseCell() that works for
  1092  ** all MemPage types and that references the cell by index rather than
  1093  ** by pointer.
  1094  */
  1095  static void btreeParseCellPtrNoPayload(
  1096    MemPage *pPage,         /* Page containing the cell */
  1097    u8 *pCell,              /* Pointer to the cell text. */
  1098    CellInfo *pInfo         /* Fill in this structure */
  1099  ){
  1100    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  1101    assert( pPage->leaf==0 );
  1102    assert( pPage->childPtrSize==4 );
  1103  #ifndef SQLITE_DEBUG
  1104    UNUSED_PARAMETER(pPage);
  1105  #endif
  1106    pInfo->nSize = 4 + getVarint(&pCell[4], (u64*)&pInfo->nKey);
  1107    pInfo->nPayload = 0;
  1108    pInfo->nLocal = 0;
  1109    pInfo->pPayload = 0;
  1110    return;
  1111  }
  1112  static void btreeParseCellPtr(
  1113    MemPage *pPage,         /* Page containing the cell */
  1114    u8 *pCell,              /* Pointer to the cell text. */
  1115    CellInfo *pInfo         /* Fill in this structure */
  1116  ){
  1117    u8 *pIter;              /* For scanning through pCell */
  1118    u32 nPayload;           /* Number of bytes of cell payload */
  1119    u64 iKey;               /* Extracted Key value */
  1120  
  1121    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  1122    assert( pPage->leaf==0 || pPage->leaf==1 );
  1123    assert( pPage->intKeyLeaf );
  1124    assert( pPage->childPtrSize==0 );
  1125    pIter = pCell;
  1126  
  1127    /* The next block of code is equivalent to:
  1128    **
  1129    **     pIter += getVarint32(pIter, nPayload);
  1130    **
  1131    ** The code is inlined to avoid a function call.
  1132    */
  1133    nPayload = *pIter;
  1134    if( nPayload>=0x80 ){
  1135      u8 *pEnd = &pIter[8];
  1136      nPayload &= 0x7f;
  1137      do{
  1138        nPayload = (nPayload<<7) | (*++pIter & 0x7f);
  1139      }while( (*pIter)>=0x80 && pIter<pEnd );
  1140    }
  1141    pIter++;
  1142  
  1143    /* The next block of code is equivalent to:
  1144    **
  1145    **     pIter += getVarint(pIter, (u64*)&pInfo->nKey);
  1146    **
  1147    ** The code is inlined to avoid a function call.
  1148    */
  1149    iKey = *pIter;
  1150    if( iKey>=0x80 ){
  1151      u8 *pEnd = &pIter[7];
  1152      iKey &= 0x7f;
  1153      while(1){
  1154        iKey = (iKey<<7) | (*++pIter & 0x7f);
  1155        if( (*pIter)<0x80 ) break;
  1156        if( pIter>=pEnd ){
  1157          iKey = (iKey<<8) | *++pIter;
  1158          break;
  1159        }
  1160      }
  1161    }
  1162    pIter++;
  1163  
  1164    pInfo->nKey = *(i64*)&iKey;
  1165    pInfo->nPayload = nPayload;
  1166    pInfo->pPayload = pIter;
  1167    testcase( nPayload==pPage->maxLocal );
  1168    testcase( nPayload==pPage->maxLocal+1 );
  1169    if( nPayload<=pPage->maxLocal ){
  1170      /* This is the (easy) common case where the entire payload fits
  1171      ** on the local page.  No overflow is required.
  1172      */
  1173      pInfo->nSize = nPayload + (u16)(pIter - pCell);
  1174      if( pInfo->nSize<4 ) pInfo->nSize = 4;
  1175      pInfo->nLocal = (u16)nPayload;
  1176    }else{
  1177      btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);
  1178    }
  1179  }
  1180  static void btreeParseCellPtrIndex(
  1181    MemPage *pPage,         /* Page containing the cell */
  1182    u8 *pCell,              /* Pointer to the cell text. */
  1183    CellInfo *pInfo         /* Fill in this structure */
  1184  ){
  1185    u8 *pIter;              /* For scanning through pCell */
  1186    u32 nPayload;           /* Number of bytes of cell payload */
  1187  
  1188    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  1189    assert( pPage->leaf==0 || pPage->leaf==1 );
  1190    assert( pPage->intKeyLeaf==0 );
  1191    pIter = pCell + pPage->childPtrSize;
  1192    nPayload = *pIter;
  1193    if( nPayload>=0x80 ){
  1194      u8 *pEnd = &pIter[8];
  1195      nPayload &= 0x7f;
  1196      do{
  1197        nPayload = (nPayload<<7) | (*++pIter & 0x7f);
  1198      }while( *(pIter)>=0x80 && pIter<pEnd );
  1199    }
  1200    pIter++;
  1201    pInfo->nKey = nPayload;
  1202    pInfo->nPayload = nPayload;
  1203    pInfo->pPayload = pIter;
  1204    testcase( nPayload==pPage->maxLocal );
  1205    testcase( nPayload==pPage->maxLocal+1 );
  1206    if( nPayload<=pPage->maxLocal ){
  1207      /* This is the (easy) common case where the entire payload fits
  1208      ** on the local page.  No overflow is required.
  1209      */
  1210      pInfo->nSize = nPayload + (u16)(pIter - pCell);
  1211      if( pInfo->nSize<4 ) pInfo->nSize = 4;
  1212      pInfo->nLocal = (u16)nPayload;
  1213    }else{
  1214      btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);
  1215    }
  1216  }
  1217  static void btreeParseCell(
  1218    MemPage *pPage,         /* Page containing the cell */
  1219    int iCell,              /* The cell index.  First cell is 0 */
  1220    CellInfo *pInfo         /* Fill in this structure */
  1221  ){
  1222    pPage->xParseCell(pPage, findCell(pPage, iCell), pInfo);
  1223  }
  1224  
  1225  /*
  1226  ** The following routines are implementations of the MemPage.xCellSize
  1227  ** method.
  1228  **
  1229  ** Compute the total number of bytes that a Cell needs in the cell
  1230  ** data area of the btree-page.  The return number includes the cell
  1231  ** data header and the local payload, but not any overflow page or
  1232  ** the space used by the cell pointer.
  1233  **
  1234  ** cellSizePtrNoPayload()    =>   table internal nodes
  1235  ** cellSizePtr()             =>   all index nodes & table leaf nodes
  1236  */
  1237  static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
  1238    u8 *pIter = pCell + pPage->childPtrSize; /* For looping over bytes of pCell */
  1239    u8 *pEnd;                                /* End mark for a varint */
  1240    u32 nSize;                               /* Size value to return */
  1241  
  1242  #ifdef SQLITE_DEBUG
  1243    /* The value returned by this function should always be the same as
  1244    ** the (CellInfo.nSize) value found by doing a full parse of the
  1245    ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
  1246    ** this function verifies that this invariant is not violated. */
  1247    CellInfo debuginfo;
  1248    pPage->xParseCell(pPage, pCell, &debuginfo);
  1249  #endif
  1250  
  1251    nSize = *pIter;
  1252    if( nSize>=0x80 ){
  1253      pEnd = &pIter[8];
  1254      nSize &= 0x7f;
  1255      do{
  1256        nSize = (nSize<<7) | (*++pIter & 0x7f);
  1257      }while( *(pIter)>=0x80 && pIter<pEnd );
  1258    }
  1259    pIter++;
  1260    if( pPage->intKey ){
  1261      /* pIter now points at the 64-bit integer key value, a variable length 
  1262      ** integer. The following block moves pIter to point at the first byte
  1263      ** past the end of the key value. */
  1264      pEnd = &pIter[9];
  1265      while( (*pIter++)&0x80 && pIter<pEnd );
  1266    }
  1267    testcase( nSize==pPage->maxLocal );
  1268    testcase( nSize==pPage->maxLocal+1 );
  1269    if( nSize<=pPage->maxLocal ){
  1270      nSize += (u32)(pIter - pCell);
  1271      if( nSize<4 ) nSize = 4;
  1272    }else{
  1273      int minLocal = pPage->minLocal;
  1274      nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);
  1275      testcase( nSize==pPage->maxLocal );
  1276      testcase( nSize==pPage->maxLocal+1 );
  1277      if( nSize>pPage->maxLocal ){
  1278        nSize = minLocal;
  1279      }
  1280      nSize += 4 + (u16)(pIter - pCell);
  1281    }
  1282    assert( nSize==debuginfo.nSize || CORRUPT_DB );
  1283    return (u16)nSize;
  1284  }
  1285  static u16 cellSizePtrNoPayload(MemPage *pPage, u8 *pCell){
  1286    u8 *pIter = pCell + 4; /* For looping over bytes of pCell */
  1287    u8 *pEnd;              /* End mark for a varint */
  1288  
  1289  #ifdef SQLITE_DEBUG
  1290    /* The value returned by this function should always be the same as
  1291    ** the (CellInfo.nSize) value found by doing a full parse of the
  1292    ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
  1293    ** this function verifies that this invariant is not violated. */
  1294    CellInfo debuginfo;
  1295    pPage->xParseCell(pPage, pCell, &debuginfo);
  1296  #else
  1297    UNUSED_PARAMETER(pPage);
  1298  #endif
  1299  
  1300    assert( pPage->childPtrSize==4 );
  1301    pEnd = pIter + 9;
  1302    while( (*pIter++)&0x80 && pIter<pEnd );
  1303    assert( debuginfo.nSize==(u16)(pIter - pCell) || CORRUPT_DB );
  1304    return (u16)(pIter - pCell);
  1305  }
  1306  
  1307  
  1308  #ifdef SQLITE_DEBUG
  1309  /* This variation on cellSizePtr() is used inside of assert() statements
  1310  ** only. */
  1311  static u16 cellSize(MemPage *pPage, int iCell){
  1312    return pPage->xCellSize(pPage, findCell(pPage, iCell));
  1313  }
  1314  #endif
  1315  
  1316  #ifndef SQLITE_OMIT_AUTOVACUUM
  1317  /*
  1318  ** If the cell pCell, part of page pPage contains a pointer
  1319  ** to an overflow page, insert an entry into the pointer-map
  1320  ** for the overflow page.
  1321  */
  1322  static void ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell, int *pRC){
  1323    CellInfo info;
  1324    if( *pRC ) return;
  1325    assert( pCell!=0 );
  1326    pPage->xParseCell(pPage, pCell, &info);
  1327    if( info.nLocal<info.nPayload ){
  1328      Pgno ovfl = get4byte(&pCell[info.nSize-4]);
  1329      ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC);
  1330    }
  1331  }
  1332  #endif
  1333  
  1334  
  1335  /*
  1336  ** Defragment the page given. This routine reorganizes cells within the
  1337  ** page so that there are no free-blocks on the free-block list.
  1338  **
  1339  ** Parameter nMaxFrag is the maximum amount of fragmented space that may be
  1340  ** present in the page after this routine returns.
  1341  **
  1342  ** EVIDENCE-OF: R-44582-60138 SQLite may from time to time reorganize a
  1343  ** b-tree page so that there are no freeblocks or fragment bytes, all
  1344  ** unused bytes are contained in the unallocated space region, and all
  1345  ** cells are packed tightly at the end of the page.
  1346  */
  1347  static int defragmentPage(MemPage *pPage, int nMaxFrag){
  1348    int i;                     /* Loop counter */
  1349    int pc;                    /* Address of the i-th cell */
  1350    int hdr;                   /* Offset to the page header */
  1351    int size;                  /* Size of a cell */
  1352    int usableSize;            /* Number of usable bytes on a page */
  1353    int cellOffset;            /* Offset to the cell pointer array */
  1354    int cbrk;                  /* Offset to the cell content area */
  1355    int nCell;                 /* Number of cells on the page */
  1356    unsigned char *data;       /* The page data */
  1357    unsigned char *temp;       /* Temp area for cell content */
  1358    unsigned char *src;        /* Source of content */
  1359    int iCellFirst;            /* First allowable cell index */
  1360    int iCellLast;             /* Last possible cell index */
  1361  
  1362    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  1363    assert( pPage->pBt!=0 );
  1364    assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
  1365    assert( pPage->nOverflow==0 );
  1366    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  1367    temp = 0;
  1368    src = data = pPage->aData;
  1369    hdr = pPage->hdrOffset;
  1370    cellOffset = pPage->cellOffset;
  1371    nCell = pPage->nCell;
  1372    assert( nCell==get2byte(&data[hdr+3]) );
  1373    iCellFirst = cellOffset + 2*nCell;
  1374    usableSize = pPage->pBt->usableSize;
  1375  
  1376    /* This block handles pages with two or fewer free blocks and nMaxFrag
  1377    ** or fewer fragmented bytes. In this case it is faster to move the
  1378    ** two (or one) blocks of cells using memmove() and add the required
  1379    ** offsets to each pointer in the cell-pointer array than it is to 
  1380    ** reconstruct the entire page.  */
  1381    if( (int)data[hdr+7]<=nMaxFrag ){
  1382      int iFree = get2byte(&data[hdr+1]);
  1383      if( iFree ){
  1384        int iFree2 = get2byte(&data[iFree]);
  1385  
  1386        /* pageFindSlot() has already verified that free blocks are sorted
  1387        ** in order of offset within the page, and that no block extends
  1388        ** past the end of the page. Provided the two free slots do not 
  1389        ** overlap, this guarantees that the memmove() calls below will not
  1390        ** overwrite the usableSize byte buffer, even if the database page
  1391        ** is corrupt.  */
  1392        assert( iFree2==0 || iFree2>iFree );
  1393        assert( iFree+get2byte(&data[iFree+2]) <= usableSize );
  1394        assert( iFree2==0 || iFree2+get2byte(&data[iFree2+2]) <= usableSize );
  1395  
  1396        if( 0==iFree2 || (data[iFree2]==0 && data[iFree2+1]==0) ){
  1397          u8 *pEnd = &data[cellOffset + nCell*2];
  1398          u8 *pAddr;
  1399          int sz2 = 0;
  1400          int sz = get2byte(&data[iFree+2]);
  1401          int top = get2byte(&data[hdr+5]);
  1402          if( top>=iFree ){
  1403            return SQLITE_CORRUPT_PGNO(pPage->pgno);
  1404          }
  1405          if( iFree2 ){
  1406            assert( iFree+sz<=iFree2 ); /* Verified by pageFindSlot() */
  1407            sz2 = get2byte(&data[iFree2+2]);
  1408            assert( iFree+sz+sz2+iFree2-(iFree+sz) <= usableSize );
  1409            memmove(&data[iFree+sz+sz2], &data[iFree+sz], iFree2-(iFree+sz));
  1410            sz += sz2;
  1411          }
  1412          cbrk = top+sz;
  1413          assert( cbrk+(iFree-top) <= usableSize );
  1414          memmove(&data[cbrk], &data[top], iFree-top);
  1415          for(pAddr=&data[cellOffset]; pAddr<pEnd; pAddr+=2){
  1416            pc = get2byte(pAddr);
  1417            if( pc<iFree ){ put2byte(pAddr, pc+sz); }
  1418            else if( pc<iFree2 ){ put2byte(pAddr, pc+sz2); }
  1419          }
  1420          goto defragment_out;
  1421        }
  1422      }
  1423    }
  1424  
  1425    cbrk = usableSize;
  1426    iCellLast = usableSize - 4;
  1427    for(i=0; i<nCell; i++){
  1428      u8 *pAddr;     /* The i-th cell pointer */
  1429      pAddr = &data[cellOffset + i*2];
  1430      pc = get2byte(pAddr);
  1431      testcase( pc==iCellFirst );
  1432      testcase( pc==iCellLast );
  1433      /* These conditions have already been verified in btreeInitPage()
  1434      ** if PRAGMA cell_size_check=ON.
  1435      */
  1436      if( pc<iCellFirst || pc>iCellLast ){
  1437        return SQLITE_CORRUPT_PGNO(pPage->pgno);
  1438      }
  1439      assert( pc>=iCellFirst && pc<=iCellLast );
  1440      size = pPage->xCellSize(pPage, &src[pc]);
  1441      cbrk -= size;
  1442      if( cbrk<iCellFirst || pc+size>usableSize ){
  1443        return SQLITE_CORRUPT_PGNO(pPage->pgno);
  1444      }
  1445      assert( cbrk+size<=usableSize && cbrk>=iCellFirst );
  1446      testcase( cbrk+size==usableSize );
  1447      testcase( pc+size==usableSize );
  1448      put2byte(pAddr, cbrk);
  1449      if( temp==0 ){
  1450        int x;
  1451        if( cbrk==pc ) continue;
  1452        temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
  1453        x = get2byte(&data[hdr+5]);
  1454        memcpy(&temp[x], &data[x], (cbrk+size) - x);
  1455        src = temp;
  1456      }
  1457      memcpy(&data[cbrk], &src[pc], size);
  1458    }
  1459    data[hdr+7] = 0;
  1460  
  1461   defragment_out:
  1462    if( data[hdr+7]+cbrk-iCellFirst!=pPage->nFree ){
  1463      return SQLITE_CORRUPT_PGNO(pPage->pgno);
  1464    }
  1465    assert( cbrk>=iCellFirst );
  1466    put2byte(&data[hdr+5], cbrk);
  1467    data[hdr+1] = 0;
  1468    data[hdr+2] = 0;
  1469    memset(&data[iCellFirst], 0, cbrk-iCellFirst);
  1470    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  1471    return SQLITE_OK;
  1472  }
  1473  
  1474  /*
  1475  ** Search the free-list on page pPg for space to store a cell nByte bytes in
  1476  ** size. If one can be found, return a pointer to the space and remove it
  1477  ** from the free-list.
  1478  **
  1479  ** If no suitable space can be found on the free-list, return NULL.
  1480  **
  1481  ** This function may detect corruption within pPg.  If corruption is
  1482  ** detected then *pRc is set to SQLITE_CORRUPT and NULL is returned.
  1483  **
  1484  ** Slots on the free list that are between 1 and 3 bytes larger than nByte
  1485  ** will be ignored if adding the extra space to the fragmentation count
  1486  ** causes the fragmentation count to exceed 60.
  1487  */
  1488  static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc){
  1489    const int hdr = pPg->hdrOffset;
  1490    u8 * const aData = pPg->aData;
  1491    int iAddr = hdr + 1;
  1492    int pc = get2byte(&aData[iAddr]);
  1493    int x;
  1494    int usableSize = pPg->pBt->usableSize;
  1495    int size;            /* Size of the free slot */
  1496  
  1497    assert( pc>0 );
  1498    while( pc<=usableSize-4 ){
  1499      /* EVIDENCE-OF: R-22710-53328 The third and fourth bytes of each
  1500      ** freeblock form a big-endian integer which is the size of the freeblock
  1501      ** in bytes, including the 4-byte header. */
  1502      size = get2byte(&aData[pc+2]);
  1503      if( (x = size - nByte)>=0 ){
  1504        testcase( x==4 );
  1505        testcase( x==3 );
  1506        if( size+pc > usableSize ){
  1507          *pRc = SQLITE_CORRUPT_PGNO(pPg->pgno);
  1508          return 0;
  1509        }else if( x<4 ){
  1510          /* EVIDENCE-OF: R-11498-58022 In a well-formed b-tree page, the total
  1511          ** number of bytes in fragments may not exceed 60. */
  1512          if( aData[hdr+7]>57 ) return 0;
  1513  
  1514          /* Remove the slot from the free-list. Update the number of
  1515          ** fragmented bytes within the page. */
  1516          memcpy(&aData[iAddr], &aData[pc], 2);
  1517          aData[hdr+7] += (u8)x;
  1518        }else{
  1519          /* The slot remains on the free-list. Reduce its size to account
  1520           ** for the portion used by the new allocation. */
  1521          put2byte(&aData[pc+2], x);
  1522        }
  1523        return &aData[pc + x];
  1524      }
  1525      iAddr = pc;
  1526      pc = get2byte(&aData[pc]);
  1527      if( pc<iAddr+size ) break;
  1528    }
  1529    if( pc ){
  1530      *pRc = SQLITE_CORRUPT_PGNO(pPg->pgno);
  1531    }
  1532  
  1533    return 0;
  1534  }
  1535  
  1536  /*
  1537  ** Allocate nByte bytes of space from within the B-Tree page passed
  1538  ** as the first argument. Write into *pIdx the index into pPage->aData[]
  1539  ** of the first byte of allocated space. Return either SQLITE_OK or
  1540  ** an error code (usually SQLITE_CORRUPT).
  1541  **
  1542  ** The caller guarantees that there is sufficient space to make the
  1543  ** allocation.  This routine might need to defragment in order to bring
  1544  ** all the space together, however.  This routine will avoid using
  1545  ** the first two bytes past the cell pointer area since presumably this
  1546  ** allocation is being made in order to insert a new cell, so we will
  1547  ** also end up needing a new cell pointer.
  1548  */
  1549  static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
  1550    const int hdr = pPage->hdrOffset;    /* Local cache of pPage->hdrOffset */
  1551    u8 * const data = pPage->aData;      /* Local cache of pPage->aData */
  1552    int top;                             /* First byte of cell content area */
  1553    int rc = SQLITE_OK;                  /* Integer return code */
  1554    int gap;        /* First byte of gap between cell pointers and cell content */
  1555    
  1556    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  1557    assert( pPage->pBt );
  1558    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  1559    assert( nByte>=0 );  /* Minimum cell size is 4 */
  1560    assert( pPage->nFree>=nByte );
  1561    assert( pPage->nOverflow==0 );
  1562    assert( nByte < (int)(pPage->pBt->usableSize-8) );
  1563  
  1564    assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf );
  1565    gap = pPage->cellOffset + 2*pPage->nCell;
  1566    assert( gap<=65536 );
  1567    /* EVIDENCE-OF: R-29356-02391 If the database uses a 65536-byte page size
  1568    ** and the reserved space is zero (the usual value for reserved space)
  1569    ** then the cell content offset of an empty page wants to be 65536.
  1570    ** However, that integer is too large to be stored in a 2-byte unsigned
  1571    ** integer, so a value of 0 is used in its place. */
  1572    top = get2byte(&data[hdr+5]);
  1573    assert( top<=(int)pPage->pBt->usableSize ); /* Prevent by getAndInitPage() */
  1574    if( gap>top ){
  1575      if( top==0 && pPage->pBt->usableSize==65536 ){
  1576        top = 65536;
  1577      }else{
  1578        return SQLITE_CORRUPT_PGNO(pPage->pgno);
  1579      }
  1580    }
  1581  
  1582    /* If there is enough space between gap and top for one more cell pointer
  1583    ** array entry offset, and if the freelist is not empty, then search the
  1584    ** freelist looking for a free slot big enough to satisfy the request.
  1585    */
  1586    testcase( gap+2==top );
  1587    testcase( gap+1==top );
  1588    testcase( gap==top );
  1589    if( (data[hdr+2] || data[hdr+1]) && gap+2<=top ){
  1590      u8 *pSpace = pageFindSlot(pPage, nByte, &rc);
  1591      if( pSpace ){
  1592        assert( pSpace>=data && (pSpace - data)<65536 );
  1593        *pIdx = (int)(pSpace - data);
  1594        return SQLITE_OK;
  1595      }else if( rc ){
  1596        return rc;
  1597      }
  1598    }
  1599  
  1600    /* The request could not be fulfilled using a freelist slot.  Check
  1601    ** to see if defragmentation is necessary.
  1602    */
  1603    testcase( gap+2+nByte==top );
  1604    if( gap+2+nByte>top ){
  1605      assert( pPage->nCell>0 || CORRUPT_DB );
  1606      rc = defragmentPage(pPage, MIN(4, pPage->nFree - (2+nByte)));
  1607      if( rc ) return rc;
  1608      top = get2byteNotZero(&data[hdr+5]);
  1609      assert( gap+2+nByte<=top );
  1610    }
  1611  
  1612  
  1613    /* Allocate memory from the gap in between the cell pointer array
  1614    ** and the cell content area.  The btreeInitPage() call has already
  1615    ** validated the freelist.  Given that the freelist is valid, there
  1616    ** is no way that the allocation can extend off the end of the page.
  1617    ** The assert() below verifies the previous sentence.
  1618    */
  1619    top -= nByte;
  1620    put2byte(&data[hdr+5], top);
  1621    assert( top+nByte <= (int)pPage->pBt->usableSize );
  1622    *pIdx = top;
  1623    return SQLITE_OK;
  1624  }
  1625  
  1626  /*
  1627  ** Return a section of the pPage->aData to the freelist.
  1628  ** The first byte of the new free block is pPage->aData[iStart]
  1629  ** and the size of the block is iSize bytes.
  1630  **
  1631  ** Adjacent freeblocks are coalesced.
  1632  **
  1633  ** Note that even though the freeblock list was checked by btreeInitPage(),
  1634  ** that routine will not detect overlap between cells or freeblocks.  Nor
  1635  ** does it detect cells or freeblocks that encrouch into the reserved bytes
  1636  ** at the end of the page.  So do additional corruption checks inside this
  1637  ** routine and return SQLITE_CORRUPT if any problems are found.
  1638  */
  1639  static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){
  1640    u16 iPtr;                             /* Address of ptr to next freeblock */
  1641    u16 iFreeBlk;                         /* Address of the next freeblock */
  1642    u8 hdr;                               /* Page header size.  0 or 100 */
  1643    u8 nFrag = 0;                         /* Reduction in fragmentation */
  1644    u16 iOrigSize = iSize;                /* Original value of iSize */
  1645    u16 x;                                /* Offset to cell content area */
  1646    u32 iEnd = iStart + iSize;            /* First byte past the iStart buffer */
  1647    unsigned char *data = pPage->aData;   /* Page content */
  1648  
  1649    assert( pPage->pBt!=0 );
  1650    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  1651    assert( CORRUPT_DB || iStart>=pPage->hdrOffset+6+pPage->childPtrSize );
  1652    assert( CORRUPT_DB || iEnd <= pPage->pBt->usableSize );
  1653    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  1654    assert( iSize>=4 );   /* Minimum cell size is 4 */
  1655    assert( iStart<=pPage->pBt->usableSize-4 );
  1656  
  1657    /* The list of freeblocks must be in ascending order.  Find the 
  1658    ** spot on the list where iStart should be inserted.
  1659    */
  1660    hdr = pPage->hdrOffset;
  1661    iPtr = hdr + 1;
  1662    if( data[iPtr+1]==0 && data[iPtr]==0 ){
  1663      iFreeBlk = 0;  /* Shortcut for the case when the freelist is empty */
  1664    }else{
  1665      while( (iFreeBlk = get2byte(&data[iPtr]))<iStart ){
  1666        if( iFreeBlk<iPtr+4 ){
  1667          if( iFreeBlk==0 ) break;
  1668          return SQLITE_CORRUPT_PGNO(pPage->pgno);
  1669        }
  1670        iPtr = iFreeBlk;
  1671      }
  1672      if( iFreeBlk>pPage->pBt->usableSize-4 ){
  1673        return SQLITE_CORRUPT_PGNO(pPage->pgno);
  1674      }
  1675      assert( iFreeBlk>iPtr || iFreeBlk==0 );
  1676    
  1677      /* At this point:
  1678      **    iFreeBlk:   First freeblock after iStart, or zero if none
  1679      **    iPtr:       The address of a pointer to iFreeBlk
  1680      **
  1681      ** Check to see if iFreeBlk should be coalesced onto the end of iStart.
  1682      */
  1683      if( iFreeBlk && iEnd+3>=iFreeBlk ){
  1684        nFrag = iFreeBlk - iEnd;
  1685        if( iEnd>iFreeBlk ) return SQLITE_CORRUPT_PGNO(pPage->pgno);
  1686        iEnd = iFreeBlk + get2byte(&data[iFreeBlk+2]);
  1687        if( iEnd > pPage->pBt->usableSize ){
  1688          return SQLITE_CORRUPT_PGNO(pPage->pgno);
  1689        }
  1690        iSize = iEnd - iStart;
  1691        iFreeBlk = get2byte(&data[iFreeBlk]);
  1692      }
  1693    
  1694      /* If iPtr is another freeblock (that is, if iPtr is not the freelist
  1695      ** pointer in the page header) then check to see if iStart should be
  1696      ** coalesced onto the end of iPtr.
  1697      */
  1698      if( iPtr>hdr+1 ){
  1699        int iPtrEnd = iPtr + get2byte(&data[iPtr+2]);
  1700        if( iPtrEnd+3>=iStart ){
  1701          if( iPtrEnd>iStart ) return SQLITE_CORRUPT_PGNO(pPage->pgno);
  1702          nFrag += iStart - iPtrEnd;
  1703          iSize = iEnd - iPtr;
  1704          iStart = iPtr;
  1705        }
  1706      }
  1707      if( nFrag>data[hdr+7] ) return SQLITE_CORRUPT_PGNO(pPage->pgno);
  1708      data[hdr+7] -= nFrag;
  1709    }
  1710    x = get2byte(&data[hdr+5]);
  1711    if( iStart<=x ){
  1712      /* The new freeblock is at the beginning of the cell content area,
  1713      ** so just extend the cell content area rather than create another
  1714      ** freelist entry */
  1715      if( iStart<x || iPtr!=hdr+1 ) return SQLITE_CORRUPT_PGNO(pPage->pgno);
  1716      put2byte(&data[hdr+1], iFreeBlk);
  1717      put2byte(&data[hdr+5], iEnd);
  1718    }else{
  1719      /* Insert the new freeblock into the freelist */
  1720      put2byte(&data[iPtr], iStart);
  1721    }
  1722    if( pPage->pBt->btsFlags & BTS_FAST_SECURE ){
  1723      /* Overwrite deleted information with zeros when the secure_delete
  1724      ** option is enabled */
  1725      memset(&data[iStart], 0, iSize);
  1726    }
  1727    put2byte(&data[iStart], iFreeBlk);
  1728    put2byte(&data[iStart+2], iSize);
  1729    pPage->nFree += iOrigSize;
  1730    return SQLITE_OK;
  1731  }
  1732  
  1733  /*
  1734  ** Decode the flags byte (the first byte of the header) for a page
  1735  ** and initialize fields of the MemPage structure accordingly.
  1736  **
  1737  ** Only the following combinations are supported.  Anything different
  1738  ** indicates a corrupt database files:
  1739  **
  1740  **         PTF_ZERODATA
  1741  **         PTF_ZERODATA | PTF_LEAF
  1742  **         PTF_LEAFDATA | PTF_INTKEY
  1743  **         PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
  1744  */
  1745  static int decodeFlags(MemPage *pPage, int flagByte){
  1746    BtShared *pBt;     /* A copy of pPage->pBt */
  1747  
  1748    assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
  1749    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  1750    pPage->leaf = (u8)(flagByte>>3);  assert( PTF_LEAF == 1<<3 );
  1751    flagByte &= ~PTF_LEAF;
  1752    pPage->childPtrSize = 4-4*pPage->leaf;
  1753    pPage->xCellSize = cellSizePtr;
  1754    pBt = pPage->pBt;
  1755    if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
  1756      /* EVIDENCE-OF: R-07291-35328 A value of 5 (0x05) means the page is an
  1757      ** interior table b-tree page. */
  1758      assert( (PTF_LEAFDATA|PTF_INTKEY)==5 );
  1759      /* EVIDENCE-OF: R-26900-09176 A value of 13 (0x0d) means the page is a
  1760      ** leaf table b-tree page. */
  1761      assert( (PTF_LEAFDATA|PTF_INTKEY|PTF_LEAF)==13 );
  1762      pPage->intKey = 1;
  1763      if( pPage->leaf ){
  1764        pPage->intKeyLeaf = 1;
  1765        pPage->xParseCell = btreeParseCellPtr;
  1766      }else{
  1767        pPage->intKeyLeaf = 0;
  1768        pPage->xCellSize = cellSizePtrNoPayload;
  1769        pPage->xParseCell = btreeParseCellPtrNoPayload;
  1770      }
  1771      pPage->maxLocal = pBt->maxLeaf;
  1772      pPage->minLocal = pBt->minLeaf;
  1773    }else if( flagByte==PTF_ZERODATA ){
  1774      /* EVIDENCE-OF: R-43316-37308 A value of 2 (0x02) means the page is an
  1775      ** interior index b-tree page. */
  1776      assert( (PTF_ZERODATA)==2 );
  1777      /* EVIDENCE-OF: R-59615-42828 A value of 10 (0x0a) means the page is a
  1778      ** leaf index b-tree page. */
  1779      assert( (PTF_ZERODATA|PTF_LEAF)==10 );
  1780      pPage->intKey = 0;
  1781      pPage->intKeyLeaf = 0;
  1782      pPage->xParseCell = btreeParseCellPtrIndex;
  1783      pPage->maxLocal = pBt->maxLocal;
  1784      pPage->minLocal = pBt->minLocal;
  1785    }else{
  1786      /* EVIDENCE-OF: R-47608-56469 Any other value for the b-tree page type is
  1787      ** an error. */
  1788      return SQLITE_CORRUPT_PGNO(pPage->pgno);
  1789    }
  1790    pPage->max1bytePayload = pBt->max1bytePayload;
  1791    return SQLITE_OK;
  1792  }
  1793  
  1794  /*
  1795  ** Initialize the auxiliary information for a disk block.
  1796  **
  1797  ** Return SQLITE_OK on success.  If we see that the page does
  1798  ** not contain a well-formed database page, then return 
  1799  ** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
  1800  ** guarantee that the page is well-formed.  It only shows that
  1801  ** we failed to detect any corruption.
  1802  */
  1803  static int btreeInitPage(MemPage *pPage){
  1804    int pc;            /* Address of a freeblock within pPage->aData[] */
  1805    u8 hdr;            /* Offset to beginning of page header */
  1806    u8 *data;          /* Equal to pPage->aData */
  1807    BtShared *pBt;        /* The main btree structure */
  1808    int usableSize;    /* Amount of usable space on each page */
  1809    u16 cellOffset;    /* Offset from start of page to first cell pointer */
  1810    int nFree;         /* Number of unused bytes on the page */
  1811    int top;           /* First byte of the cell content area */
  1812    int iCellFirst;    /* First allowable cell or freeblock offset */
  1813    int iCellLast;     /* Last possible cell or freeblock offset */
  1814  
  1815    assert( pPage->pBt!=0 );
  1816    assert( pPage->pBt->db!=0 );
  1817    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  1818    assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
  1819    assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
  1820    assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
  1821    assert( pPage->isInit==0 );
  1822  
  1823    pBt = pPage->pBt;
  1824    hdr = pPage->hdrOffset;
  1825    data = pPage->aData;
  1826    /* EVIDENCE-OF: R-28594-02890 The one-byte flag at offset 0 indicating
  1827    ** the b-tree page type. */
  1828    if( decodeFlags(pPage, data[hdr]) ){
  1829      return SQLITE_CORRUPT_PGNO(pPage->pgno);
  1830    }
  1831    assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
  1832    pPage->maskPage = (u16)(pBt->pageSize - 1);
  1833    pPage->nOverflow = 0;
  1834    usableSize = pBt->usableSize;
  1835    pPage->cellOffset = cellOffset = hdr + 8 + pPage->childPtrSize;
  1836    pPage->aDataEnd = &data[usableSize];
  1837    pPage->aCellIdx = &data[cellOffset];
  1838    pPage->aDataOfst = &data[pPage->childPtrSize];
  1839    /* EVIDENCE-OF: R-58015-48175 The two-byte integer at offset 5 designates
  1840    ** the start of the cell content area. A zero value for this integer is
  1841    ** interpreted as 65536. */
  1842    top = get2byteNotZero(&data[hdr+5]);
  1843    /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
  1844    ** number of cells on the page. */
  1845    pPage->nCell = get2byte(&data[hdr+3]);
  1846    if( pPage->nCell>MX_CELL(pBt) ){
  1847      /* To many cells for a single page.  The page must be corrupt */
  1848      return SQLITE_CORRUPT_PGNO(pPage->pgno);
  1849    }
  1850    testcase( pPage->nCell==MX_CELL(pBt) );
  1851    /* EVIDENCE-OF: R-24089-57979 If a page contains no cells (which is only
  1852    ** possible for a root page of a table that contains no rows) then the
  1853    ** offset to the cell content area will equal the page size minus the
  1854    ** bytes of reserved space. */
  1855    assert( pPage->nCell>0 || top==usableSize || CORRUPT_DB );
  1856  
  1857    /* A malformed database page might cause us to read past the end
  1858    ** of page when parsing a cell.  
  1859    **
  1860    ** The following block of code checks early to see if a cell extends
  1861    ** past the end of a page boundary and causes SQLITE_CORRUPT to be 
  1862    ** returned if it does.
  1863    */
  1864    iCellFirst = cellOffset + 2*pPage->nCell;
  1865    iCellLast = usableSize - 4;
  1866    if( pBt->db->flags & SQLITE_CellSizeCk ){
  1867      int i;            /* Index into the cell pointer array */
  1868      int sz;           /* Size of a cell */
  1869  
  1870      if( !pPage->leaf ) iCellLast--;
  1871      for(i=0; i<pPage->nCell; i++){
  1872        pc = get2byteAligned(&data[cellOffset+i*2]);
  1873        testcase( pc==iCellFirst );
  1874        testcase( pc==iCellLast );
  1875        if( pc<iCellFirst || pc>iCellLast ){
  1876          return SQLITE_CORRUPT_PGNO(pPage->pgno);
  1877        }
  1878        sz = pPage->xCellSize(pPage, &data[pc]);
  1879        testcase( pc+sz==usableSize );
  1880        if( pc+sz>usableSize ){
  1881          return SQLITE_CORRUPT_PGNO(pPage->pgno);
  1882        }
  1883      }
  1884      if( !pPage->leaf ) iCellLast++;
  1885    }  
  1886  
  1887    /* Compute the total free space on the page
  1888    ** EVIDENCE-OF: R-23588-34450 The two-byte integer at offset 1 gives the
  1889    ** start of the first freeblock on the page, or is zero if there are no
  1890    ** freeblocks. */
  1891    pc = get2byte(&data[hdr+1]);
  1892    nFree = data[hdr+7] + top;  /* Init nFree to non-freeblock free space */
  1893    if( pc>0 ){
  1894      u32 next, size;
  1895      if( pc<iCellFirst ){
  1896        /* EVIDENCE-OF: R-55530-52930 In a well-formed b-tree page, there will
  1897        ** always be at least one cell before the first freeblock.
  1898        */
  1899        return SQLITE_CORRUPT_PGNO(pPage->pgno); 
  1900      }
  1901      while( 1 ){
  1902        if( pc>iCellLast ){
  1903          /* Freeblock off the end of the page */
  1904          return SQLITE_CORRUPT_PGNO(pPage->pgno);
  1905        }
  1906        next = get2byte(&data[pc]);
  1907        size = get2byte(&data[pc+2]);
  1908        nFree = nFree + size;
  1909        if( next<=pc+size+3 ) break;
  1910        pc = next;
  1911      }
  1912      if( next>0 ){
  1913        /* Freeblock not in ascending order */
  1914        return SQLITE_CORRUPT_PGNO(pPage->pgno);
  1915      }
  1916      if( pc+size>(unsigned int)usableSize ){
  1917        /* Last freeblock extends past page end */
  1918        return SQLITE_CORRUPT_PGNO(pPage->pgno);
  1919      }
  1920    }
  1921  
  1922    /* At this point, nFree contains the sum of the offset to the start
  1923    ** of the cell-content area plus the number of free bytes within
  1924    ** the cell-content area. If this is greater than the usable-size
  1925    ** of the page, then the page must be corrupted. This check also
  1926    ** serves to verify that the offset to the start of the cell-content
  1927    ** area, according to the page header, lies within the page.
  1928    */
  1929    if( nFree>usableSize ){
  1930      return SQLITE_CORRUPT_PGNO(pPage->pgno);
  1931    }
  1932    pPage->nFree = (u16)(nFree - iCellFirst);
  1933    pPage->isInit = 1;
  1934    return SQLITE_OK;
  1935  }
  1936  
  1937  /*
  1938  ** Set up a raw page so that it looks like a database page holding
  1939  ** no entries.
  1940  */
  1941  static void zeroPage(MemPage *pPage, int flags){
  1942    unsigned char *data = pPage->aData;
  1943    BtShared *pBt = pPage->pBt;
  1944    u8 hdr = pPage->hdrOffset;
  1945    u16 first;
  1946  
  1947    assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
  1948    assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
  1949    assert( sqlite3PagerGetData(pPage->pDbPage) == data );
  1950    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  1951    assert( sqlite3_mutex_held(pBt->mutex) );
  1952    if( pBt->btsFlags & BTS_FAST_SECURE ){
  1953      memset(&data[hdr], 0, pBt->usableSize - hdr);
  1954    }
  1955    data[hdr] = (char)flags;
  1956    first = hdr + ((flags&PTF_LEAF)==0 ? 12 : 8);
  1957    memset(&data[hdr+1], 0, 4);
  1958    data[hdr+7] = 0;
  1959    put2byte(&data[hdr+5], pBt->usableSize);
  1960    pPage->nFree = (u16)(pBt->usableSize - first);
  1961    decodeFlags(pPage, flags);
  1962    pPage->cellOffset = first;
  1963    pPage->aDataEnd = &data[pBt->usableSize];
  1964    pPage->aCellIdx = &data[first];
  1965    pPage->aDataOfst = &data[pPage->childPtrSize];
  1966    pPage->nOverflow = 0;
  1967    assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
  1968    pPage->maskPage = (u16)(pBt->pageSize - 1);
  1969    pPage->nCell = 0;
  1970    pPage->isInit = 1;
  1971  }
  1972  
  1973  
  1974  /*
  1975  ** Convert a DbPage obtained from the pager into a MemPage used by
  1976  ** the btree layer.
  1977  */
  1978  static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
  1979    MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
  1980    if( pgno!=pPage->pgno ){
  1981      pPage->aData = sqlite3PagerGetData(pDbPage);
  1982      pPage->pDbPage = pDbPage;
  1983      pPage->pBt = pBt;
  1984      pPage->pgno = pgno;
  1985      pPage->hdrOffset = pgno==1 ? 100 : 0;
  1986    }
  1987    assert( pPage->aData==sqlite3PagerGetData(pDbPage) );
  1988    return pPage; 
  1989  }
  1990  
  1991  /*
  1992  ** Get a page from the pager.  Initialize the MemPage.pBt and
  1993  ** MemPage.aData elements if needed.  See also: btreeGetUnusedPage().
  1994  **
  1995  ** If the PAGER_GET_NOCONTENT flag is set, it means that we do not care
  1996  ** about the content of the page at this time.  So do not go to the disk
  1997  ** to fetch the content.  Just fill in the content with zeros for now.
  1998  ** If in the future we call sqlite3PagerWrite() on this page, that
  1999  ** means we have started to be concerned about content and the disk
  2000  ** read should occur at that point.
  2001  */
  2002  static int btreeGetPage(
  2003    BtShared *pBt,       /* The btree */
  2004    Pgno pgno,           /* Number of the page to fetch */
  2005    MemPage **ppPage,    /* Return the page in this parameter */
  2006    int flags            /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
  2007  ){
  2008    int rc;
  2009    DbPage *pDbPage;
  2010  
  2011    assert( flags==0 || flags==PAGER_GET_NOCONTENT || flags==PAGER_GET_READONLY );
  2012    assert( sqlite3_mutex_held(pBt->mutex) );
  2013    rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, flags);
  2014    if( rc ) return rc;
  2015    *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
  2016    return SQLITE_OK;
  2017  }
  2018  
  2019  /*
  2020  ** Retrieve a page from the pager cache. If the requested page is not
  2021  ** already in the pager cache return NULL. Initialize the MemPage.pBt and
  2022  ** MemPage.aData elements if needed.
  2023  */
  2024  static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){
  2025    DbPage *pDbPage;
  2026    assert( sqlite3_mutex_held(pBt->mutex) );
  2027    pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
  2028    if( pDbPage ){
  2029      return btreePageFromDbPage(pDbPage, pgno, pBt);
  2030    }
  2031    return 0;
  2032  }
  2033  
  2034  /*
  2035  ** Return the size of the database file in pages. If there is any kind of
  2036  ** error, return ((unsigned int)-1).
  2037  */
  2038  static Pgno btreePagecount(BtShared *pBt){
  2039    return pBt->nPage;
  2040  }
  2041  u32 sqlite3BtreeLastPage(Btree *p){
  2042    assert( sqlite3BtreeHoldsMutex(p) );
  2043    assert( ((p->pBt->nPage)&0x80000000)==0 );
  2044    return btreePagecount(p->pBt);
  2045  }
  2046  
  2047  /*
  2048  ** Get a page from the pager and initialize it.
  2049  **
  2050  ** If pCur!=0 then the page is being fetched as part of a moveToChild()
  2051  ** call.  Do additional sanity checking on the page in this case.
  2052  ** And if the fetch fails, this routine must decrement pCur->iPage.
  2053  **
  2054  ** The page is fetched as read-write unless pCur is not NULL and is
  2055  ** a read-only cursor.
  2056  **
  2057  ** If an error occurs, then *ppPage is undefined. It
  2058  ** may remain unchanged, or it may be set to an invalid value.
  2059  */
  2060  static int getAndInitPage(
  2061    BtShared *pBt,                  /* The database file */
  2062    Pgno pgno,                      /* Number of the page to get */
  2063    MemPage **ppPage,               /* Write the page pointer here */
  2064    BtCursor *pCur,                 /* Cursor to receive the page, or NULL */
  2065    int bReadOnly                   /* True for a read-only page */
  2066  ){
  2067    int rc;
  2068    DbPage *pDbPage;
  2069    assert( sqlite3_mutex_held(pBt->mutex) );
  2070    assert( pCur==0 || ppPage==&pCur->pPage );
  2071    assert( pCur==0 || bReadOnly==pCur->curPagerFlags );
  2072    assert( pCur==0 || pCur->iPage>0 );
  2073  
  2074    if( pgno>btreePagecount(pBt) ){
  2075      rc = SQLITE_CORRUPT_BKPT;
  2076      goto getAndInitPage_error;
  2077    }
  2078    rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, bReadOnly);
  2079    if( rc ){
  2080      goto getAndInitPage_error;
  2081    }
  2082    *ppPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
  2083    if( (*ppPage)->isInit==0 ){
  2084      btreePageFromDbPage(pDbPage, pgno, pBt);
  2085      rc = btreeInitPage(*ppPage);
  2086      if( rc!=SQLITE_OK ){
  2087        releasePage(*ppPage);
  2088        goto getAndInitPage_error;
  2089      }
  2090    }
  2091    assert( (*ppPage)->pgno==pgno );
  2092    assert( (*ppPage)->aData==sqlite3PagerGetData(pDbPage) );
  2093  
  2094    /* If obtaining a child page for a cursor, we must verify that the page is
  2095    ** compatible with the root page. */
  2096    if( pCur && ((*ppPage)->nCell<1 || (*ppPage)->intKey!=pCur->curIntKey) ){
  2097      rc = SQLITE_CORRUPT_PGNO(pgno);
  2098      releasePage(*ppPage);
  2099      goto getAndInitPage_error;
  2100    }
  2101    return SQLITE_OK;
  2102  
  2103  getAndInitPage_error:
  2104    if( pCur ){
  2105      pCur->iPage--;
  2106      pCur->pPage = pCur->apPage[pCur->iPage];
  2107    }
  2108    testcase( pgno==0 );
  2109    assert( pgno!=0 || rc==SQLITE_CORRUPT );
  2110    return rc;
  2111  }
  2112  
  2113  /*
  2114  ** Release a MemPage.  This should be called once for each prior
  2115  ** call to btreeGetPage.
  2116  **
  2117  ** Page1 is a special case and must be released using releasePageOne().
  2118  */
  2119  static void releasePageNotNull(MemPage *pPage){
  2120    assert( pPage->aData );
  2121    assert( pPage->pBt );
  2122    assert( pPage->pDbPage!=0 );
  2123    assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
  2124    assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
  2125    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  2126    sqlite3PagerUnrefNotNull(pPage->pDbPage);
  2127  }
  2128  static void releasePage(MemPage *pPage){
  2129    if( pPage ) releasePageNotNull(pPage);
  2130  }
  2131  static void releasePageOne(MemPage *pPage){
  2132    assert( pPage!=0 );
  2133    assert( pPage->aData );
  2134    assert( pPage->pBt );
  2135    assert( pPage->pDbPage!=0 );
  2136    assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
  2137    assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
  2138    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  2139    sqlite3PagerUnrefPageOne(pPage->pDbPage);
  2140  }
  2141  
  2142  /*
  2143  ** Get an unused page.
  2144  **
  2145  ** This works just like btreeGetPage() with the addition:
  2146  **
  2147  **   *  If the page is already in use for some other purpose, immediately
  2148  **      release it and return an SQLITE_CURRUPT error.
  2149  **   *  Make sure the isInit flag is clear
  2150  */
  2151  static int btreeGetUnusedPage(
  2152    BtShared *pBt,       /* The btree */
  2153    Pgno pgno,           /* Number of the page to fetch */
  2154    MemPage **ppPage,    /* Return the page in this parameter */
  2155    int flags            /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
  2156  ){
  2157    int rc = btreeGetPage(pBt, pgno, ppPage, flags);
  2158    if( rc==SQLITE_OK ){
  2159      if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
  2160        releasePage(*ppPage);
  2161        *ppPage = 0;
  2162        return SQLITE_CORRUPT_BKPT;
  2163      }
  2164      (*ppPage)->isInit = 0;
  2165    }else{
  2166      *ppPage = 0;
  2167    }
  2168    return rc;
  2169  }
  2170  
  2171  
  2172  /*
  2173  ** During a rollback, when the pager reloads information into the cache
  2174  ** so that the cache is restored to its original state at the start of
  2175  ** the transaction, for each page restored this routine is called.
  2176  **
  2177  ** This routine needs to reset the extra data section at the end of the
  2178  ** page to agree with the restored data.
  2179  */
  2180  static void pageReinit(DbPage *pData){
  2181    MemPage *pPage;
  2182    pPage = (MemPage *)sqlite3PagerGetExtra(pData);
  2183    assert( sqlite3PagerPageRefcount(pData)>0 );
  2184    if( pPage->isInit ){
  2185      assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  2186      pPage->isInit = 0;
  2187      if( sqlite3PagerPageRefcount(pData)>1 ){
  2188        /* pPage might not be a btree page;  it might be an overflow page
  2189        ** or ptrmap page or a free page.  In those cases, the following
  2190        ** call to btreeInitPage() will likely return SQLITE_CORRUPT.
  2191        ** But no harm is done by this.  And it is very important that
  2192        ** btreeInitPage() be called on every btree page so we make
  2193        ** the call for every page that comes in for re-initing. */
  2194        btreeInitPage(pPage);
  2195      }
  2196    }
  2197  }
  2198  
  2199  /*
  2200  ** Invoke the busy handler for a btree.
  2201  */
  2202  static int btreeInvokeBusyHandler(void *pArg){
  2203    BtShared *pBt = (BtShared*)pArg;
  2204    assert( pBt->db );
  2205    assert( sqlite3_mutex_held(pBt->db->mutex) );
  2206    return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
  2207  }
  2208  
  2209  /*
  2210  ** Open a database file.
  2211  ** 
  2212  ** zFilename is the name of the database file.  If zFilename is NULL
  2213  ** then an ephemeral database is created.  The ephemeral database might
  2214  ** be exclusively in memory, or it might use a disk-based memory cache.
  2215  ** Either way, the ephemeral database will be automatically deleted 
  2216  ** when sqlite3BtreeClose() is called.
  2217  **
  2218  ** If zFilename is ":memory:" then an in-memory database is created
  2219  ** that is automatically destroyed when it is closed.
  2220  **
  2221  ** The "flags" parameter is a bitmask that might contain bits like
  2222  ** BTREE_OMIT_JOURNAL and/or BTREE_MEMORY.
  2223  **
  2224  ** If the database is already opened in the same database connection
  2225  ** and we are in shared cache mode, then the open will fail with an
  2226  ** SQLITE_CONSTRAINT error.  We cannot allow two or more BtShared
  2227  ** objects in the same database connection since doing so will lead
  2228  ** to problems with locking.
  2229  */
  2230  int sqlite3BtreeOpen(
  2231    sqlite3_vfs *pVfs,      /* VFS to use for this b-tree */
  2232    const char *zFilename,  /* Name of the file containing the BTree database */
  2233    sqlite3 *db,            /* Associated database handle */
  2234    Btree **ppBtree,        /* Pointer to new Btree object written here */
  2235    int flags,              /* Options */
  2236    int vfsFlags            /* Flags passed through to sqlite3_vfs.xOpen() */
  2237  ){
  2238    BtShared *pBt = 0;             /* Shared part of btree structure */
  2239    Btree *p;                      /* Handle to return */
  2240    sqlite3_mutex *mutexOpen = 0;  /* Prevents a race condition. Ticket #3537 */
  2241    int rc = SQLITE_OK;            /* Result code from this function */
  2242    u8 nReserve;                   /* Byte of unused space on each page */
  2243    unsigned char zDbHeader[100];  /* Database header content */
  2244  
  2245    /* True if opening an ephemeral, temporary database */
  2246    const int isTempDb = zFilename==0 || zFilename[0]==0;
  2247  
  2248    /* Set the variable isMemdb to true for an in-memory database, or 
  2249    ** false for a file-based database.
  2250    */
  2251  #ifdef SQLITE_OMIT_MEMORYDB
  2252    const int isMemdb = 0;
  2253  #else
  2254    const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0)
  2255                         || (isTempDb && sqlite3TempInMemory(db))
  2256                         || (vfsFlags & SQLITE_OPEN_MEMORY)!=0;
  2257  #endif
  2258  
  2259    assert( db!=0 );
  2260    assert( pVfs!=0 );
  2261    assert( sqlite3_mutex_held(db->mutex) );
  2262    assert( (flags&0xff)==flags );   /* flags fit in 8 bits */
  2263  
  2264    /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */
  2265    assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 );
  2266  
  2267    /* A BTREE_SINGLE database is always a temporary and/or ephemeral */
  2268    assert( (flags & BTREE_SINGLE)==0 || isTempDb );
  2269  
  2270    if( isMemdb ){
  2271      flags |= BTREE_MEMORY;
  2272    }
  2273    if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){
  2274      vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB;
  2275    }
  2276    p = sqlite3MallocZero(sizeof(Btree));
  2277    if( !p ){
  2278      return SQLITE_NOMEM_BKPT;
  2279    }
  2280    p->inTrans = TRANS_NONE;
  2281    p->db = db;
  2282  #ifndef SQLITE_OMIT_SHARED_CACHE
  2283    p->lock.pBtree = p;
  2284    p->lock.iTable = 1;
  2285  #endif
  2286  
  2287  #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
  2288    /*
  2289    ** If this Btree is a candidate for shared cache, try to find an
  2290    ** existing BtShared object that we can share with
  2291    */
  2292    if( isTempDb==0 && (isMemdb==0 || (vfsFlags&SQLITE_OPEN_URI)!=0) ){
  2293      if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){
  2294        int nFilename = sqlite3Strlen30(zFilename)+1;
  2295        int nFullPathname = pVfs->mxPathname+1;
  2296        char *zFullPathname = sqlite3Malloc(MAX(nFullPathname,nFilename));
  2297        MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
  2298  
  2299        p->sharable = 1;
  2300        if( !zFullPathname ){
  2301          sqlite3_free(p);
  2302          return SQLITE_NOMEM_BKPT;
  2303        }
  2304        if( isMemdb ){
  2305          memcpy(zFullPathname, zFilename, nFilename);
  2306        }else{
  2307          rc = sqlite3OsFullPathname(pVfs, zFilename,
  2308                                     nFullPathname, zFullPathname);
  2309          if( rc ){
  2310            sqlite3_free(zFullPathname);
  2311            sqlite3_free(p);
  2312            return rc;
  2313          }
  2314        }
  2315  #if SQLITE_THREADSAFE
  2316        mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
  2317        sqlite3_mutex_enter(mutexOpen);
  2318        mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
  2319        sqlite3_mutex_enter(mutexShared);
  2320  #endif
  2321        for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
  2322          assert( pBt->nRef>0 );
  2323          if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager, 0))
  2324                   && sqlite3PagerVfs(pBt->pPager)==pVfs ){
  2325            int iDb;
  2326            for(iDb=db->nDb-1; iDb>=0; iDb--){
  2327              Btree *pExisting = db->aDb[iDb].pBt;
  2328              if( pExisting && pExisting->pBt==pBt ){
  2329                sqlite3_mutex_leave(mutexShared);
  2330                sqlite3_mutex_leave(mutexOpen);
  2331                sqlite3_free(zFullPathname);
  2332                sqlite3_free(p);
  2333                return SQLITE_CONSTRAINT;
  2334              }
  2335            }
  2336            p->pBt = pBt;
  2337            pBt->nRef++;
  2338            break;
  2339          }
  2340        }
  2341        sqlite3_mutex_leave(mutexShared);
  2342        sqlite3_free(zFullPathname);
  2343      }
  2344  #ifdef SQLITE_DEBUG
  2345      else{
  2346        /* In debug mode, we mark all persistent databases as sharable
  2347        ** even when they are not.  This exercises the locking code and
  2348        ** gives more opportunity for asserts(sqlite3_mutex_held())
  2349        ** statements to find locking problems.
  2350        */
  2351        p->sharable = 1;
  2352      }
  2353  #endif
  2354    }
  2355  #endif
  2356    if( pBt==0 ){
  2357      /*
  2358      ** The following asserts make sure that structures used by the btree are
  2359      ** the right size.  This is to guard against size changes that result
  2360      ** when compiling on a different architecture.
  2361      */
  2362      assert( sizeof(i64)==8 );
  2363      assert( sizeof(u64)==8 );
  2364      assert( sizeof(u32)==4 );
  2365      assert( sizeof(u16)==2 );
  2366      assert( sizeof(Pgno)==4 );
  2367    
  2368      pBt = sqlite3MallocZero( sizeof(*pBt) );
  2369      if( pBt==0 ){
  2370        rc = SQLITE_NOMEM_BKPT;
  2371        goto btree_open_out;
  2372      }
  2373      rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
  2374                            sizeof(MemPage), flags, vfsFlags, pageReinit);
  2375      if( rc==SQLITE_OK ){
  2376        sqlite3PagerSetMmapLimit(pBt->pPager, db->szMmap);
  2377        rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
  2378      }
  2379      if( rc!=SQLITE_OK ){
  2380        goto btree_open_out;
  2381      }
  2382      pBt->openFlags = (u8)flags;
  2383      pBt->db = db;
  2384      sqlite3PagerSetBusyhandler(pBt->pPager, btreeInvokeBusyHandler, pBt);
  2385      p->pBt = pBt;
  2386    
  2387      pBt->pCursor = 0;
  2388      pBt->pPage1 = 0;
  2389      if( sqlite3PagerIsreadonly(pBt->pPager) ) pBt->btsFlags |= BTS_READ_ONLY;
  2390  #if defined(SQLITE_SECURE_DELETE)
  2391      pBt->btsFlags |= BTS_SECURE_DELETE;
  2392  #elif defined(SQLITE_FAST_SECURE_DELETE)
  2393      pBt->btsFlags |= BTS_OVERWRITE;
  2394  #endif
  2395      /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
  2396      ** determined by the 2-byte integer located at an offset of 16 bytes from
  2397      ** the beginning of the database file. */
  2398      pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16);
  2399      if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
  2400           || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
  2401        pBt->pageSize = 0;
  2402  #ifndef SQLITE_OMIT_AUTOVACUUM
  2403        /* If the magic name ":memory:" will create an in-memory database, then
  2404        ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
  2405        ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
  2406        ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
  2407        ** regular file-name. In this case the auto-vacuum applies as per normal.
  2408        */
  2409        if( zFilename && !isMemdb ){
  2410          pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
  2411          pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
  2412        }
  2413  #endif
  2414        nReserve = 0;
  2415      }else{
  2416        /* EVIDENCE-OF: R-37497-42412 The size of the reserved region is
  2417        ** determined by the one-byte unsigned integer found at an offset of 20
  2418        ** into the database file header. */
  2419        nReserve = zDbHeader[20];
  2420        pBt->btsFlags |= BTS_PAGESIZE_FIXED;
  2421  #ifndef SQLITE_OMIT_AUTOVACUUM
  2422        pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
  2423        pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
  2424  #endif
  2425      }
  2426      rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
  2427      if( rc ) goto btree_open_out;
  2428      pBt->usableSize = pBt->pageSize - nReserve;
  2429      assert( (pBt->pageSize & 7)==0 );  /* 8-byte alignment of pageSize */
  2430     
  2431  #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
  2432      /* Add the new BtShared object to the linked list sharable BtShareds.
  2433      */
  2434      pBt->nRef = 1;
  2435      if( p->sharable ){
  2436        MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
  2437        MUTEX_LOGIC( mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);)
  2438        if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
  2439          pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
  2440          if( pBt->mutex==0 ){
  2441            rc = SQLITE_NOMEM_BKPT;
  2442            goto btree_open_out;
  2443          }
  2444        }
  2445        sqlite3_mutex_enter(mutexShared);
  2446        pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
  2447        GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
  2448        sqlite3_mutex_leave(mutexShared);
  2449      }
  2450  #endif
  2451    }
  2452  
  2453  #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
  2454    /* If the new Btree uses a sharable pBtShared, then link the new
  2455    ** Btree into the list of all sharable Btrees for the same connection.
  2456    ** The list is kept in ascending order by pBt address.
  2457    */
  2458    if( p->sharable ){
  2459      int i;
  2460      Btree *pSib;
  2461      for(i=0; i<db->nDb; i++){
  2462        if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
  2463          while( pSib->pPrev ){ pSib = pSib->pPrev; }
  2464          if( (uptr)p->pBt<(uptr)pSib->pBt ){
  2465            p->pNext = pSib;
  2466            p->pPrev = 0;
  2467            pSib->pPrev = p;
  2468          }else{
  2469            while( pSib->pNext && (uptr)pSib->pNext->pBt<(uptr)p->pBt ){
  2470              pSib = pSib->pNext;
  2471            }
  2472            p->pNext = pSib->pNext;
  2473            p->pPrev = pSib;
  2474            if( p->pNext ){
  2475              p->pNext->pPrev = p;
  2476            }
  2477            pSib->pNext = p;
  2478          }
  2479          break;
  2480        }
  2481      }
  2482    }
  2483  #endif
  2484    *ppBtree = p;
  2485  
  2486  btree_open_out:
  2487    if( rc!=SQLITE_OK ){
  2488      if( pBt && pBt->pPager ){
  2489        sqlite3PagerClose(pBt->pPager, 0);
  2490      }
  2491      sqlite3_free(pBt);
  2492      sqlite3_free(p);
  2493      *ppBtree = 0;
  2494    }else{
  2495      sqlite3_file *pFile;
  2496  
  2497      /* If the B-Tree was successfully opened, set the pager-cache size to the
  2498      ** default value. Except, when opening on an existing shared pager-cache,
  2499      ** do not change the pager-cache size.
  2500      */
  2501      if( sqlite3BtreeSchema(p, 0, 0)==0 ){
  2502        sqlite3PagerSetCachesize(p->pBt->pPager, SQLITE_DEFAULT_CACHE_SIZE);
  2503      }
  2504  
  2505      pFile = sqlite3PagerFile(pBt->pPager);
  2506      if( pFile->pMethods ){
  2507        sqlite3OsFileControlHint(pFile, SQLITE_FCNTL_PDB, (void*)&pBt->db);
  2508      }
  2509    }
  2510    if( mutexOpen ){
  2511      assert( sqlite3_mutex_held(mutexOpen) );
  2512      sqlite3_mutex_leave(mutexOpen);
  2513    }
  2514    assert( rc!=SQLITE_OK || sqlite3BtreeConnectionCount(*ppBtree)>0 );
  2515    return rc;
  2516  }
  2517  
  2518  /*
  2519  ** Decrement the BtShared.nRef counter.  When it reaches zero,
  2520  ** remove the BtShared structure from the sharing list.  Return
  2521  ** true if the BtShared.nRef counter reaches zero and return
  2522  ** false if it is still positive.
  2523  */
  2524  static int removeFromSharingList(BtShared *pBt){
  2525  #ifndef SQLITE_OMIT_SHARED_CACHE
  2526    MUTEX_LOGIC( sqlite3_mutex *pMaster; )
  2527    BtShared *pList;
  2528    int removed = 0;
  2529  
  2530    assert( sqlite3_mutex_notheld(pBt->mutex) );
  2531    MUTEX_LOGIC( pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); )
  2532    sqlite3_mutex_enter(pMaster);
  2533    pBt->nRef--;
  2534    if( pBt->nRef<=0 ){
  2535      if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
  2536        GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
  2537      }else{
  2538        pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
  2539        while( ALWAYS(pList) && pList->pNext!=pBt ){
  2540          pList=pList->pNext;
  2541        }
  2542        if( ALWAYS(pList) ){
  2543          pList->pNext = pBt->pNext;
  2544        }
  2545      }
  2546      if( SQLITE_THREADSAFE ){
  2547        sqlite3_mutex_free(pBt->mutex);
  2548      }
  2549      removed = 1;
  2550    }
  2551    sqlite3_mutex_leave(pMaster);
  2552    return removed;
  2553  #else
  2554    return 1;
  2555  #endif
  2556  }
  2557  
  2558  /*
  2559  ** Make sure pBt->pTmpSpace points to an allocation of 
  2560  ** MX_CELL_SIZE(pBt) bytes with a 4-byte prefix for a left-child
  2561  ** pointer.
  2562  */
  2563  static void allocateTempSpace(BtShared *pBt){
  2564    if( !pBt->pTmpSpace ){
  2565      pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
  2566  
  2567      /* One of the uses of pBt->pTmpSpace is to format cells before
  2568      ** inserting them into a leaf page (function fillInCell()). If
  2569      ** a cell is less than 4 bytes in size, it is rounded up to 4 bytes
  2570      ** by the various routines that manipulate binary cells. Which
  2571      ** can mean that fillInCell() only initializes the first 2 or 3
  2572      ** bytes of pTmpSpace, but that the first 4 bytes are copied from
  2573      ** it into a database page. This is not actually a problem, but it
  2574      ** does cause a valgrind error when the 1 or 2 bytes of unitialized 
  2575      ** data is passed to system call write(). So to avoid this error,
  2576      ** zero the first 4 bytes of temp space here.
  2577      **
  2578      ** Also:  Provide four bytes of initialized space before the
  2579      ** beginning of pTmpSpace as an area available to prepend the
  2580      ** left-child pointer to the beginning of a cell.
  2581      */
  2582      if( pBt->pTmpSpace ){
  2583        memset(pBt->pTmpSpace, 0, 8);
  2584        pBt->pTmpSpace += 4;
  2585      }
  2586    }
  2587  }
  2588  
  2589  /*
  2590  ** Free the pBt->pTmpSpace allocation
  2591  */
  2592  static void freeTempSpace(BtShared *pBt){
  2593    if( pBt->pTmpSpace ){
  2594      pBt->pTmpSpace -= 4;
  2595      sqlite3PageFree(pBt->pTmpSpace);
  2596      pBt->pTmpSpace = 0;
  2597    }
  2598  }
  2599  
  2600  /*
  2601  ** Close an open database and invalidate all cursors.
  2602  */
  2603  int sqlite3BtreeClose(Btree *p){
  2604    BtShared *pBt = p->pBt;
  2605    BtCursor *pCur;
  2606  
  2607    /* Close all cursors opened via this handle.  */
  2608    assert( sqlite3_mutex_held(p->db->mutex) );
  2609    sqlite3BtreeEnter(p);
  2610    pCur = pBt->pCursor;
  2611    while( pCur ){
  2612      BtCursor *pTmp = pCur;
  2613      pCur = pCur->pNext;
  2614      if( pTmp->pBtree==p ){
  2615        sqlite3BtreeCloseCursor(pTmp);
  2616      }
  2617    }
  2618  
  2619    /* Rollback any active transaction and free the handle structure.
  2620    ** The call to sqlite3BtreeRollback() drops any table-locks held by
  2621    ** this handle.
  2622    */
  2623    sqlite3BtreeRollback(p, SQLITE_OK, 0);
  2624    sqlite3BtreeLeave(p);
  2625  
  2626    /* If there are still other outstanding references to the shared-btree
  2627    ** structure, return now. The remainder of this procedure cleans 
  2628    ** up the shared-btree.
  2629    */
  2630    assert( p->wantToLock==0 && p->locked==0 );
  2631    if( !p->sharable || removeFromSharingList(pBt) ){
  2632      /* The pBt is no longer on the sharing list, so we can access
  2633      ** it without having to hold the mutex.
  2634      **
  2635      ** Clean out and delete the BtShared object.
  2636      */
  2637      assert( !pBt->pCursor );
  2638      sqlite3PagerClose(pBt->pPager, p->db);
  2639      if( pBt->xFreeSchema && pBt->pSchema ){
  2640        pBt->xFreeSchema(pBt->pSchema);
  2641      }
  2642      sqlite3DbFree(0, pBt->pSchema);
  2643      freeTempSpace(pBt);
  2644      sqlite3_free(pBt);
  2645    }
  2646  
  2647  #ifndef SQLITE_OMIT_SHARED_CACHE
  2648    assert( p->wantToLock==0 );
  2649    assert( p->locked==0 );
  2650    if( p->pPrev ) p->pPrev->pNext = p->pNext;
  2651    if( p->pNext ) p->pNext->pPrev = p->pPrev;
  2652  #endif
  2653  
  2654    sqlite3_free(p);
  2655    return SQLITE_OK;
  2656  }
  2657  
  2658  /*
  2659  ** Change the "soft" limit on the number of pages in the cache.
  2660  ** Unused and unmodified pages will be recycled when the number of
  2661  ** pages in the cache exceeds this soft limit.  But the size of the
  2662  ** cache is allowed to grow larger than this limit if it contains
  2663  ** dirty pages or pages still in active use.
  2664  */
  2665  int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
  2666    BtShared *pBt = p->pBt;
  2667    assert( sqlite3_mutex_held(p->db->mutex) );
  2668    sqlite3BtreeEnter(p);
  2669    sqlite3PagerSetCachesize(pBt->pPager, mxPage);
  2670    sqlite3BtreeLeave(p);
  2671    return SQLITE_OK;
  2672  }
  2673  
  2674  /*
  2675  ** Change the "spill" limit on the number of pages in the cache.
  2676  ** If the number of pages exceeds this limit during a write transaction,
  2677  ** the pager might attempt to "spill" pages to the journal early in
  2678  ** order to free up memory.
  2679  **
  2680  ** The value returned is the current spill size.  If zero is passed
  2681  ** as an argument, no changes are made to the spill size setting, so
  2682  ** using mxPage of 0 is a way to query the current spill size.
  2683  */
  2684  int sqlite3BtreeSetSpillSize(Btree *p, int mxPage){
  2685    BtShared *pBt = p->pBt;
  2686    int res;
  2687    assert( sqlite3_mutex_held(p->db->mutex) );
  2688    sqlite3BtreeEnter(p);
  2689    res = sqlite3PagerSetSpillsize(pBt->pPager, mxPage);
  2690    sqlite3BtreeLeave(p);
  2691    return res;
  2692  }
  2693  
  2694  #if SQLITE_MAX_MMAP_SIZE>0
  2695  /*
  2696  ** Change the limit on the amount of the database file that may be
  2697  ** memory mapped.
  2698  */
  2699  int sqlite3BtreeSetMmapLimit(Btree *p, sqlite3_int64 szMmap){
  2700    BtShared *pBt = p->pBt;
  2701    assert( sqlite3_mutex_held(p->db->mutex) );
  2702    sqlite3BtreeEnter(p);
  2703    sqlite3PagerSetMmapLimit(pBt->pPager, szMmap);
  2704    sqlite3BtreeLeave(p);
  2705    return SQLITE_OK;
  2706  }
  2707  #endif /* SQLITE_MAX_MMAP_SIZE>0 */
  2708  
  2709  /*
  2710  ** Change the way data is synced to disk in order to increase or decrease
  2711  ** how well the database resists damage due to OS crashes and power
  2712  ** failures.  Level 1 is the same as asynchronous (no syncs() occur and
  2713  ** there is a high probability of damage)  Level 2 is the default.  There
  2714  ** is a very low but non-zero probability of damage.  Level 3 reduces the
  2715  ** probability of damage to near zero but with a write performance reduction.
  2716  */
  2717  #ifndef SQLITE_OMIT_PAGER_PRAGMAS
  2718  int sqlite3BtreeSetPagerFlags(
  2719    Btree *p,              /* The btree to set the safety level on */
  2720    unsigned pgFlags       /* Various PAGER_* flags */
  2721  ){
  2722    BtShared *pBt = p->pBt;
  2723    assert( sqlite3_mutex_held(p->db->mutex) );
  2724    sqlite3BtreeEnter(p);
  2725    sqlite3PagerSetFlags(pBt->pPager, pgFlags);
  2726    sqlite3BtreeLeave(p);
  2727    return SQLITE_OK;
  2728  }
  2729  #endif
  2730  
  2731  /*
  2732  ** Change the default pages size and the number of reserved bytes per page.
  2733  ** Or, if the page size has already been fixed, return SQLITE_READONLY 
  2734  ** without changing anything.
  2735  **
  2736  ** The page size must be a power of 2 between 512 and 65536.  If the page
  2737  ** size supplied does not meet this constraint then the page size is not
  2738  ** changed.
  2739  **
  2740  ** Page sizes are constrained to be a power of two so that the region
  2741  ** of the database file used for locking (beginning at PENDING_BYTE,
  2742  ** the first byte past the 1GB boundary, 0x40000000) needs to occur
  2743  ** at the beginning of a page.
  2744  **
  2745  ** If parameter nReserve is less than zero, then the number of reserved
  2746  ** bytes per page is left unchanged.
  2747  **
  2748  ** If the iFix!=0 then the BTS_PAGESIZE_FIXED flag is set so that the page size
  2749  ** and autovacuum mode can no longer be changed.
  2750  */
  2751  int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){
  2752    int rc = SQLITE_OK;
  2753    BtShared *pBt = p->pBt;
  2754    assert( nReserve>=-1 && nReserve<=255 );
  2755    sqlite3BtreeEnter(p);
  2756  #if SQLITE_HAS_CODEC
  2757    if( nReserve>pBt->optimalReserve ) pBt->optimalReserve = (u8)nReserve;
  2758  #endif
  2759    if( pBt->btsFlags & BTS_PAGESIZE_FIXED ){
  2760      sqlite3BtreeLeave(p);
  2761      return SQLITE_READONLY;
  2762    }
  2763    if( nReserve<0 ){
  2764      nReserve = pBt->pageSize - pBt->usableSize;
  2765    }
  2766    assert( nReserve>=0 && nReserve<=255 );
  2767    if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
  2768          ((pageSize-1)&pageSize)==0 ){
  2769      assert( (pageSize & 7)==0 );
  2770      assert( !pBt->pCursor );
  2771      pBt->pageSize = (u32)pageSize;
  2772      freeTempSpace(pBt);
  2773    }
  2774    rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
  2775    pBt->usableSize = pBt->pageSize - (u16)nReserve;
  2776    if( iFix ) pBt->btsFlags |= BTS_PAGESIZE_FIXED;
  2777    sqlite3BtreeLeave(p);
  2778    return rc;
  2779  }
  2780  
  2781  /*
  2782  ** Return the currently defined page size
  2783  */
  2784  int sqlite3BtreeGetPageSize(Btree *p){
  2785    return p->pBt->pageSize;
  2786  }
  2787  
  2788  /*
  2789  ** This function is similar to sqlite3BtreeGetReserve(), except that it
  2790  ** may only be called if it is guaranteed that the b-tree mutex is already
  2791  ** held.
  2792  **
  2793  ** This is useful in one special case in the backup API code where it is
  2794  ** known that the shared b-tree mutex is held, but the mutex on the 
  2795  ** database handle that owns *p is not. In this case if sqlite3BtreeEnter()
  2796  ** were to be called, it might collide with some other operation on the
  2797  ** database handle that owns *p, causing undefined behavior.
  2798  */
  2799  int sqlite3BtreeGetReserveNoMutex(Btree *p){
  2800    int n;
  2801    assert( sqlite3_mutex_held(p->pBt->mutex) );
  2802    n = p->pBt->pageSize - p->pBt->usableSize;
  2803    return n;
  2804  }
  2805  
  2806  /*
  2807  ** Return the number of bytes of space at the end of every page that
  2808  ** are intentually left unused.  This is the "reserved" space that is
  2809  ** sometimes used by extensions.
  2810  **
  2811  ** If SQLITE_HAS_MUTEX is defined then the number returned is the
  2812  ** greater of the current reserved space and the maximum requested
  2813  ** reserve space.
  2814  */
  2815  int sqlite3BtreeGetOptimalReserve(Btree *p){
  2816    int n;
  2817    sqlite3BtreeEnter(p);
  2818    n = sqlite3BtreeGetReserveNoMutex(p);
  2819  #ifdef SQLITE_HAS_CODEC
  2820    if( n<p->pBt->optimalReserve ) n = p->pBt->optimalReserve;
  2821  #endif
  2822    sqlite3BtreeLeave(p);
  2823    return n;
  2824  }
  2825  
  2826  
  2827  /*
  2828  ** Set the maximum page count for a database if mxPage is positive.
  2829  ** No changes are made if mxPage is 0 or negative.
  2830  ** Regardless of the value of mxPage, return the maximum page count.
  2831  */
  2832  int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
  2833    int n;
  2834    sqlite3BtreeEnter(p);
  2835    n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
  2836    sqlite3BtreeLeave(p);
  2837    return n;
  2838  }
  2839  
  2840  /*
  2841  ** Change the values for the BTS_SECURE_DELETE and BTS_OVERWRITE flags:
  2842  **
  2843  **    newFlag==0       Both BTS_SECURE_DELETE and BTS_OVERWRITE are cleared
  2844  **    newFlag==1       BTS_SECURE_DELETE set and BTS_OVERWRITE is cleared
  2845  **    newFlag==2       BTS_SECURE_DELETE cleared and BTS_OVERWRITE is set
  2846  **    newFlag==(-1)    No changes
  2847  **
  2848  ** This routine acts as a query if newFlag is less than zero
  2849  **
  2850  ** With BTS_OVERWRITE set, deleted content is overwritten by zeros, but
  2851  ** freelist leaf pages are not written back to the database.  Thus in-page
  2852  ** deleted content is cleared, but freelist deleted content is not.
  2853  **
  2854  ** With BTS_SECURE_DELETE, operation is like BTS_OVERWRITE with the addition
  2855  ** that freelist leaf pages are written back into the database, increasing
  2856  ** the amount of disk I/O.
  2857  */
  2858  int sqlite3BtreeSecureDelete(Btree *p, int newFlag){
  2859    int b;
  2860    if( p==0 ) return 0;
  2861    sqlite3BtreeEnter(p);
  2862    assert( BTS_OVERWRITE==BTS_SECURE_DELETE*2 );
  2863    assert( BTS_FAST_SECURE==(BTS_OVERWRITE|BTS_SECURE_DELETE) );
  2864    if( newFlag>=0 ){
  2865      p->pBt->btsFlags &= ~BTS_FAST_SECURE;
  2866      p->pBt->btsFlags |= BTS_SECURE_DELETE*newFlag;
  2867    }
  2868    b = (p->pBt->btsFlags & BTS_FAST_SECURE)/BTS_SECURE_DELETE;
  2869    sqlite3BtreeLeave(p);
  2870    return b;
  2871  }
  2872  
  2873  /*
  2874  ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
  2875  ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
  2876  ** is disabled. The default value for the auto-vacuum property is 
  2877  ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
  2878  */
  2879  int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
  2880  #ifdef SQLITE_OMIT_AUTOVACUUM
  2881    return SQLITE_READONLY;
  2882  #else
  2883    BtShared *pBt = p->pBt;
  2884    int rc = SQLITE_OK;
  2885    u8 av = (u8)autoVacuum;
  2886  
  2887    sqlite3BtreeEnter(p);
  2888    if( (pBt->btsFlags & BTS_PAGESIZE_FIXED)!=0 && (av ?1:0)!=pBt->autoVacuum ){
  2889      rc = SQLITE_READONLY;
  2890    }else{
  2891      pBt->autoVacuum = av ?1:0;
  2892      pBt->incrVacuum = av==2 ?1:0;
  2893    }
  2894    sqlite3BtreeLeave(p);
  2895    return rc;
  2896  #endif
  2897  }
  2898  
  2899  /*
  2900  ** Return the value of the 'auto-vacuum' property. If auto-vacuum is 
  2901  ** enabled 1 is returned. Otherwise 0.
  2902  */
  2903  int sqlite3BtreeGetAutoVacuum(Btree *p){
  2904  #ifdef SQLITE_OMIT_AUTOVACUUM
  2905    return BTREE_AUTOVACUUM_NONE;
  2906  #else
  2907    int rc;
  2908    sqlite3BtreeEnter(p);
  2909    rc = (
  2910      (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
  2911      (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
  2912      BTREE_AUTOVACUUM_INCR
  2913    );
  2914    sqlite3BtreeLeave(p);
  2915    return rc;
  2916  #endif
  2917  }
  2918  
  2919  /*
  2920  ** If the user has not set the safety-level for this database connection
  2921  ** using "PRAGMA synchronous", and if the safety-level is not already
  2922  ** set to the value passed to this function as the second parameter,
  2923  ** set it so.
  2924  */
  2925  #if SQLITE_DEFAULT_SYNCHRONOUS!=SQLITE_DEFAULT_WAL_SYNCHRONOUS \
  2926      && !defined(SQLITE_OMIT_WAL)
  2927  static void setDefaultSyncFlag(BtShared *pBt, u8 safety_level){
  2928    sqlite3 *db;
  2929    Db *pDb;
  2930    if( (db=pBt->db)!=0 && (pDb=db->aDb)!=0 ){
  2931      while( pDb->pBt==0 || pDb->pBt->pBt!=pBt ){ pDb++; }
  2932      if( pDb->bSyncSet==0 
  2933       && pDb->safety_level!=safety_level 
  2934       && pDb!=&db->aDb[1] 
  2935      ){
  2936        pDb->safety_level = safety_level;
  2937        sqlite3PagerSetFlags(pBt->pPager,
  2938            pDb->safety_level | (db->flags & PAGER_FLAGS_MASK));
  2939      }
  2940    }
  2941  }
  2942  #else
  2943  # define setDefaultSyncFlag(pBt,safety_level)
  2944  #endif
  2945  
  2946  /*
  2947  ** Get a reference to pPage1 of the database file.  This will
  2948  ** also acquire a readlock on that file.
  2949  **
  2950  ** SQLITE_OK is returned on success.  If the file is not a
  2951  ** well-formed database file, then SQLITE_CORRUPT is returned.
  2952  ** SQLITE_BUSY is returned if the database is locked.  SQLITE_NOMEM
  2953  ** is returned if we run out of memory. 
  2954  */
  2955  static int lockBtree(BtShared *pBt){
  2956    int rc;              /* Result code from subfunctions */
  2957    MemPage *pPage1;     /* Page 1 of the database file */
  2958    int nPage;           /* Number of pages in the database */
  2959    int nPageFile = 0;   /* Number of pages in the database file */
  2960    int nPageHeader;     /* Number of pages in the database according to hdr */
  2961  
  2962    assert( sqlite3_mutex_held(pBt->mutex) );
  2963    assert( pBt->pPage1==0 );
  2964    rc = sqlite3PagerSharedLock(pBt->pPager);
  2965    if( rc!=SQLITE_OK ) return rc;
  2966    rc = btreeGetPage(pBt, 1, &pPage1, 0);
  2967    if( rc!=SQLITE_OK ) return rc;
  2968  
  2969    /* Do some checking to help insure the file we opened really is
  2970    ** a valid database file. 
  2971    */
  2972    nPage = nPageHeader = get4byte(28+(u8*)pPage1->aData);
  2973    sqlite3PagerPagecount(pBt->pPager, &nPageFile);
  2974    if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){
  2975      nPage = nPageFile;
  2976    }
  2977    if( nPage>0 ){
  2978      u32 pageSize;
  2979      u32 usableSize;
  2980      u8 *page1 = pPage1->aData;
  2981      rc = SQLITE_NOTADB;
  2982      /* EVIDENCE-OF: R-43737-39999 Every valid SQLite database file begins
  2983      ** with the following 16 bytes (in hex): 53 51 4c 69 74 65 20 66 6f 72 6d
  2984      ** 61 74 20 33 00. */
  2985      if( memcmp(page1, zMagicHeader, 16)!=0 ){
  2986        goto page1_init_failed;
  2987      }
  2988  
  2989  #ifdef SQLITE_OMIT_WAL
  2990      if( page1[18]>1 ){
  2991        pBt->btsFlags |= BTS_READ_ONLY;
  2992      }
  2993      if( page1[19]>1 ){
  2994        goto page1_init_failed;
  2995      }
  2996  #else
  2997      if( page1[18]>2 ){
  2998        pBt->btsFlags |= BTS_READ_ONLY;
  2999      }
  3000      if( page1[19]>2 ){
  3001        goto page1_init_failed;
  3002      }
  3003  
  3004      /* If the write version is set to 2, this database should be accessed
  3005      ** in WAL mode. If the log is not already open, open it now. Then 
  3006      ** return SQLITE_OK and return without populating BtShared.pPage1.
  3007      ** The caller detects this and calls this function again. This is
  3008      ** required as the version of page 1 currently in the page1 buffer
  3009      ** may not be the latest version - there may be a newer one in the log
  3010      ** file.
  3011      */
  3012      if( page1[19]==2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){
  3013        int isOpen = 0;
  3014        rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen);
  3015        if( rc!=SQLITE_OK ){
  3016          goto page1_init_failed;
  3017        }else{
  3018          setDefaultSyncFlag(pBt, SQLITE_DEFAULT_WAL_SYNCHRONOUS+1);
  3019          if( isOpen==0 ){
  3020            releasePageOne(pPage1);
  3021            return SQLITE_OK;
  3022          }
  3023        }
  3024        rc = SQLITE_NOTADB;
  3025      }else{
  3026        setDefaultSyncFlag(pBt, SQLITE_DEFAULT_SYNCHRONOUS+1);
  3027      }
  3028  #endif
  3029  
  3030      /* EVIDENCE-OF: R-15465-20813 The maximum and minimum embedded payload
  3031      ** fractions and the leaf payload fraction values must be 64, 32, and 32.
  3032      **
  3033      ** The original design allowed these amounts to vary, but as of
  3034      ** version 3.6.0, we require them to be fixed.
  3035      */
  3036      if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
  3037        goto page1_init_failed;
  3038      }
  3039      /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
  3040      ** determined by the 2-byte integer located at an offset of 16 bytes from
  3041      ** the beginning of the database file. */
  3042      pageSize = (page1[16]<<8) | (page1[17]<<16);
  3043      /* EVIDENCE-OF: R-25008-21688 The size of a page is a power of two
  3044      ** between 512 and 65536 inclusive. */
  3045      if( ((pageSize-1)&pageSize)!=0
  3046       || pageSize>SQLITE_MAX_PAGE_SIZE 
  3047       || pageSize<=256 
  3048      ){
  3049        goto page1_init_failed;
  3050      }
  3051      assert( (pageSize & 7)==0 );
  3052      /* EVIDENCE-OF: R-59310-51205 The "reserved space" size in the 1-byte
  3053      ** integer at offset 20 is the number of bytes of space at the end of
  3054      ** each page to reserve for extensions. 
  3055      **
  3056      ** EVIDENCE-OF: R-37497-42412 The size of the reserved region is
  3057      ** determined by the one-byte unsigned integer found at an offset of 20
  3058      ** into the database file header. */
  3059      usableSize = pageSize - page1[20];
  3060      if( (u32)pageSize!=pBt->pageSize ){
  3061        /* After reading the first page of the database assuming a page size
  3062        ** of BtShared.pageSize, we have discovered that the page-size is
  3063        ** actually pageSize. Unlock the database, leave pBt->pPage1 at
  3064        ** zero and return SQLITE_OK. The caller will call this function
  3065        ** again with the correct page-size.
  3066        */
  3067        releasePageOne(pPage1);
  3068        pBt->usableSize = usableSize;
  3069        pBt->pageSize = pageSize;
  3070        freeTempSpace(pBt);
  3071        rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize,
  3072                                     pageSize-usableSize);
  3073        return rc;
  3074      }
  3075      if( (pBt->db->flags & SQLITE_WriteSchema)==0 && nPage>nPageFile ){
  3076        rc = SQLITE_CORRUPT_BKPT;
  3077        goto page1_init_failed;
  3078      }
  3079      /* EVIDENCE-OF: R-28312-64704 However, the usable size is not allowed to
  3080      ** be less than 480. In other words, if the page size is 512, then the
  3081      ** reserved space size cannot exceed 32. */
  3082      if( usableSize<480 ){
  3083        goto page1_init_failed;
  3084      }
  3085      pBt->pageSize = pageSize;
  3086      pBt->usableSize = usableSize;
  3087  #ifndef SQLITE_OMIT_AUTOVACUUM
  3088      pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
  3089      pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
  3090  #endif
  3091    }
  3092  
  3093    /* maxLocal is the maximum amount of payload to store locally for
  3094    ** a cell.  Make sure it is small enough so that at least minFanout
  3095    ** cells can will fit on one page.  We assume a 10-byte page header.
  3096    ** Besides the payload, the cell must store:
  3097    **     2-byte pointer to the cell
  3098    **     4-byte child pointer
  3099    **     9-byte nKey value
  3100    **     4-byte nData value
  3101    **     4-byte overflow page pointer
  3102    ** So a cell consists of a 2-byte pointer, a header which is as much as
  3103    ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
  3104    ** page pointer.
  3105    */
  3106    pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23);
  3107    pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23);
  3108    pBt->maxLeaf = (u16)(pBt->usableSize - 35);
  3109    pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23);
  3110    if( pBt->maxLocal>127 ){
  3111      pBt->max1bytePayload = 127;
  3112    }else{
  3113      pBt->max1bytePayload = (u8)pBt->maxLocal;
  3114    }
  3115    assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
  3116    pBt->pPage1 = pPage1;
  3117    pBt->nPage = nPage;
  3118    return SQLITE_OK;
  3119  
  3120  page1_init_failed:
  3121    releasePageOne(pPage1);
  3122    pBt->pPage1 = 0;
  3123    return rc;
  3124  }
  3125  
  3126  #ifndef NDEBUG
  3127  /*
  3128  ** Return the number of cursors open on pBt. This is for use
  3129  ** in assert() expressions, so it is only compiled if NDEBUG is not
  3130  ** defined.
  3131  **
  3132  ** Only write cursors are counted if wrOnly is true.  If wrOnly is
  3133  ** false then all cursors are counted.
  3134  **
  3135  ** For the purposes of this routine, a cursor is any cursor that
  3136  ** is capable of reading or writing to the database.  Cursors that
  3137  ** have been tripped into the CURSOR_FAULT state are not counted.
  3138  */
  3139  static int countValidCursors(BtShared *pBt, int wrOnly){
  3140    BtCursor *pCur;
  3141    int r = 0;
  3142    for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
  3143      if( (wrOnly==0 || (pCur->curFlags & BTCF_WriteFlag)!=0)
  3144       && pCur->eState!=CURSOR_FAULT ) r++; 
  3145    }
  3146    return r;
  3147  }
  3148  #endif
  3149  
  3150  /*
  3151  ** If there are no outstanding cursors and we are not in the middle
  3152  ** of a transaction but there is a read lock on the database, then
  3153  ** this routine unrefs the first page of the database file which 
  3154  ** has the effect of releasing the read lock.
  3155  **
  3156  ** If there is a transaction in progress, this routine is a no-op.
  3157  */
  3158  static void unlockBtreeIfUnused(BtShared *pBt){
  3159    assert( sqlite3_mutex_held(pBt->mutex) );
  3160    assert( countValidCursors(pBt,0)==0 || pBt->inTransaction>TRANS_NONE );
  3161    if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){
  3162      MemPage *pPage1 = pBt->pPage1;
  3163      assert( pPage1->aData );
  3164      assert( sqlite3PagerRefcount(pBt->pPager)==1 );
  3165      pBt->pPage1 = 0;
  3166      releasePageOne(pPage1);
  3167    }
  3168  }
  3169  
  3170  /*
  3171  ** If pBt points to an empty file then convert that empty file
  3172  ** into a new empty database by initializing the first page of
  3173  ** the database.
  3174  */
  3175  static int newDatabase(BtShared *pBt){
  3176    MemPage *pP1;
  3177    unsigned char *data;
  3178    int rc;
  3179  
  3180    assert( sqlite3_mutex_held(pBt->mutex) );
  3181    if( pBt->nPage>0 ){
  3182      return SQLITE_OK;
  3183    }
  3184    pP1 = pBt->pPage1;
  3185    assert( pP1!=0 );
  3186    data = pP1->aData;
  3187    rc = sqlite3PagerWrite(pP1->pDbPage);
  3188    if( rc ) return rc;
  3189    memcpy(data, zMagicHeader, sizeof(zMagicHeader));
  3190    assert( sizeof(zMagicHeader)==16 );
  3191    data[16] = (u8)((pBt->pageSize>>8)&0xff);
  3192    data[17] = (u8)((pBt->pageSize>>16)&0xff);
  3193    data[18] = 1;
  3194    data[19] = 1;
  3195    assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize);
  3196    data[20] = (u8)(pBt->pageSize - pBt->usableSize);
  3197    data[21] = 64;
  3198    data[22] = 32;
  3199    data[23] = 32;
  3200    memset(&data[24], 0, 100-24);
  3201    zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
  3202    pBt->btsFlags |= BTS_PAGESIZE_FIXED;
  3203  #ifndef SQLITE_OMIT_AUTOVACUUM
  3204    assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
  3205    assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
  3206    put4byte(&data[36 + 4*4], pBt->autoVacuum);
  3207    put4byte(&data[36 + 7*4], pBt->incrVacuum);
  3208  #endif
  3209    pBt->nPage = 1;
  3210    data[31] = 1;
  3211    return SQLITE_OK;
  3212  }
  3213  
  3214  /*
  3215  ** Initialize the first page of the database file (creating a database
  3216  ** consisting of a single page and no schema objects). Return SQLITE_OK
  3217  ** if successful, or an SQLite error code otherwise.
  3218  */
  3219  int sqlite3BtreeNewDb(Btree *p){
  3220    int rc;
  3221    sqlite3BtreeEnter(p);
  3222    p->pBt->nPage = 0;
  3223    rc = newDatabase(p->pBt);
  3224    sqlite3BtreeLeave(p);
  3225    return rc;
  3226  }
  3227  
  3228  /*
  3229  ** Attempt to start a new transaction. A write-transaction
  3230  ** is started if the second argument is nonzero, otherwise a read-
  3231  ** transaction.  If the second argument is 2 or more and exclusive
  3232  ** transaction is started, meaning that no other process is allowed
  3233  ** to access the database.  A preexisting transaction may not be
  3234  ** upgraded to exclusive by calling this routine a second time - the
  3235  ** exclusivity flag only works for a new transaction.
  3236  **
  3237  ** A write-transaction must be started before attempting any 
  3238  ** changes to the database.  None of the following routines 
  3239  ** will work unless a transaction is started first:
  3240  **
  3241  **      sqlite3BtreeCreateTable()
  3242  **      sqlite3BtreeCreateIndex()
  3243  **      sqlite3BtreeClearTable()
  3244  **      sqlite3BtreeDropTable()
  3245  **      sqlite3BtreeInsert()
  3246  **      sqlite3BtreeDelete()
  3247  **      sqlite3BtreeUpdateMeta()
  3248  **
  3249  ** If an initial attempt to acquire the lock fails because of lock contention
  3250  ** and the database was previously unlocked, then invoke the busy handler
  3251  ** if there is one.  But if there was previously a read-lock, do not
  3252  ** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is 
  3253  ** returned when there is already a read-lock in order to avoid a deadlock.
  3254  **
  3255  ** Suppose there are two processes A and B.  A has a read lock and B has
  3256  ** a reserved lock.  B tries to promote to exclusive but is blocked because
  3257  ** of A's read lock.  A tries to promote to reserved but is blocked by B.
  3258  ** One or the other of the two processes must give way or there can be
  3259  ** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
  3260  ** when A already has a read lock, we encourage A to give up and let B
  3261  ** proceed.
  3262  */
  3263  int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
  3264    BtShared *pBt = p->pBt;
  3265    int rc = SQLITE_OK;
  3266  
  3267    sqlite3BtreeEnter(p);
  3268    btreeIntegrity(p);
  3269  
  3270    /* If the btree is already in a write-transaction, or it
  3271    ** is already in a read-transaction and a read-transaction
  3272    ** is requested, this is a no-op.
  3273    */
  3274    if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
  3275      goto trans_begun;
  3276    }
  3277    assert( pBt->inTransaction==TRANS_WRITE || IfNotOmitAV(pBt->bDoTruncate)==0 );
  3278  
  3279    /* Write transactions are not possible on a read-only database */
  3280    if( (pBt->btsFlags & BTS_READ_ONLY)!=0 && wrflag ){
  3281      rc = SQLITE_READONLY;
  3282      goto trans_begun;
  3283    }
  3284  
  3285  #ifndef SQLITE_OMIT_SHARED_CACHE
  3286    {
  3287      sqlite3 *pBlock = 0;
  3288      /* If another database handle has already opened a write transaction 
  3289      ** on this shared-btree structure and a second write transaction is
  3290      ** requested, return SQLITE_LOCKED.
  3291      */
  3292      if( (wrflag && pBt->inTransaction==TRANS_WRITE)
  3293       || (pBt->btsFlags & BTS_PENDING)!=0
  3294      ){
  3295        pBlock = pBt->pWriter->db;
  3296      }else if( wrflag>1 ){
  3297        BtLock *pIter;
  3298        for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
  3299          if( pIter->pBtree!=p ){
  3300            pBlock = pIter->pBtree->db;
  3301            break;
  3302          }
  3303        }
  3304      }
  3305      if( pBlock ){
  3306        sqlite3ConnectionBlocked(p->db, pBlock);
  3307        rc = SQLITE_LOCKED_SHAREDCACHE;
  3308        goto trans_begun;
  3309      }
  3310    }
  3311  #endif
  3312  
  3313    /* Any read-only or read-write transaction implies a read-lock on 
  3314    ** page 1. So if some other shared-cache client already has a write-lock 
  3315    ** on page 1, the transaction cannot be opened. */
  3316    rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
  3317    if( SQLITE_OK!=rc ) goto trans_begun;
  3318  
  3319    pBt->btsFlags &= ~BTS_INITIALLY_EMPTY;
  3320    if( pBt->nPage==0 ) pBt->btsFlags |= BTS_INITIALLY_EMPTY;
  3321    do {
  3322      /* Call lockBtree() until either pBt->pPage1 is populated or
  3323      ** lockBtree() returns something other than SQLITE_OK. lockBtree()
  3324      ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after
  3325      ** reading page 1 it discovers that the page-size of the database 
  3326      ** file is not pBt->pageSize. In this case lockBtree() will update
  3327      ** pBt->pageSize to the page-size of the file on disk.
  3328      */
  3329      while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) );
  3330  
  3331      if( rc==SQLITE_OK && wrflag ){
  3332        if( (pBt->btsFlags & BTS_READ_ONLY)!=0 ){
  3333          rc = SQLITE_READONLY;
  3334        }else{
  3335          rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db));
  3336          if( rc==SQLITE_OK ){
  3337            rc = newDatabase(pBt);
  3338          }
  3339        }
  3340      }
  3341    
  3342      if( rc!=SQLITE_OK ){
  3343        unlockBtreeIfUnused(pBt);
  3344      }
  3345    }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
  3346            btreeInvokeBusyHandler(pBt) );
  3347  
  3348    if( rc==SQLITE_OK ){
  3349      if( p->inTrans==TRANS_NONE ){
  3350        pBt->nTransaction++;
  3351  #ifndef SQLITE_OMIT_SHARED_CACHE
  3352        if( p->sharable ){
  3353          assert( p->lock.pBtree==p && p->lock.iTable==1 );
  3354          p->lock.eLock = READ_LOCK;
  3355          p->lock.pNext = pBt->pLock;
  3356          pBt->pLock = &p->lock;
  3357        }
  3358  #endif
  3359      }
  3360      p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
  3361      if( p->inTrans>pBt->inTransaction ){
  3362        pBt->inTransaction = p->inTrans;
  3363      }
  3364      if( wrflag ){
  3365        MemPage *pPage1 = pBt->pPage1;
  3366  #ifndef SQLITE_OMIT_SHARED_CACHE
  3367        assert( !pBt->pWriter );
  3368        pBt->pWriter = p;
  3369        pBt->btsFlags &= ~BTS_EXCLUSIVE;
  3370        if( wrflag>1 ) pBt->btsFlags |= BTS_EXCLUSIVE;
  3371  #endif
  3372  
  3373        /* If the db-size header field is incorrect (as it may be if an old
  3374        ** client has been writing the database file), update it now. Doing
  3375        ** this sooner rather than later means the database size can safely 
  3376        ** re-read the database size from page 1 if a savepoint or transaction
  3377        ** rollback occurs within the transaction.
  3378        */
  3379        if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){
  3380          rc = sqlite3PagerWrite(pPage1->pDbPage);
  3381          if( rc==SQLITE_OK ){
  3382            put4byte(&pPage1->aData[28], pBt->nPage);
  3383          }
  3384        }
  3385      }
  3386    }
  3387  
  3388  
  3389  trans_begun:
  3390    if( rc==SQLITE_OK && wrflag ){
  3391      /* This call makes sure that the pager has the correct number of
  3392      ** open savepoints. If the second parameter is greater than 0 and
  3393      ** the sub-journal is not already open, then it will be opened here.
  3394      */
  3395      rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint);
  3396    }
  3397  
  3398    btreeIntegrity(p);
  3399    sqlite3BtreeLeave(p);
  3400    return rc;
  3401  }
  3402  
  3403  #ifndef SQLITE_OMIT_AUTOVACUUM
  3404  
  3405  /*
  3406  ** Set the pointer-map entries for all children of page pPage. Also, if
  3407  ** pPage contains cells that point to overflow pages, set the pointer
  3408  ** map entries for the overflow pages as well.
  3409  */
  3410  static int setChildPtrmaps(MemPage *pPage){
  3411    int i;                             /* Counter variable */
  3412    int nCell;                         /* Number of cells in page pPage */
  3413    int rc;                            /* Return code */
  3414    BtShared *pBt = pPage->pBt;
  3415    Pgno pgno = pPage->pgno;
  3416  
  3417    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  3418    rc = pPage->isInit ? SQLITE_OK : btreeInitPage(pPage);
  3419    if( rc!=SQLITE_OK ) return rc;
  3420    nCell = pPage->nCell;
  3421  
  3422    for(i=0; i<nCell; i++){
  3423      u8 *pCell = findCell(pPage, i);
  3424  
  3425      ptrmapPutOvflPtr(pPage, pCell, &rc);
  3426  
  3427      if( !pPage->leaf ){
  3428        Pgno childPgno = get4byte(pCell);
  3429        ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
  3430      }
  3431    }
  3432  
  3433    if( !pPage->leaf ){
  3434      Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
  3435      ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
  3436    }
  3437  
  3438    return rc;
  3439  }
  3440  
  3441  /*
  3442  ** Somewhere on pPage is a pointer to page iFrom.  Modify this pointer so
  3443  ** that it points to iTo. Parameter eType describes the type of pointer to
  3444  ** be modified, as  follows:
  3445  **
  3446  ** PTRMAP_BTREE:     pPage is a btree-page. The pointer points at a child 
  3447  **                   page of pPage.
  3448  **
  3449  ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
  3450  **                   page pointed to by one of the cells on pPage.
  3451  **
  3452  ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
  3453  **                   overflow page in the list.
  3454  */
  3455  static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
  3456    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  3457    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  3458    if( eType==PTRMAP_OVERFLOW2 ){
  3459      /* The pointer is always the first 4 bytes of the page in this case.  */
  3460      if( get4byte(pPage->aData)!=iFrom ){
  3461        return SQLITE_CORRUPT_PGNO(pPage->pgno);
  3462      }
  3463      put4byte(pPage->aData, iTo);
  3464    }else{
  3465      int i;
  3466      int nCell;
  3467      int rc;
  3468  
  3469      rc = pPage->isInit ? SQLITE_OK : btreeInitPage(pPage);
  3470      if( rc ) return rc;
  3471      nCell = pPage->nCell;
  3472  
  3473      for(i=0; i<nCell; i++){
  3474        u8 *pCell = findCell(pPage, i);
  3475        if( eType==PTRMAP_OVERFLOW1 ){
  3476          CellInfo info;
  3477          pPage->xParseCell(pPage, pCell, &info);
  3478          if( info.nLocal<info.nPayload ){
  3479            if( pCell+info.nSize > pPage->aData+pPage->pBt->usableSize ){
  3480              return SQLITE_CORRUPT_PGNO(pPage->pgno);
  3481            }
  3482            if( iFrom==get4byte(pCell+info.nSize-4) ){
  3483              put4byte(pCell+info.nSize-4, iTo);
  3484              break;
  3485            }
  3486          }
  3487        }else{
  3488          if( get4byte(pCell)==iFrom ){
  3489            put4byte(pCell, iTo);
  3490            break;
  3491          }
  3492        }
  3493      }
  3494    
  3495      if( i==nCell ){
  3496        if( eType!=PTRMAP_BTREE || 
  3497            get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
  3498          return SQLITE_CORRUPT_PGNO(pPage->pgno);
  3499        }
  3500        put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
  3501      }
  3502    }
  3503    return SQLITE_OK;
  3504  }
  3505  
  3506  
  3507  /*
  3508  ** Move the open database page pDbPage to location iFreePage in the 
  3509  ** database. The pDbPage reference remains valid.
  3510  **
  3511  ** The isCommit flag indicates that there is no need to remember that
  3512  ** the journal needs to be sync()ed before database page pDbPage->pgno 
  3513  ** can be written to. The caller has already promised not to write to that
  3514  ** page.
  3515  */
  3516  static int relocatePage(
  3517    BtShared *pBt,           /* Btree */
  3518    MemPage *pDbPage,        /* Open page to move */
  3519    u8 eType,                /* Pointer map 'type' entry for pDbPage */
  3520    Pgno iPtrPage,           /* Pointer map 'page-no' entry for pDbPage */
  3521    Pgno iFreePage,          /* The location to move pDbPage to */
  3522    int isCommit             /* isCommit flag passed to sqlite3PagerMovepage */
  3523  ){
  3524    MemPage *pPtrPage;   /* The page that contains a pointer to pDbPage */
  3525    Pgno iDbPage = pDbPage->pgno;
  3526    Pager *pPager = pBt->pPager;
  3527    int rc;
  3528  
  3529    assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 || 
  3530        eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
  3531    assert( sqlite3_mutex_held(pBt->mutex) );
  3532    assert( pDbPage->pBt==pBt );
  3533  
  3534    /* Move page iDbPage from its current location to page number iFreePage */
  3535    TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n", 
  3536        iDbPage, iFreePage, iPtrPage, eType));
  3537    rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
  3538    if( rc!=SQLITE_OK ){
  3539      return rc;
  3540    }
  3541    pDbPage->pgno = iFreePage;
  3542  
  3543    /* If pDbPage was a btree-page, then it may have child pages and/or cells
  3544    ** that point to overflow pages. The pointer map entries for all these
  3545    ** pages need to be changed.
  3546    **
  3547    ** If pDbPage is an overflow page, then the first 4 bytes may store a
  3548    ** pointer to a subsequent overflow page. If this is the case, then
  3549    ** the pointer map needs to be updated for the subsequent overflow page.
  3550    */
  3551    if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
  3552      rc = setChildPtrmaps(pDbPage);
  3553      if( rc!=SQLITE_OK ){
  3554        return rc;
  3555      }
  3556    }else{
  3557      Pgno nextOvfl = get4byte(pDbPage->aData);
  3558      if( nextOvfl!=0 ){
  3559        ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc);
  3560        if( rc!=SQLITE_OK ){
  3561          return rc;
  3562        }
  3563      }
  3564    }
  3565  
  3566    /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
  3567    ** that it points at iFreePage. Also fix the pointer map entry for
  3568    ** iPtrPage.
  3569    */
  3570    if( eType!=PTRMAP_ROOTPAGE ){
  3571      rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
  3572      if( rc!=SQLITE_OK ){
  3573        return rc;
  3574      }
  3575      rc = sqlite3PagerWrite(pPtrPage->pDbPage);
  3576      if( rc!=SQLITE_OK ){
  3577        releasePage(pPtrPage);
  3578        return rc;
  3579      }
  3580      rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
  3581      releasePage(pPtrPage);
  3582      if( rc==SQLITE_OK ){
  3583        ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc);
  3584      }
  3585    }
  3586    return rc;
  3587  }
  3588  
  3589  /* Forward declaration required by incrVacuumStep(). */
  3590  static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
  3591  
  3592  /*
  3593  ** Perform a single step of an incremental-vacuum. If successful, return
  3594  ** SQLITE_OK. If there is no work to do (and therefore no point in 
  3595  ** calling this function again), return SQLITE_DONE. Or, if an error 
  3596  ** occurs, return some other error code.
  3597  **
  3598  ** More specifically, this function attempts to re-organize the database so 
  3599  ** that the last page of the file currently in use is no longer in use.
  3600  **
  3601  ** Parameter nFin is the number of pages that this database would contain
  3602  ** were this function called until it returns SQLITE_DONE.
  3603  **
  3604  ** If the bCommit parameter is non-zero, this function assumes that the 
  3605  ** caller will keep calling incrVacuumStep() until it returns SQLITE_DONE 
  3606  ** or an error. bCommit is passed true for an auto-vacuum-on-commit 
  3607  ** operation, or false for an incremental vacuum.
  3608  */
  3609  static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){
  3610    Pgno nFreeList;           /* Number of pages still on the free-list */
  3611    int rc;
  3612  
  3613    assert( sqlite3_mutex_held(pBt->mutex) );
  3614    assert( iLastPg>nFin );
  3615  
  3616    if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
  3617      u8 eType;
  3618      Pgno iPtrPage;
  3619  
  3620      nFreeList = get4byte(&pBt->pPage1->aData[36]);
  3621      if( nFreeList==0 ){
  3622        return SQLITE_DONE;
  3623      }
  3624  
  3625      rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
  3626      if( rc!=SQLITE_OK ){
  3627        return rc;
  3628      }
  3629      if( eType==PTRMAP_ROOTPAGE ){
  3630        return SQLITE_CORRUPT_BKPT;
  3631      }
  3632  
  3633      if( eType==PTRMAP_FREEPAGE ){
  3634        if( bCommit==0 ){
  3635          /* Remove the page from the files free-list. This is not required
  3636          ** if bCommit is non-zero. In that case, the free-list will be
  3637          ** truncated to zero after this function returns, so it doesn't 
  3638          ** matter if it still contains some garbage entries.
  3639          */
  3640          Pgno iFreePg;
  3641          MemPage *pFreePg;
  3642          rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, BTALLOC_EXACT);
  3643          if( rc!=SQLITE_OK ){
  3644            return rc;
  3645          }
  3646          assert( iFreePg==iLastPg );
  3647          releasePage(pFreePg);
  3648        }
  3649      } else {
  3650        Pgno iFreePg;             /* Index of free page to move pLastPg to */
  3651        MemPage *pLastPg;
  3652        u8 eMode = BTALLOC_ANY;   /* Mode parameter for allocateBtreePage() */
  3653        Pgno iNear = 0;           /* nearby parameter for allocateBtreePage() */
  3654  
  3655        rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0);
  3656        if( rc!=SQLITE_OK ){
  3657          return rc;
  3658        }
  3659  
  3660        /* If bCommit is zero, this loop runs exactly once and page pLastPg
  3661        ** is swapped with the first free page pulled off the free list.
  3662        **
  3663        ** On the other hand, if bCommit is greater than zero, then keep
  3664        ** looping until a free-page located within the first nFin pages
  3665        ** of the file is found.
  3666        */
  3667        if( bCommit==0 ){
  3668          eMode = BTALLOC_LE;
  3669          iNear = nFin;
  3670        }
  3671        do {
  3672          MemPage *pFreePg;
  3673          rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iNear, eMode);
  3674          if( rc!=SQLITE_OK ){
  3675            releasePage(pLastPg);
  3676            return rc;
  3677          }
  3678          releasePage(pFreePg);
  3679        }while( bCommit && iFreePg>nFin );
  3680        assert( iFreePg<iLastPg );
  3681        
  3682        rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, bCommit);
  3683        releasePage(pLastPg);
  3684        if( rc!=SQLITE_OK ){
  3685          return rc;
  3686        }
  3687      }
  3688    }
  3689  
  3690    if( bCommit==0 ){
  3691      do {
  3692        iLastPg--;
  3693      }while( iLastPg==PENDING_BYTE_PAGE(pBt) || PTRMAP_ISPAGE(pBt, iLastPg) );
  3694      pBt->bDoTruncate = 1;
  3695      pBt->nPage = iLastPg;
  3696    }
  3697    return SQLITE_OK;
  3698  }
  3699  
  3700  /*
  3701  ** The database opened by the first argument is an auto-vacuum database
  3702  ** nOrig pages in size containing nFree free pages. Return the expected 
  3703  ** size of the database in pages following an auto-vacuum operation.
  3704  */
  3705  static Pgno finalDbSize(BtShared *pBt, Pgno nOrig, Pgno nFree){
  3706    int nEntry;                     /* Number of entries on one ptrmap page */
  3707    Pgno nPtrmap;                   /* Number of PtrMap pages to be freed */
  3708    Pgno nFin;                      /* Return value */
  3709  
  3710    nEntry = pBt->usableSize/5;
  3711    nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry;
  3712    nFin = nOrig - nFree - nPtrmap;
  3713    if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){
  3714      nFin--;
  3715    }
  3716    while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
  3717      nFin--;
  3718    }
  3719  
  3720    return nFin;
  3721  }
  3722  
  3723  /*
  3724  ** A write-transaction must be opened before calling this function.
  3725  ** It performs a single unit of work towards an incremental vacuum.
  3726  **
  3727  ** If the incremental vacuum is finished after this function has run,
  3728  ** SQLITE_DONE is returned. If it is not finished, but no error occurred,
  3729  ** SQLITE_OK is returned. Otherwise an SQLite error code. 
  3730  */
  3731  int sqlite3BtreeIncrVacuum(Btree *p){
  3732    int rc;
  3733    BtShared *pBt = p->pBt;
  3734  
  3735    sqlite3BtreeEnter(p);
  3736    assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
  3737    if( !pBt->autoVacuum ){
  3738      rc = SQLITE_DONE;
  3739    }else{
  3740      Pgno nOrig = btreePagecount(pBt);
  3741      Pgno nFree = get4byte(&pBt->pPage1->aData[36]);
  3742      Pgno nFin = finalDbSize(pBt, nOrig, nFree);
  3743  
  3744      if( nOrig<nFin ){
  3745        rc = SQLITE_CORRUPT_BKPT;
  3746      }else if( nFree>0 ){
  3747        rc = saveAllCursors(pBt, 0, 0);
  3748        if( rc==SQLITE_OK ){
  3749          invalidateAllOverflowCache(pBt);
  3750          rc = incrVacuumStep(pBt, nFin, nOrig, 0);
  3751        }
  3752        if( rc==SQLITE_OK ){
  3753          rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
  3754          put4byte(&pBt->pPage1->aData[28], pBt->nPage);
  3755        }
  3756      }else{
  3757        rc = SQLITE_DONE;
  3758      }
  3759    }
  3760    sqlite3BtreeLeave(p);
  3761    return rc;
  3762  }
  3763  
  3764  /*
  3765  ** This routine is called prior to sqlite3PagerCommit when a transaction
  3766  ** is committed for an auto-vacuum database.
  3767  **
  3768  ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
  3769  ** the database file should be truncated to during the commit process. 
  3770  ** i.e. the database has been reorganized so that only the first *pnTrunc
  3771  ** pages are in use.
  3772  */
  3773  static int autoVacuumCommit(BtShared *pBt){
  3774    int rc = SQLITE_OK;
  3775    Pager *pPager = pBt->pPager;
  3776    VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager); )
  3777  
  3778    assert( sqlite3_mutex_held(pBt->mutex) );
  3779    invalidateAllOverflowCache(pBt);
  3780    assert(pBt->autoVacuum);
  3781    if( !pBt->incrVacuum ){
  3782      Pgno nFin;         /* Number of pages in database after autovacuuming */
  3783      Pgno nFree;        /* Number of pages on the freelist initially */
  3784      Pgno iFree;        /* The next page to be freed */
  3785      Pgno nOrig;        /* Database size before freeing */
  3786  
  3787      nOrig = btreePagecount(pBt);
  3788      if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){
  3789        /* It is not possible to create a database for which the final page
  3790        ** is either a pointer-map page or the pending-byte page. If one
  3791        ** is encountered, this indicates corruption.
  3792        */
  3793        return SQLITE_CORRUPT_BKPT;
  3794      }
  3795  
  3796      nFree = get4byte(&pBt->pPage1->aData[36]);
  3797      nFin = finalDbSize(pBt, nOrig, nFree);
  3798      if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT;
  3799      if( nFin<nOrig ){
  3800        rc = saveAllCursors(pBt, 0, 0);
  3801      }
  3802      for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){
  3803        rc = incrVacuumStep(pBt, nFin, iFree, 1);
  3804      }
  3805      if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){
  3806        rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
  3807        put4byte(&pBt->pPage1->aData[32], 0);
  3808        put4byte(&pBt->pPage1->aData[36], 0);
  3809        put4byte(&pBt->pPage1->aData[28], nFin);
  3810        pBt->bDoTruncate = 1;
  3811        pBt->nPage = nFin;
  3812      }
  3813      if( rc!=SQLITE_OK ){
  3814        sqlite3PagerRollback(pPager);
  3815      }
  3816    }
  3817  
  3818    assert( nRef>=sqlite3PagerRefcount(pPager) );
  3819    return rc;
  3820  }
  3821  
  3822  #else /* ifndef SQLITE_OMIT_AUTOVACUUM */
  3823  # define setChildPtrmaps(x) SQLITE_OK
  3824  #endif
  3825  
  3826  /*
  3827  ** This routine does the first phase of a two-phase commit.  This routine
  3828  ** causes a rollback journal to be created (if it does not already exist)
  3829  ** and populated with enough information so that if a power loss occurs
  3830  ** the database can be restored to its original state by playing back
  3831  ** the journal.  Then the contents of the journal are flushed out to
  3832  ** the disk.  After the journal is safely on oxide, the changes to the
  3833  ** database are written into the database file and flushed to oxide.
  3834  ** At the end of this call, the rollback journal still exists on the
  3835  ** disk and we are still holding all locks, so the transaction has not
  3836  ** committed.  See sqlite3BtreeCommitPhaseTwo() for the second phase of the
  3837  ** commit process.
  3838  **
  3839  ** This call is a no-op if no write-transaction is currently active on pBt.
  3840  **
  3841  ** Otherwise, sync the database file for the btree pBt. zMaster points to
  3842  ** the name of a master journal file that should be written into the
  3843  ** individual journal file, or is NULL, indicating no master journal file 
  3844  ** (single database transaction).
  3845  **
  3846  ** When this is called, the master journal should already have been
  3847  ** created, populated with this journal pointer and synced to disk.
  3848  **
  3849  ** Once this is routine has returned, the only thing required to commit
  3850  ** the write-transaction for this database file is to delete the journal.
  3851  */
  3852  int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
  3853    int rc = SQLITE_OK;
  3854    if( p->inTrans==TRANS_WRITE ){
  3855      BtShared *pBt = p->pBt;
  3856      sqlite3BtreeEnter(p);
  3857  #ifndef SQLITE_OMIT_AUTOVACUUM
  3858      if( pBt->autoVacuum ){
  3859        rc = autoVacuumCommit(pBt);
  3860        if( rc!=SQLITE_OK ){
  3861          sqlite3BtreeLeave(p);
  3862          return rc;
  3863        }
  3864      }
  3865      if( pBt->bDoTruncate ){
  3866        sqlite3PagerTruncateImage(pBt->pPager, pBt->nPage);
  3867      }
  3868  #endif
  3869      rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0);
  3870      sqlite3BtreeLeave(p);
  3871    }
  3872    return rc;
  3873  }
  3874  
  3875  /*
  3876  ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback()
  3877  ** at the conclusion of a transaction.
  3878  */
  3879  static void btreeEndTransaction(Btree *p){
  3880    BtShared *pBt = p->pBt;
  3881    sqlite3 *db = p->db;
  3882    assert( sqlite3BtreeHoldsMutex(p) );
  3883  
  3884  #ifndef SQLITE_OMIT_AUTOVACUUM
  3885    pBt->bDoTruncate = 0;
  3886  #endif
  3887    if( p->inTrans>TRANS_NONE && db->nVdbeRead>1 ){
  3888      /* If there are other active statements that belong to this database
  3889      ** handle, downgrade to a read-only transaction. The other statements
  3890      ** may still be reading from the database.  */
  3891      downgradeAllSharedCacheTableLocks(p);
  3892      p->inTrans = TRANS_READ;
  3893    }else{
  3894      /* If the handle had any kind of transaction open, decrement the 
  3895      ** transaction count of the shared btree. If the transaction count 
  3896      ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused()
  3897      ** call below will unlock the pager.  */
  3898      if( p->inTrans!=TRANS_NONE ){
  3899        clearAllSharedCacheTableLocks(p);
  3900        pBt->nTransaction--;
  3901        if( 0==pBt->nTransaction ){
  3902          pBt->inTransaction = TRANS_NONE;
  3903        }
  3904      }
  3905  
  3906      /* Set the current transaction state to TRANS_NONE and unlock the 
  3907      ** pager if this call closed the only read or write transaction.  */
  3908      p->inTrans = TRANS_NONE;
  3909      unlockBtreeIfUnused(pBt);
  3910    }
  3911  
  3912    btreeIntegrity(p);
  3913  }
  3914  
  3915  /*
  3916  ** Commit the transaction currently in progress.
  3917  **
  3918  ** This routine implements the second phase of a 2-phase commit.  The
  3919  ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should
  3920  ** be invoked prior to calling this routine.  The sqlite3BtreeCommitPhaseOne()
  3921  ** routine did all the work of writing information out to disk and flushing the
  3922  ** contents so that they are written onto the disk platter.  All this
  3923  ** routine has to do is delete or truncate or zero the header in the
  3924  ** the rollback journal (which causes the transaction to commit) and
  3925  ** drop locks.
  3926  **
  3927  ** Normally, if an error occurs while the pager layer is attempting to 
  3928  ** finalize the underlying journal file, this function returns an error and
  3929  ** the upper layer will attempt a rollback. However, if the second argument
  3930  ** is non-zero then this b-tree transaction is part of a multi-file 
  3931  ** transaction. In this case, the transaction has already been committed 
  3932  ** (by deleting a master journal file) and the caller will ignore this 
  3933  ** functions return code. So, even if an error occurs in the pager layer,
  3934  ** reset the b-tree objects internal state to indicate that the write
  3935  ** transaction has been closed. This is quite safe, as the pager will have
  3936  ** transitioned to the error state.
  3937  **
  3938  ** This will release the write lock on the database file.  If there
  3939  ** are no active cursors, it also releases the read lock.
  3940  */
  3941  int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){
  3942  
  3943    if( p->inTrans==TRANS_NONE ) return SQLITE_OK;
  3944    sqlite3BtreeEnter(p);
  3945    btreeIntegrity(p);
  3946  
  3947    /* If the handle has a write-transaction open, commit the shared-btrees 
  3948    ** transaction and set the shared state to TRANS_READ.
  3949    */
  3950    if( p->inTrans==TRANS_WRITE ){
  3951      int rc;
  3952      BtShared *pBt = p->pBt;
  3953      assert( pBt->inTransaction==TRANS_WRITE );
  3954      assert( pBt->nTransaction>0 );
  3955      rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
  3956      if( rc!=SQLITE_OK && bCleanup==0 ){
  3957        sqlite3BtreeLeave(p);
  3958        return rc;
  3959      }
  3960      p->iDataVersion--;  /* Compensate for pPager->iDataVersion++; */
  3961      pBt->inTransaction = TRANS_READ;
  3962      btreeClearHasContent(pBt);
  3963    }
  3964  
  3965    btreeEndTransaction(p);
  3966    sqlite3BtreeLeave(p);
  3967    return SQLITE_OK;
  3968  }
  3969  
  3970  /*
  3971  ** Do both phases of a commit.
  3972  */
  3973  int sqlite3BtreeCommit(Btree *p){
  3974    int rc;
  3975    sqlite3BtreeEnter(p);
  3976    rc = sqlite3BtreeCommitPhaseOne(p, 0);
  3977    if( rc==SQLITE_OK ){
  3978      rc = sqlite3BtreeCommitPhaseTwo(p, 0);
  3979    }
  3980    sqlite3BtreeLeave(p);
  3981    return rc;
  3982  }
  3983  
  3984  /*
  3985  ** This routine sets the state to CURSOR_FAULT and the error
  3986  ** code to errCode for every cursor on any BtShared that pBtree
  3987  ** references.  Or if the writeOnly flag is set to 1, then only
  3988  ** trip write cursors and leave read cursors unchanged.
  3989  **
  3990  ** Every cursor is a candidate to be tripped, including cursors
  3991  ** that belong to other database connections that happen to be
  3992  ** sharing the cache with pBtree.
  3993  **
  3994  ** This routine gets called when a rollback occurs. If the writeOnly
  3995  ** flag is true, then only write-cursors need be tripped - read-only
  3996  ** cursors save their current positions so that they may continue 
  3997  ** following the rollback. Or, if writeOnly is false, all cursors are 
  3998  ** tripped. In general, writeOnly is false if the transaction being
  3999  ** rolled back modified the database schema. In this case b-tree root
  4000  ** pages may be moved or deleted from the database altogether, making
  4001  ** it unsafe for read cursors to continue.
  4002  **
  4003  ** If the writeOnly flag is true and an error is encountered while 
  4004  ** saving the current position of a read-only cursor, all cursors, 
  4005  ** including all read-cursors are tripped.
  4006  **
  4007  ** SQLITE_OK is returned if successful, or if an error occurs while
  4008  ** saving a cursor position, an SQLite error code.
  4009  */
  4010  int sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode, int writeOnly){
  4011    BtCursor *p;
  4012    int rc = SQLITE_OK;
  4013  
  4014    assert( (writeOnly==0 || writeOnly==1) && BTCF_WriteFlag==1 );
  4015    if( pBtree ){
  4016      sqlite3BtreeEnter(pBtree);
  4017      for(p=pBtree->pBt->pCursor; p; p=p->pNext){
  4018        if( writeOnly && (p->curFlags & BTCF_WriteFlag)==0 ){
  4019          if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){
  4020            rc = saveCursorPosition(p);
  4021            if( rc!=SQLITE_OK ){
  4022              (void)sqlite3BtreeTripAllCursors(pBtree, rc, 0);
  4023              break;
  4024            }
  4025          }
  4026        }else{
  4027          sqlite3BtreeClearCursor(p);
  4028          p->eState = CURSOR_FAULT;
  4029          p->skipNext = errCode;
  4030        }
  4031        btreeReleaseAllCursorPages(p);
  4032      }
  4033      sqlite3BtreeLeave(pBtree);
  4034    }
  4035    return rc;
  4036  }
  4037  
  4038  /*
  4039  ** Rollback the transaction in progress.
  4040  **
  4041  ** If tripCode is not SQLITE_OK then cursors will be invalidated (tripped).
  4042  ** Only write cursors are tripped if writeOnly is true but all cursors are
  4043  ** tripped if writeOnly is false.  Any attempt to use
  4044  ** a tripped cursor will result in an error.
  4045  **
  4046  ** This will release the write lock on the database file.  If there
  4047  ** are no active cursors, it also releases the read lock.
  4048  */
  4049  int sqlite3BtreeRollback(Btree *p, int tripCode, int writeOnly){
  4050    int rc;
  4051    BtShared *pBt = p->pBt;
  4052    MemPage *pPage1;
  4053  
  4054    assert( writeOnly==1 || writeOnly==0 );
  4055    assert( tripCode==SQLITE_ABORT_ROLLBACK || tripCode==SQLITE_OK );
  4056    sqlite3BtreeEnter(p);
  4057    if( tripCode==SQLITE_OK ){
  4058      rc = tripCode = saveAllCursors(pBt, 0, 0);
  4059      if( rc ) writeOnly = 0;
  4060    }else{
  4061      rc = SQLITE_OK;
  4062    }
  4063    if( tripCode ){
  4064      int rc2 = sqlite3BtreeTripAllCursors(p, tripCode, writeOnly);
  4065      assert( rc==SQLITE_OK || (writeOnly==0 && rc2==SQLITE_OK) );
  4066      if( rc2!=SQLITE_OK ) rc = rc2;
  4067    }
  4068    btreeIntegrity(p);
  4069  
  4070    if( p->inTrans==TRANS_WRITE ){
  4071      int rc2;
  4072  
  4073      assert( TRANS_WRITE==pBt->inTransaction );
  4074      rc2 = sqlite3PagerRollback(pBt->pPager);
  4075      if( rc2!=SQLITE_OK ){
  4076        rc = rc2;
  4077      }
  4078  
  4079      /* The rollback may have destroyed the pPage1->aData value.  So
  4080      ** call btreeGetPage() on page 1 again to make
  4081      ** sure pPage1->aData is set correctly. */
  4082      if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
  4083        int nPage = get4byte(28+(u8*)pPage1->aData);
  4084        testcase( nPage==0 );
  4085        if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage);
  4086        testcase( pBt->nPage!=nPage );
  4087        pBt->nPage = nPage;
  4088        releasePageOne(pPage1);
  4089      }
  4090      assert( countValidCursors(pBt, 1)==0 );
  4091      pBt->inTransaction = TRANS_READ;
  4092      btreeClearHasContent(pBt);
  4093    }
  4094  
  4095    btreeEndTransaction(p);
  4096    sqlite3BtreeLeave(p);
  4097    return rc;
  4098  }
  4099  
  4100  /*
  4101  ** Start a statement subtransaction. The subtransaction can be rolled
  4102  ** back independently of the main transaction. You must start a transaction 
  4103  ** before starting a subtransaction. The subtransaction is ended automatically 
  4104  ** if the main transaction commits or rolls back.
  4105  **
  4106  ** Statement subtransactions are used around individual SQL statements
  4107  ** that are contained within a BEGIN...COMMIT block.  If a constraint
  4108  ** error occurs within the statement, the effect of that one statement
  4109  ** can be rolled back without having to rollback the entire transaction.
  4110  **
  4111  ** A statement sub-transaction is implemented as an anonymous savepoint. The
  4112  ** value passed as the second parameter is the total number of savepoints,
  4113  ** including the new anonymous savepoint, open on the B-Tree. i.e. if there
  4114  ** are no active savepoints and no other statement-transactions open,
  4115  ** iStatement is 1. This anonymous savepoint can be released or rolled back
  4116  ** using the sqlite3BtreeSavepoint() function.
  4117  */
  4118  int sqlite3BtreeBeginStmt(Btree *p, int iStatement){
  4119    int rc;
  4120    BtShared *pBt = p->pBt;
  4121    sqlite3BtreeEnter(p);
  4122    assert( p->inTrans==TRANS_WRITE );
  4123    assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
  4124    assert( iStatement>0 );
  4125    assert( iStatement>p->db->nSavepoint );
  4126    assert( pBt->inTransaction==TRANS_WRITE );
  4127    /* At the pager level, a statement transaction is a savepoint with
  4128    ** an index greater than all savepoints created explicitly using
  4129    ** SQL statements. It is illegal to open, release or rollback any
  4130    ** such savepoints while the statement transaction savepoint is active.
  4131    */
  4132    rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement);
  4133    sqlite3BtreeLeave(p);
  4134    return rc;
  4135  }
  4136  
  4137  /*
  4138  ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK
  4139  ** or SAVEPOINT_RELEASE. This function either releases or rolls back the
  4140  ** savepoint identified by parameter iSavepoint, depending on the value 
  4141  ** of op.
  4142  **
  4143  ** Normally, iSavepoint is greater than or equal to zero. However, if op is
  4144  ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the 
  4145  ** contents of the entire transaction are rolled back. This is different
  4146  ** from a normal transaction rollback, as no locks are released and the
  4147  ** transaction remains open.
  4148  */
  4149  int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){
  4150    int rc = SQLITE_OK;
  4151    if( p && p->inTrans==TRANS_WRITE ){
  4152      BtShared *pBt = p->pBt;
  4153      assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK );
  4154      assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) );
  4155      sqlite3BtreeEnter(p);
  4156      if( op==SAVEPOINT_ROLLBACK ){
  4157        rc = saveAllCursors(pBt, 0, 0);
  4158      }
  4159      if( rc==SQLITE_OK ){
  4160        rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint);
  4161      }
  4162      if( rc==SQLITE_OK ){
  4163        if( iSavepoint<0 && (pBt->btsFlags & BTS_INITIALLY_EMPTY)!=0 ){
  4164          pBt->nPage = 0;
  4165        }
  4166        rc = newDatabase(pBt);
  4167        pBt->nPage = get4byte(28 + pBt->pPage1->aData);
  4168  
  4169        /* The database size was written into the offset 28 of the header
  4170        ** when the transaction started, so we know that the value at offset
  4171        ** 28 is nonzero. */
  4172        assert( pBt->nPage>0 );
  4173      }
  4174      sqlite3BtreeLeave(p);
  4175    }
  4176    return rc;
  4177  }
  4178  
  4179  /*
  4180  ** Create a new cursor for the BTree whose root is on the page
  4181  ** iTable. If a read-only cursor is requested, it is assumed that
  4182  ** the caller already has at least a read-only transaction open
  4183  ** on the database already. If a write-cursor is requested, then
  4184  ** the caller is assumed to have an open write transaction.
  4185  **
  4186  ** If the BTREE_WRCSR bit of wrFlag is clear, then the cursor can only
  4187  ** be used for reading.  If the BTREE_WRCSR bit is set, then the cursor
  4188  ** can be used for reading or for writing if other conditions for writing
  4189  ** are also met.  These are the conditions that must be met in order
  4190  ** for writing to be allowed:
  4191  **
  4192  ** 1:  The cursor must have been opened with wrFlag containing BTREE_WRCSR
  4193  **
  4194  ** 2:  Other database connections that share the same pager cache
  4195  **     but which are not in the READ_UNCOMMITTED state may not have
  4196  **     cursors open with wrFlag==0 on the same table.  Otherwise
  4197  **     the changes made by this write cursor would be visible to
  4198  **     the read cursors in the other database connection.
  4199  **
  4200  ** 3:  The database must be writable (not on read-only media)
  4201  **
  4202  ** 4:  There must be an active transaction.
  4203  **
  4204  ** The BTREE_FORDELETE bit of wrFlag may optionally be set if BTREE_WRCSR
  4205  ** is set.  If FORDELETE is set, that is a hint to the implementation that
  4206  ** this cursor will only be used to seek to and delete entries of an index
  4207  ** as part of a larger DELETE statement.  The FORDELETE hint is not used by
  4208  ** this implementation.  But in a hypothetical alternative storage engine 
  4209  ** in which index entries are automatically deleted when corresponding table
  4210  ** rows are deleted, the FORDELETE flag is a hint that all SEEK and DELETE
  4211  ** operations on this cursor can be no-ops and all READ operations can 
  4212  ** return a null row (2-bytes: 0x01 0x00).
  4213  **
  4214  ** No checking is done to make sure that page iTable really is the
  4215  ** root page of a b-tree.  If it is not, then the cursor acquired
  4216  ** will not work correctly.
  4217  **
  4218  ** It is assumed that the sqlite3BtreeCursorZero() has been called
  4219  ** on pCur to initialize the memory space prior to invoking this routine.
  4220  */
  4221  static int btreeCursor(
  4222    Btree *p,                              /* The btree */
  4223    int iTable,                            /* Root page of table to open */
  4224    int wrFlag,                            /* 1 to write. 0 read-only */
  4225    struct KeyInfo *pKeyInfo,              /* First arg to comparison function */
  4226    BtCursor *pCur                         /* Space for new cursor */
  4227  ){
  4228    BtShared *pBt = p->pBt;                /* Shared b-tree handle */
  4229    BtCursor *pX;                          /* Looping over other all cursors */
  4230  
  4231    assert( sqlite3BtreeHoldsMutex(p) );
  4232    assert( wrFlag==0 
  4233         || wrFlag==BTREE_WRCSR 
  4234         || wrFlag==(BTREE_WRCSR|BTREE_FORDELETE) 
  4235    );
  4236  
  4237    /* The following assert statements verify that if this is a sharable 
  4238    ** b-tree database, the connection is holding the required table locks, 
  4239    ** and that no other connection has any open cursor that conflicts with 
  4240    ** this lock.  */
  4241    assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, (wrFlag?2:1)) );
  4242    assert( wrFlag==0 || !hasReadConflicts(p, iTable) );
  4243  
  4244    /* Assert that the caller has opened the required transaction. */
  4245    assert( p->inTrans>TRANS_NONE );
  4246    assert( wrFlag==0 || p->inTrans==TRANS_WRITE );
  4247    assert( pBt->pPage1 && pBt->pPage1->aData );
  4248    assert( wrFlag==0 || (pBt->btsFlags & BTS_READ_ONLY)==0 );
  4249  
  4250    if( wrFlag ){
  4251      allocateTempSpace(pBt);
  4252      if( pBt->pTmpSpace==0 ) return SQLITE_NOMEM_BKPT;
  4253    }
  4254    if( iTable==1 && btreePagecount(pBt)==0 ){
  4255      assert( wrFlag==0 );
  4256      iTable = 0;
  4257    }
  4258  
  4259    /* Now that no other errors can occur, finish filling in the BtCursor
  4260    ** variables and link the cursor into the BtShared list.  */
  4261    pCur->pgnoRoot = (Pgno)iTable;
  4262    pCur->iPage = -1;
  4263    pCur->pKeyInfo = pKeyInfo;
  4264    pCur->pBtree = p;
  4265    pCur->pBt = pBt;
  4266    pCur->curFlags = wrFlag ? BTCF_WriteFlag : 0;
  4267    pCur->curPagerFlags = wrFlag ? 0 : PAGER_GET_READONLY;
  4268    /* If there are two or more cursors on the same btree, then all such
  4269    ** cursors *must* have the BTCF_Multiple flag set. */
  4270    for(pX=pBt->pCursor; pX; pX=pX->pNext){
  4271      if( pX->pgnoRoot==(Pgno)iTable ){
  4272        pX->curFlags |= BTCF_Multiple;
  4273        pCur->curFlags |= BTCF_Multiple;
  4274      }
  4275    }
  4276    pCur->pNext = pBt->pCursor;
  4277    pBt->pCursor = pCur;
  4278    pCur->eState = CURSOR_INVALID;
  4279    return SQLITE_OK;
  4280  }
  4281  int sqlite3BtreeCursor(
  4282    Btree *p,                                   /* The btree */
  4283    int iTable,                                 /* Root page of table to open */
  4284    int wrFlag,                                 /* 1 to write. 0 read-only */
  4285    struct KeyInfo *pKeyInfo,                   /* First arg to xCompare() */
  4286    BtCursor *pCur                              /* Write new cursor here */
  4287  ){
  4288    int rc;
  4289    if( iTable<1 ){
  4290      rc = SQLITE_CORRUPT_BKPT;
  4291    }else{
  4292      sqlite3BtreeEnter(p);
  4293      rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
  4294      sqlite3BtreeLeave(p);
  4295    }
  4296    return rc;
  4297  }
  4298  
  4299  /*
  4300  ** Return the size of a BtCursor object in bytes.
  4301  **
  4302  ** This interfaces is needed so that users of cursors can preallocate
  4303  ** sufficient storage to hold a cursor.  The BtCursor object is opaque
  4304  ** to users so they cannot do the sizeof() themselves - they must call
  4305  ** this routine.
  4306  */
  4307  int sqlite3BtreeCursorSize(void){
  4308    return ROUND8(sizeof(BtCursor));
  4309  }
  4310  
  4311  /*
  4312  ** Initialize memory that will be converted into a BtCursor object.
  4313  **
  4314  ** The simple approach here would be to memset() the entire object
  4315  ** to zero.  But it turns out that the apPage[] and aiIdx[] arrays
  4316  ** do not need to be zeroed and they are large, so we can save a lot
  4317  ** of run-time by skipping the initialization of those elements.
  4318  */
  4319  void sqlite3BtreeCursorZero(BtCursor *p){
  4320    memset(p, 0, offsetof(BtCursor, iPage));
  4321  }
  4322  
  4323  /*
  4324  ** Close a cursor.  The read lock on the database file is released
  4325  ** when the last cursor is closed.
  4326  */
  4327  int sqlite3BtreeCloseCursor(BtCursor *pCur){
  4328    Btree *pBtree = pCur->pBtree;
  4329    if( pBtree ){
  4330      BtShared *pBt = pCur->pBt;
  4331      sqlite3BtreeEnter(pBtree);
  4332      assert( pBt->pCursor!=0 );
  4333      if( pBt->pCursor==pCur ){
  4334        pBt->pCursor = pCur->pNext;
  4335      }else{
  4336        BtCursor *pPrev = pBt->pCursor;
  4337        do{
  4338          if( pPrev->pNext==pCur ){
  4339            pPrev->pNext = pCur->pNext;
  4340            break;
  4341          }
  4342          pPrev = pPrev->pNext;
  4343        }while( ALWAYS(pPrev) );
  4344      }
  4345      btreeReleaseAllCursorPages(pCur);
  4346      unlockBtreeIfUnused(pBt);
  4347      sqlite3_free(pCur->aOverflow);
  4348      sqlite3_free(pCur->pKey);
  4349      sqlite3BtreeLeave(pBtree);
  4350    }
  4351    return SQLITE_OK;
  4352  }
  4353  
  4354  /*
  4355  ** Make sure the BtCursor* given in the argument has a valid
  4356  ** BtCursor.info structure.  If it is not already valid, call
  4357  ** btreeParseCell() to fill it in.
  4358  **
  4359  ** BtCursor.info is a cache of the information in the current cell.
  4360  ** Using this cache reduces the number of calls to btreeParseCell().
  4361  */
  4362  #ifndef NDEBUG
  4363    static void assertCellInfo(BtCursor *pCur){
  4364      CellInfo info;
  4365      memset(&info, 0, sizeof(info));
  4366      btreeParseCell(pCur->pPage, pCur->ix, &info);
  4367      assert( CORRUPT_DB || memcmp(&info, &pCur->info, sizeof(info))==0 );
  4368    }
  4369  #else
  4370    #define assertCellInfo(x)
  4371  #endif
  4372  static SQLITE_NOINLINE void getCellInfo(BtCursor *pCur){
  4373    if( pCur->info.nSize==0 ){
  4374      pCur->curFlags |= BTCF_ValidNKey;
  4375      btreeParseCell(pCur->pPage,pCur->ix,&pCur->info);
  4376    }else{
  4377      assertCellInfo(pCur);
  4378    }
  4379  }
  4380  
  4381  #ifndef NDEBUG  /* The next routine used only within assert() statements */
  4382  /*
  4383  ** Return true if the given BtCursor is valid.  A valid cursor is one
  4384  ** that is currently pointing to a row in a (non-empty) table.
  4385  ** This is a verification routine is used only within assert() statements.
  4386  */
  4387  int sqlite3BtreeCursorIsValid(BtCursor *pCur){
  4388    return pCur && pCur->eState==CURSOR_VALID;
  4389  }
  4390  #endif /* NDEBUG */
  4391  int sqlite3BtreeCursorIsValidNN(BtCursor *pCur){
  4392    assert( pCur!=0 );
  4393    return pCur->eState==CURSOR_VALID;
  4394  }
  4395  
  4396  /*
  4397  ** Return the value of the integer key or "rowid" for a table btree.
  4398  ** This routine is only valid for a cursor that is pointing into a
  4399  ** ordinary table btree.  If the cursor points to an index btree or
  4400  ** is invalid, the result of this routine is undefined.
  4401  */
  4402  i64 sqlite3BtreeIntegerKey(BtCursor *pCur){
  4403    assert( cursorHoldsMutex(pCur) );
  4404    assert( pCur->eState==CURSOR_VALID );
  4405    assert( pCur->curIntKey );
  4406    getCellInfo(pCur);
  4407    return pCur->info.nKey;
  4408  }
  4409  
  4410  /*
  4411  ** Return the number of bytes of payload for the entry that pCur is
  4412  ** currently pointing to.  For table btrees, this will be the amount
  4413  ** of data.  For index btrees, this will be the size of the key.
  4414  **
  4415  ** The caller must guarantee that the cursor is pointing to a non-NULL
  4416  ** valid entry.  In other words, the calling procedure must guarantee
  4417  ** that the cursor has Cursor.eState==CURSOR_VALID.
  4418  */
  4419  u32 sqlite3BtreePayloadSize(BtCursor *pCur){
  4420    assert( cursorHoldsMutex(pCur) );
  4421    assert( pCur->eState==CURSOR_VALID );
  4422    getCellInfo(pCur);
  4423    return pCur->info.nPayload;
  4424  }
  4425  
  4426  /*
  4427  ** Given the page number of an overflow page in the database (parameter
  4428  ** ovfl), this function finds the page number of the next page in the 
  4429  ** linked list of overflow pages. If possible, it uses the auto-vacuum
  4430  ** pointer-map data instead of reading the content of page ovfl to do so. 
  4431  **
  4432  ** If an error occurs an SQLite error code is returned. Otherwise:
  4433  **
  4434  ** The page number of the next overflow page in the linked list is 
  4435  ** written to *pPgnoNext. If page ovfl is the last page in its linked 
  4436  ** list, *pPgnoNext is set to zero. 
  4437  **
  4438  ** If ppPage is not NULL, and a reference to the MemPage object corresponding
  4439  ** to page number pOvfl was obtained, then *ppPage is set to point to that
  4440  ** reference. It is the responsibility of the caller to call releasePage()
  4441  ** on *ppPage to free the reference. In no reference was obtained (because
  4442  ** the pointer-map was used to obtain the value for *pPgnoNext), then
  4443  ** *ppPage is set to zero.
  4444  */
  4445  static int getOverflowPage(
  4446    BtShared *pBt,               /* The database file */
  4447    Pgno ovfl,                   /* Current overflow page number */
  4448    MemPage **ppPage,            /* OUT: MemPage handle (may be NULL) */
  4449    Pgno *pPgnoNext              /* OUT: Next overflow page number */
  4450  ){
  4451    Pgno next = 0;
  4452    MemPage *pPage = 0;
  4453    int rc = SQLITE_OK;
  4454  
  4455    assert( sqlite3_mutex_held(pBt->mutex) );
  4456    assert(pPgnoNext);
  4457  
  4458  #ifndef SQLITE_OMIT_AUTOVACUUM
  4459    /* Try to find the next page in the overflow list using the
  4460    ** autovacuum pointer-map pages. Guess that the next page in 
  4461    ** the overflow list is page number (ovfl+1). If that guess turns 
  4462    ** out to be wrong, fall back to loading the data of page 
  4463    ** number ovfl to determine the next page number.
  4464    */
  4465    if( pBt->autoVacuum ){
  4466      Pgno pgno;
  4467      Pgno iGuess = ovfl+1;
  4468      u8 eType;
  4469  
  4470      while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
  4471        iGuess++;
  4472      }
  4473  
  4474      if( iGuess<=btreePagecount(pBt) ){
  4475        rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
  4476        if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
  4477          next = iGuess;
  4478          rc = SQLITE_DONE;
  4479        }
  4480      }
  4481    }
  4482  #endif
  4483  
  4484    assert( next==0 || rc==SQLITE_DONE );
  4485    if( rc==SQLITE_OK ){
  4486      rc = btreeGetPage(pBt, ovfl, &pPage, (ppPage==0) ? PAGER_GET_READONLY : 0);
  4487      assert( rc==SQLITE_OK || pPage==0 );
  4488      if( rc==SQLITE_OK ){
  4489        next = get4byte(pPage->aData);
  4490      }
  4491    }
  4492  
  4493    *pPgnoNext = next;
  4494    if( ppPage ){
  4495      *ppPage = pPage;
  4496    }else{
  4497      releasePage(pPage);
  4498    }
  4499    return (rc==SQLITE_DONE ? SQLITE_OK : rc);
  4500  }
  4501  
  4502  /*
  4503  ** Copy nByte bytes between a caller-supplied buffer and page content.
  4504  **
  4505  ** pPayload is a pointer to data stored on database page pDbPage.
  4506  ** If argument eOp is zero (a read), nByte bytes of data are copied
  4507  ** from pPayload to the buffer pointed at by pBuf. If eOp is non-zero
  4508  ** (a write), sqlite3PagerWrite() is called on pDbPage first and, only
  4509  ** if that call succeeds, nByte bytes are copied from pBuf to pPayload.
  4510  **
  4511  ** SQLITE_OK is returned on success. The only error path is a failed
  4512  ** sqlite3PagerWrite(); its code is returned and nothing is copied. */
  4513  static int copyPayload(
  4514    void *pPayload,           /* Pointer to page data */
  4515    void *pBuf,               /* Pointer to buffer */
  4516    int nByte,                /* Number of bytes to copy */
  4517    int eOp,                  /* 0 -> copy from page, non-zero -> copy to page */
  4518    DbPage *pDbPage           /* Page containing pPayload */
  4519  ){
  4520    if( eOp ){
  4521      /* Write: sqlite3PagerWrite() must succeed before the page is changed */
  4522      int rc = sqlite3PagerWrite(pDbPage);
  4523      if( rc!=SQLITE_OK ){
  4524        return rc;              /* Page content left unmodified */
  4525      }
  4526      memcpy(pPayload, pBuf, nByte);
  4527    }else{
  4528      /* Read: copy page content out to the caller's buffer */
  4529      memcpy(pBuf, pPayload, nByte);
  4530    }
  4531    return SQLITE_OK;
  4532  }
  4533  
  4534  /*
  4535  ** Read or overwrite part of the payload of the entry that the pCur
  4536  ** cursor is pointing to. Both values of eOp populate the overflow
  4537  ** cache; they differ only in the direction of the copy:
  4538  **
  4539  **   0: The operation is a read  (payload -> pBuf).
  4540  **   1: The operation is a write (pBuf -> payload).
  4541  **
  4542  ** A total of "amt" bytes are transferred beginning at "offset" within
  4543  ** the payload. offset+amt<=nPayload is checked by an assert() only.
  4544  **
  4545  ** The content being read or written might appear on the main page
  4546  ** or be scattered out on multiple overflow pages.
  4547  **
  4548  ** If the current cursor entry uses one or more overflow pages
  4549  ** this function may allocate space for and lazily populate
  4550  ** the overflow page-list cache array (BtCursor.aOverflow).
  4551  ** Subsequent calls use this cache to make seeking to the supplied offset
  4552  ** more efficient.
  4553  **
  4554  ** Once an overflow page-list cache has been allocated, it must be
  4555  ** invalidated if some other cursor writes to the same table, or if
  4556  ** the cursor is moved to a different row. Additionally, in auto-vacuum
  4557  ** mode, the following events may invalidate an overflow page-list cache.
  4558  **
  4559  **   * An incremental vacuum,
  4560  **   * A commit in auto_vacuum="full" mode,
  4561  **   * Creating a table (may require moving an overflow page).
  4562  ** Returns SQLITE_OK on success, otherwise an error code. */
  4563  static int accessPayload(
  4564    BtCursor *pCur,      /* Cursor pointing to the entry to access */
  4565    u32 offset,          /* Begin this many bytes into the payload */
  4566    u32 amt,             /* Transfer this many bytes */
  4567    unsigned char *pBuf, /* Destination (read) or source (write) buffer */ 
  4568    int eOp              /* zero to read. non-zero to write. */
  4569  ){
  4570    unsigned char *aPayload;                    /* Local payload, later ovfl page data */
  4571    int rc = SQLITE_OK;
  4572    int iIdx = 0;                               /* aOverflow[] slot for nextPage */
  4573    MemPage *pPage = pCur->pPage;               /* Btree page of current entry */
  4574    BtShared *pBt = pCur->pBt;                  /* Btree this cursor belongs to */
  4575  #ifdef SQLITE_DIRECT_OVERFLOW_READ
  4576    unsigned char * const pBufStart = pBuf;     /* Start of original out buffer */
  4577  #endif
  4578  
  4579    assert( pPage );
  4580    assert( eOp==0 || eOp==1 );
  4581    assert( pCur->eState==CURSOR_VALID );
  4582    assert( pCur->ix<pPage->nCell );
  4583    assert( cursorHoldsMutex(pCur) );
  4584  
  4585    getCellInfo(pCur);
  4586    aPayload = pCur->info.pPayload;
  4587    assert( offset+amt <= pCur->info.nPayload );
  4588  
  4589    assert( aPayload > pPage->aData );
  4590    if( (uptr)(aPayload - pPage->aData) > (pBt->usableSize - pCur->info.nLocal) ){
  4591      /* Trying to read or write past the end of the data is an error.  The
  4592      ** conditional above is really:
  4593      **    &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize]
  4594      ** but is recast into its current form to avoid integer overflow problems
  4595      */
  4596      return SQLITE_CORRUPT_PGNO(pPage->pgno);
  4597    }
  4598  
  4599    /* Check if data must be read/written to/from the btree page itself. */
  4600    if( offset<pCur->info.nLocal ){
  4601      int a = amt;                       /* Bytes taken from the local portion */
  4602      if( a+offset>pCur->info.nLocal ){
  4603        a = pCur->info.nLocal - offset;  /* Clip to the end of the local portion */
  4604      }
  4605      rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
  4606      offset = 0;                        /* Remainder starts at byte 0 of overflow */
  4607      pBuf += a;
  4608      amt -= a;
  4609    }else{
  4610      offset -= pCur->info.nLocal;       /* Rebase offset onto overflow content */
  4611    }
  4612  
  4613  
  4614    if( rc==SQLITE_OK && amt>0 ){
  4615      const u32 ovflSize = pBt->usableSize - 4;  /* Bytes content per ovfl page */
  4616      Pgno nextPage;
  4617  
  4618      nextPage = get4byte(&aPayload[pCur->info.nLocal]);  /* 1st ovfl page */
  4619  
  4620      /* If the overflow page-list cache is not valid, (re)build an empty one.
  4621      ** nOvfl is the overflow byte count divided by ovflSize, rounded up.
  4622      ** The aOverflow[] array is sized at one entry for each overflow page
  4623      ** in the overflow chain. The page number of the first overflow page is
  4624      ** stored in aOverflow[0], etc. A value of 0 in the aOverflow[] array
  4625      ** means "not yet known" (the cache is lazily populated).
  4626      ** When the array must grow, room for 2*nOvfl entries is requested. */
  4627      if( (pCur->curFlags & BTCF_ValidOvfl)==0 ){
  4628        int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
  4629        if( nOvfl>pCur->nOvflAlloc ){
  4630          Pgno *aNew = (Pgno*)sqlite3Realloc(
  4631              pCur->aOverflow, nOvfl*2*sizeof(Pgno)
  4632          );
  4633          if( aNew==0 ){
  4634            return SQLITE_NOMEM_BKPT;    /* pCur->aOverflow is left as it was */
  4635          }else{
  4636            pCur->nOvflAlloc = nOvfl*2;
  4637            pCur->aOverflow = aNew;
  4638          }
  4639        }
  4640        memset(pCur->aOverflow, 0, nOvfl*sizeof(Pgno));  /* All "not yet known" */
  4641        pCur->curFlags |= BTCF_ValidOvfl;
  4642      }else{
  4643        /* If the overflow page-list cache has been allocated and the
  4644        ** entry for the first required overflow page is valid, skip
  4645        ** directly to it.
  4646        */
  4647        if( pCur->aOverflow[offset/ovflSize] ){
  4648          iIdx = (offset/ovflSize);
  4649          nextPage = pCur->aOverflow[iIdx];
  4650          offset = (offset%ovflSize);    /* Now relative to that page's content */
  4651        }
  4652      }
  4653  
  4654      assert( rc==SQLITE_OK && amt>0 );
  4655      while( nextPage ){
  4656        /* If required, populate the overflow page-list cache. */
  4657        assert( pCur->aOverflow[iIdx]==0
  4658                || pCur->aOverflow[iIdx]==nextPage
  4659                || CORRUPT_DB );
  4660        pCur->aOverflow[iIdx] = nextPage;
  4661  
  4662        if( offset>=ovflSize ){
  4663          /* The only reason to read this page is to obtain the page
  4664          ** number for the next page in the overflow chain. The page
  4665          ** data is not required. So first try to lookup the overflow
  4666          ** page-list cache, if any, then fall back to the getOverflowPage()
  4667          ** function.
  4668          */
  4669          assert( pCur->curFlags & BTCF_ValidOvfl );
  4670          assert( pCur->pBtree->db==pBt->db );
  4671          if( pCur->aOverflow[iIdx+1] ){
  4672            nextPage = pCur->aOverflow[iIdx+1];
  4673          }else{
  4674            rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
  4675          }
  4676          offset -= ovflSize;            /* This whole page is skipped */
  4677        }else{
  4678          /* Need to read this page properly. It contains some of the
  4679          ** range of data that is being read (eOp==0) or written (eOp!=0).
  4680          */
  4681  #ifdef SQLITE_DIRECT_OVERFLOW_READ
  4682          sqlite3_file *fd;      /* File from which to do direct overflow read */
  4683  #endif
  4684          int a = amt;           /* Bytes to transfer on this overflow page */
  4685          if( a + offset > ovflSize ){
  4686            a = ovflSize - offset;
  4687          }
  4688  
  4689  #ifdef SQLITE_DIRECT_OVERFLOW_READ
  4690          /* If all the following are true:
  4691          **
  4692          **   1) this is a read operation, and 
  4693          **   2) data is required from the start of this overflow page, and
  4694          **   3) there is no open write-transaction, and
  4695          **   4) the database is file-backed, and
  4696          **   5) the page is not in the WAL file
  4697          **   6) at least 4 bytes have already been read into the output buffer 
  4698          **
  4699          ** then data can be read directly from the database file into the
  4700          ** output buffer, bypassing the page-cache altogether. This speeds
  4701          ** up loading large records that span many overflow pages.
  4702          */
  4703          if( eOp==0                                             /* (1) */
  4704           && offset==0                                          /* (2) */
  4705           && pBt->inTransaction==TRANS_READ                     /* (3) */
  4706           && (fd = sqlite3PagerFile(pBt->pPager))->pMethods     /* (4) */
  4707           && 0==sqlite3PagerUseWal(pBt->pPager, nextPage)       /* (5) */
  4708           && &pBuf[-4]>=pBufStart                               /* (6) */
  4709          ){
  4710            u8 aSave[4];
  4711            u8 *aWrite = &pBuf[-4];  /* Page's 4-byte next pointer lands here */
  4712            assert( aWrite>=pBufStart );                         /* due to (6) */
  4713            memcpy(aSave, aWrite, 4);  /* Save output bytes about to be clobbered */
  4714            rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1));
  4715            nextPage = get4byte(aWrite);
  4716            memcpy(aWrite, aSave, 4);  /* Restore the saved output bytes */
  4717          }else
  4718  #endif
  4719  
  4720          {
  4721            DbPage *pDbPage;
  4722            rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage,
  4723                (eOp==0 ? PAGER_GET_READONLY : 0)
  4724            );
  4725            if( rc==SQLITE_OK ){
  4726              aPayload = sqlite3PagerGetData(pDbPage);
  4727              nextPage = get4byte(aPayload);  /* First 4 bytes: next ovfl page */
  4728              rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
  4729              sqlite3PagerUnref(pDbPage);
  4730              offset = 0;          /* Later pages are accessed from their start */
  4731            }
  4732          }
  4733          amt -= a;
  4734          if( amt==0 ) return rc;  /* Nothing left to transfer; rc may be an error */
  4735          pBuf += a;
  4736        }
  4737        if( rc ) break;
  4738        iIdx++;
  4739      }
  4740    }
  4741  
  4742    if( rc==SQLITE_OK && amt>0 ){
  4743      /* Overflow chain ends prematurely */
  4744      return SQLITE_CORRUPT_PGNO(pPage->pgno);
  4745    }
  4746    return rc;
  4747  }
  4748  
  4749  /*
  4750  ** Read part of the payload for the row at which cursor pCur is currently
  4751  ** pointing.  "amt" bytes will be transferred into pBuf[].  The transfer
  4752  ** begins at "offset".
  4753  **
  4754  ** pCur can be pointing to either a table or an index b-tree.
  4755  ** If pointing to a table btree, then the content section is read.  If
  4756  ** pCur is pointing to an index b-tree then the key section is read.
  4757  **
  4758  ** For sqlite3BtreePayload(), the caller must ensure that pCur is pointing
  4759  ** to a valid row in the table.  For sqlite3BtreePayloadChecked(), the
  4760  ** cursor might be invalid or might need to be restored before being read.
  4761  **
  4762  ** Return SQLITE_OK on success or an error code if anything goes wrong.
  4763  ** "offset+amt" must not exceed the available payload.  NOTE(review):
  4764  ** accessPayload() checks this with assert() only - confirm callers comply.
  4765  */
  4766  int sqlite3BtreePayload(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
  4767    assert( cursorHoldsMutex(pCur) );
  4768    assert( pCur->eState==CURSOR_VALID );
  4769    assert( pCur->iPage>=0 && pCur->pPage );
  4770    assert( pCur->ix<pCur->pPage->nCell );
  4771    return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0); /* eOp=0: read */
  4772  }
  4773  
  4774  /*
  4775  ** This variant of sqlite3BtreePayload() works even if the cursor is not
  4776  ** in the CURSOR_VALID state.  It is only used by the sqlite3_blob_read()
  4777  ** interface.  Both routines are omitted under SQLITE_OMIT_INCRBLOB.
  4778  */
  4779  #ifndef SQLITE_OMIT_INCRBLOB
  4780  static SQLITE_NOINLINE int accessPayloadChecked(
  4781    BtCursor *pCur,
  4782    u32 offset,
  4783    u32 amt,
  4784    void *pBuf
  4785  ){
  4786    int rc;
  4787    if ( pCur->eState==CURSOR_INVALID ){
  4788      return SQLITE_ABORT;      /* Nothing to restore; the read cannot proceed */
  4789    }
  4790    assert( cursorOwnsBtShared(pCur) );
  4791    rc = btreeRestoreCursorPosition(pCur);
  4792    return rc ? rc : accessPayload(pCur, offset, amt, pBuf, 0);  /* Read only if restored */
  4793  }
  4794  int sqlite3BtreePayloadChecked(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
  4795    if( pCur->eState==CURSOR_VALID ){
  4796      assert( cursorOwnsBtShared(pCur) );
  4797      return accessPayload(pCur, offset, amt, pBuf, 0);   /* Valid cursor: read directly */
  4798    }else{
  4799      return accessPayloadChecked(pCur, offset, amt, pBuf);  /* Out-of-line slow path */
  4800    }
  4801  }
  4802  #endif /* SQLITE_OMIT_INCRBLOB */
  4803  
  4804  /*
  4805  ** Return a pointer to payload information from the entry that the 
  4806  ** pCur cursor is pointing to.  The pointer is to the beginning of
  4807  ** the key for index btrees (pPage->intKey==0) and is the data for
  4808  ** table btrees (pPage->intKey==1). The number of bytes of available
  4809  ** key/data is written into *pAmt.  If *pAmt==0, then the value
  4810  ** returned will not be a valid pointer.
  4811  **
  4812  ** This routine is an optimization.  It is common for the entire key
  4813  ** and data to fit on the local page and for there to be no overflow
  4814  ** pages.  When that is so, this routine can be used to access the
  4815  ** key and data without making a copy.  If the key and/or data spills
  4816  ** onto overflow pages, then accessPayload() must be used to reassemble
  4817  ** the key/data and copy it into a preallocated buffer.
  4818  **
  4819  ** The pointer returned by this routine looks directly into the cached
  4820  ** page of the database.  The data might change or move the next time
  4821  ** any btree routine is called.
  4822  */
  4823  static const void *fetchPayload(
  4824    BtCursor *pCur,      /* Cursor pointing to entry to read from */
  4825    u32 *pAmt            /* Write the number of available bytes here */
  4826  ){
  4827    int amt;             /* Bytes of payload usable on the local page */
  4828    assert( pCur!=0 && pCur->iPage>=0 && pCur->pPage);
  4829    assert( pCur->eState==CURSOR_VALID );
  4830    assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
  4831    assert( cursorOwnsBtShared(pCur) );
  4832    assert( pCur->ix<pCur->pPage->nCell );
  4833    assert( pCur->info.nSize>0 );
  4834    assert( pCur->info.pPayload>pCur->pPage->aData || CORRUPT_DB );
  4835    assert( pCur->info.pPayload<pCur->pPage->aDataEnd ||CORRUPT_DB);
  4836    amt = pCur->info.nLocal;
  4837    if( amt>(int)(pCur->pPage->aDataEnd - pCur->info.pPayload) ){
  4838      /* There is too little space on the page for the expected amount
  4839      ** of local content. Database must be corrupt. Clamp amt to [0, bytes left]. */
  4840      assert( CORRUPT_DB );
  4841      amt = MAX(0, (int)(pCur->pPage->aDataEnd - pCur->info.pPayload));
  4842    }
  4843    *pAmt = (u32)amt;
  4844    return (void*)pCur->info.pPayload;
  4845  }
  4846  
  4847  
  4848  /*
  4849  ** For the entry that cursor pCur is pointing to, return as
  4850  ** many bytes of the key or data as are available on the local
  4851  ** b-tree page.  Write the number of available bytes into *pAmt.
  4852  **
  4853  ** The pointer returned is ephemeral.  The key/data may move
  4854  ** or be destroyed on the next call to any Btree routine,
  4855  ** including calls from other threads against the same cache.
  4856  ** Hence, a mutex on the BtShared should be held prior to calling
  4857  ** this routine.
  4858  **
  4859  ** This routine is used to get quick access to key and data
  4860  ** in the common case where no overflow pages are used.
  4861  */
  4862  const void *sqlite3BtreePayloadFetch(BtCursor *pCur, u32 *pAmt){
  4863    return fetchPayload(pCur, pAmt);   /* Public wrapper around the static helper */
  4864  }
  4865  
  4866  
  4867  /*
  4868  ** Move the cursor down to a new child page.  The newPgno argument is the
  4869  ** page number of the child page to move to.
  4870  **
  4871  ** This function returns SQLITE_CORRUPT if the page-header flags field of
  4872  ** the new child page does not match the flags field of the parent (i.e.
  4873  ** if an intkey page appears to be the parent of a non-intkey page, or
  4874  ** vice-versa), or if descending would exceed BTCURSOR_MAX_DEPTH.
  4875  */
  4876  static int moveToChild(BtCursor *pCur, u32 newPgno){
  4877    BtShared *pBt = pCur->pBt;
  4878  
  4879    assert( cursorOwnsBtShared(pCur) );
  4880    assert( pCur->eState==CURSOR_VALID );
  4881    assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
  4882    assert( pCur->iPage>=0 );
  4883    if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
  4884      return SQLITE_CORRUPT_BKPT;        /* No room left in the page stack */
  4885    }
  4886    pCur->info.nSize = 0;
  4887    pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
  4888    pCur->aiIdx[pCur->iPage] = pCur->ix;      /* Remember position in the parent */
  4889    pCur->apPage[pCur->iPage] = pCur->pPage;  /* Push the parent page */
  4890    pCur->ix = 0;
  4891    pCur->iPage++;
  4892    return getAndInitPage(pBt, newPgno, &pCur->pPage, pCur, pCur->curPagerFlags);
  4893  }
  4894  
  4895  #ifdef SQLITE_DEBUG
  4896  /*
  4897  ** Page pParent is an internal (non-leaf) tree page. This function 
  4898  ** asserts that page number iChild is the left-child of the iIdx'th
  4899  ** cell in page pParent. Or, if iIdx is equal to the total number of
  4900  ** cells in pParent, that page number iChild is the right-child of
  4901  ** the page.  Without SQLITE_DEBUG this is an empty macro.
  4902  */
  4903  static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
  4904    if( CORRUPT_DB ) return;  /* The conditions tested below might not be true
  4905                              ** in a corrupt database */
  4906    assert( iIdx<=pParent->nCell );
  4907    if( iIdx==pParent->nCell ){
  4908      assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );  /* right-child */
  4909    }else{
  4910      assert( get4byte(findCell(pParent, iIdx))==iChild );  /* cell's left-child */
  4911    }
  4912  }
  4913  #else
  4914  #  define assertParentIndex(x,y,z) 
  4915  #endif
  4916  
  4917  /*
  4918  ** Move the cursor up to the parent page.
  4919  **
  4920  ** pCur->ix is set to the cell index that contains the pointer
  4921  ** to the page we are coming from.  If we are coming from the
  4922  ** right-most child page then pCur->ix is set to one more than
  4923  ** the largest cell index.  The page being left is released.
  4924  */
  4925  static void moveToParent(BtCursor *pCur){
  4926    MemPage *pLeaf;                    /* The page being left (released below) */
  4927    assert( cursorOwnsBtShared(pCur) );
  4928    assert( pCur->eState==CURSOR_VALID );
  4929    assert( pCur->iPage>0 );
  4930    assert( pCur->pPage );
  4931    assertParentIndex(
  4932      pCur->apPage[pCur->iPage-1], 
  4933      pCur->aiIdx[pCur->iPage-1], 
  4934      pCur->pPage->pgno
  4935    );
  4936    testcase( pCur->aiIdx[pCur->iPage-1] > pCur->apPage[pCur->iPage-1]->nCell );
  4937    pCur->info.nSize = 0;
  4938    pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
  4939    pCur->ix = pCur->aiIdx[pCur->iPage-1];      /* Index saved by moveToChild() */
  4940    pLeaf = pCur->pPage;
  4941    pCur->pPage = pCur->apPage[--pCur->iPage];  /* Pop the parent page */
  4942    releasePageNotNull(pLeaf);
  4943  }
  4944  
  4945  /*
  4946  ** Move the cursor to point to the root page of its b-tree structure.
  4947  **
  4948  ** If the table has a virtual root page, then the cursor is moved to point
  4949  ** to the virtual root page instead of the actual root page. A table has a
  4950  ** virtual root page when the actual root page contains no cells and a 
  4951  ** single child page. This can only happen with the table rooted at page 1.
  4952  **
  4953  ** If the b-tree structure is empty, the cursor state is set to 
  4954  ** CURSOR_INVALID and this routine returns SQLITE_EMPTY. Otherwise,
  4955  ** the cursor is set to point to the first cell located on the root
  4956  ** (or virtual root) page and the cursor state is set to CURSOR_VALID.
  4957  ** Other returns: a corruption code, a page-load error, or pCur->skipNext.
  4958  ** If this function returns successfully, it may be assumed that the
  4959  ** page-header flags indicate that the [virtual] root-page is the expected 
  4960  ** kind of b-tree page (i.e. if when opening the cursor the caller did not
  4961  ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D,
  4962  ** indicating a table b-tree, or if the caller did specify a KeyInfo 
  4963  ** structure the flags byte is set to 0x02 or 0x0A, indicating an index
  4964  ** b-tree).
  4965  */
  4966  static int moveToRoot(BtCursor *pCur){
  4967    MemPage *pRoot;
  4968    int rc = SQLITE_OK;
  4969  
  4970    assert( cursorOwnsBtShared(pCur) );
  4971    assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
  4972    assert( CURSOR_VALID   < CURSOR_REQUIRESEEK );
  4973    assert( CURSOR_FAULT   > CURSOR_REQUIRESEEK );
  4974    assert( pCur->eState < CURSOR_REQUIRESEEK || pCur->iPage<0 );
  4975    assert( pCur->pgnoRoot>0 || pCur->iPage<0 );
  4976  
  4977    if( pCur->iPage>=0 ){                /* Cursor already holds a page stack */
  4978      if( pCur->iPage ){                 /* Below the root: unwind to apPage[0] */
  4979        releasePageNotNull(pCur->pPage);
  4980        while( --pCur->iPage ){
  4981          releasePageNotNull(pCur->apPage[pCur->iPage]);
  4982        }
  4983        pCur->pPage = pCur->apPage[0];
  4984        goto skip_init;                  /* Bypasses the root-page type check */
  4985      }
  4986    }else if( pCur->pgnoRoot==0 ){       /* No root page: report an empty tree */
  4987      pCur->eState = CURSOR_INVALID;
  4988      return SQLITE_EMPTY;
  4989    }else{
  4990      assert( pCur->iPage==(-1) );
  4991      if( pCur->eState>=CURSOR_REQUIRESEEK ){
  4992        if( pCur->eState==CURSOR_FAULT ){
  4993          assert( pCur->skipNext!=SQLITE_OK );
  4994          return pCur->skipNext;         /* Holds the error code in this state */
  4995        }
  4996        sqlite3BtreeClearCursor(pCur);
  4997      }
  4998      rc = getAndInitPage(pCur->pBtree->pBt, pCur->pgnoRoot, &pCur->pPage,
  4999                          0, pCur->curPagerFlags);
  5000      if( rc!=SQLITE_OK ){
  5001        pCur->eState = CURSOR_INVALID;
  5002        return rc;
  5003      }
  5004      pCur->iPage = 0;                   /* Root page is now the whole stack */
  5005      pCur->curIntKey = pCur->pPage->intKey;
  5006    }
  5007    pRoot = pCur->pPage;
  5008    assert( pRoot->pgno==pCur->pgnoRoot );
  5009  
  5010    /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor
  5011    ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is
  5012    ** NULL, the caller expects a table b-tree. If this is not the case,
  5013    ** return an SQLITE_CORRUPT error. 
  5014    **
  5015    ** Earlier versions of SQLite assumed that this test could not fail
  5016    ** if the root page was already loaded when this function was called (i.e.
  5017    ** if pCur->iPage>=0). But this is not so if the database is corrupted 
  5018    ** in such a way that page pRoot is linked into a second b-tree table 
  5019    ** (or the freelist).  */
  5020    assert( pRoot->intKey==1 || pRoot->intKey==0 );
  5021    if( pRoot->isInit==0 || (pCur->pKeyInfo==0)!=pRoot->intKey ){
  5022      return SQLITE_CORRUPT_PGNO(pCur->pPage->pgno);
  5023    }
  5024  
  5025  skip_init:  
  5026    pCur->ix = 0;                        /* First cell of the root page */
  5027    pCur->info.nSize = 0;
  5028    pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidNKey|BTCF_ValidOvfl);
  5029  
  5030    pRoot = pCur->pPage;
  5031    if( pRoot->nCell>0 ){
  5032      pCur->eState = CURSOR_VALID;
  5033    }else if( !pRoot->leaf ){            /* No cells but has a child: virtual root */
  5034      Pgno subpage;
  5035      if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT;
  5036      subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
  5037      pCur->eState = CURSOR_VALID;       /* moveToChild() asserts CURSOR_VALID */
  5038      rc = moveToChild(pCur, subpage);
  5039    }else{                               /* Empty leaf root: the tree is empty */
  5040      pCur->eState = CURSOR_INVALID;
  5041      rc = SQLITE_EMPTY;
  5042    }
  5043    return rc;
  5044  }
  5045  
  5046  /*
  5047  ** Move the cursor down to the left-most leaf entry beneath the
  5048  ** entry to which it is currently pointing.
  5049  **
  5050  ** The left-most leaf is the one with the smallest key - the first
  5051  ** in ascending order.  Returns SQLITE_OK or a moveToChild() error.
  5052  */
  5053  static int moveToLeftmost(BtCursor *pCur){
  5054    Pgno pgno;                   /* Left-child page of the current cell */
  5055    int rc = SQLITE_OK;
  5056    MemPage *pPage;              /* Page the cursor is on in this iteration */
  5057  
  5058    assert( cursorOwnsBtShared(pCur) );
  5059    assert( pCur->eState==CURSOR_VALID );
  5060    while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){
  5061      assert( pCur->ix<pPage->nCell );
  5062      pgno = get4byte(findCell(pPage, pCur->ix));
  5063      rc = moveToChild(pCur, pgno);      /* Leaves pCur->ix at 0 on the child */
  5064    }
  5065    return rc;
  5066  }
  5067  
  5068  /*
  5069  ** Move the cursor down to the right-most leaf entry beneath the
  5070  ** page to which it is currently pointing.  Notice the difference
  5071  ** between moveToLeftmost() and moveToRightmost().  moveToLeftmost()
  5072  ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
  5073  ** finds the right-most entry beneath the *page*.
  5074  **
  5075  ** The right-most entry is the one with the largest key - the last
  5076  ** key in ascending order.  Returns SQLITE_OK or a moveToChild() error.
  5077  */
  5078  static int moveToRightmost(BtCursor *pCur){
  5079    Pgno pgno;                   /* Right-child page number of the current page */
  5080    int rc = SQLITE_OK;
  5081    MemPage *pPage = 0;
  5082  
  5083    assert( cursorOwnsBtShared(pCur) );
  5084    assert( pCur->eState==CURSOR_VALID );
  5085    while( !(pPage = pCur->pPage)->leaf ){
  5086      pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
  5087      pCur->ix = pPage->nCell;   /* One past the last cell: the right-child slot */
  5088      rc = moveToChild(pCur, pgno);
  5089      if( rc ) return rc;
  5090    }
  5091    pCur->ix = pPage->nCell-1;   /* Last cell on the leaf */
  5092    assert( pCur->info.nSize==0 );
  5093    assert( (pCur->curFlags & BTCF_ValidNKey)==0 );
  5094    return SQLITE_OK;
  5095  }
  5096  
  5097  /* Move the cursor to the first entry in the table.  Return SQLITE_OK
  5098  ** on success.  Set *pRes to 0 if the cursor actually points to something
  5099  ** or set *pRes to 1 if the table is empty.  *pRes is left unwritten when
  5100  ** moveToRoot() fails with anything other than SQLITE_EMPTY. */
  5101  int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
  5102    int rc;
  5103  
  5104    assert( cursorOwnsBtShared(pCur) );
  5105    assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
  5106    rc = moveToRoot(pCur);
  5107    if( rc==SQLITE_OK ){
  5108      assert( pCur->pPage->nCell>0 );
  5109      *pRes = 0;
  5110      rc = moveToLeftmost(pCur);         /* Descend from the root's first cell */
  5111    }else if( rc==SQLITE_EMPTY ){
  5112      assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 );
  5113      *pRes = 1;
  5114      rc = SQLITE_OK;                    /* An empty table is not an error */
  5115    }
  5116    return rc;
  5117  }
  5118  
  5119  /* Move the cursor to the last entry in the table.  Return SQLITE_OK
  5120  ** on success.  Set *pRes to 0 if the cursor actually points to something
  5121  ** or set *pRes to 1 if the table is empty.  *pRes is left unwritten when
  5122  ** moveToRoot() fails with anything other than SQLITE_EMPTY. */
  5123  int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
  5124    int rc;
  5125   
  5126    assert( cursorOwnsBtShared(pCur) );
  5127    assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
  5128  
  5129    /* If the cursor already points to the last entry, only *pRes is set. */
  5130    if( CURSOR_VALID==pCur->eState && (pCur->curFlags & BTCF_AtLast)!=0 ){
  5131  #ifdef SQLITE_DEBUG
  5132      /* This block serves to assert() that the cursor really does point 
  5133      ** to the last entry in the b-tree. */
  5134      int ii;
  5135      for(ii=0; ii<pCur->iPage; ii++){
  5136        assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell );
  5137      }
  5138      assert( pCur->ix==pCur->pPage->nCell-1 );
  5139      assert( pCur->pPage->leaf );
  5140  #endif
  5141      *pRes = 0;               /* Honor the documented contract on this path too */
  5142      return SQLITE_OK;
  5143    }
  5144    rc = moveToRoot(pCur);
  5145    if( rc==SQLITE_OK ){
  5146      assert( pCur->eState==CURSOR_VALID );
  5147      *pRes = 0;
  5148      rc = moveToRightmost(pCur);
  5149      if( rc==SQLITE_OK ){
  5150        pCur->curFlags |= BTCF_AtLast;   /* Enables the shortcut above next time */
  5151      }else{
  5152        pCur->curFlags &= ~BTCF_AtLast;
  5153      }
  5154    }else if( rc==SQLITE_EMPTY ){
  5155      assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 );
  5156      *pRes = 1;
  5157      rc = SQLITE_OK;                    /* An empty table is not an error */
  5158    }
  5159    return rc;
  5160  }
  5161  
  5162  /* Move the cursor so that it points to an entry near the key 
  5163  ** specified by pIdxKey or intKey.   Return a success code.
  5164  **
  5165  ** For INTKEY tables, the intKey parameter is used.  pIdxKey 
  5166  ** must be NULL.  For index tables, pIdxKey is used and intKey
  5167  ** is ignored.
  5168  **
  5169  ** If an exact match is not found, then the cursor is always
  5170  ** left pointing at a leaf page which would hold the entry if it
  5171  ** were present.  The cursor might point to an entry that comes
  5172  ** before or after the key.
  5173  **
  5174  ** An integer is written into *pRes which is the result of
  5175  ** comparing the key with the entry to which the cursor is 
  5176  ** pointing.  The meaning of the integer written into
  5177  ** *pRes is as follows:
  5178  **
  5179  **     *pRes<0      The cursor is left pointing at an entry that
  5180  **                  is smaller than intKey/pIdxKey or if the table is empty
  5181  **                  and the cursor is therefore left pointing to nothing.
  5182  **
  5183  **     *pRes==0     The cursor is left pointing at an entry that
  5184  **                  exactly matches intKey/pIdxKey.
  5185  **
  5186  **     *pRes>0      The cursor is left pointing at an entry that
  5187  **                  is larger than intKey/pIdxKey.
  5188  **
  5189  ** For index tables, the pIdxKey->eqSeen field is set to 1 if there
  5190  ** exists an entry in the table that exactly matches pIdxKey.  
  5191  */
  5192  int sqlite3BtreeMovetoUnpacked(
  5193    BtCursor *pCur,          /* The cursor to be moved */
  5194    UnpackedRecord *pIdxKey, /* Unpacked index key */
  5195    i64 intKey,              /* The table key */
  5196    int biasRight,           /* If true, bias the search to the high end */
  5197    int *pRes                /* Write search results here */
  5198  ){
  5199    int rc;
  5200    RecordCompare xRecordCompare;
  5201  
  5202    assert( cursorOwnsBtShared(pCur) );
  5203    assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
  5204    assert( pRes );
  5205    assert( (pIdxKey==0)==(pCur->pKeyInfo==0) );
  5206    assert( pCur->eState!=CURSOR_VALID || (pIdxKey==0)==(pCur->curIntKey!=0) );
  5207  
  5208    /* If the cursor is already positioned at the point we are trying
  5209    ** to move to, then just return without doing any work */
  5210    if( pIdxKey==0
  5211     && pCur->eState==CURSOR_VALID && (pCur->curFlags & BTCF_ValidNKey)!=0
  5212    ){
  5213      if( pCur->info.nKey==intKey ){
  5214        *pRes = 0;
  5215        return SQLITE_OK;
  5216      }
  5217      if( pCur->info.nKey<intKey ){
  5218        if( (pCur->curFlags & BTCF_AtLast)!=0 ){
  5219          *pRes = -1;
  5220          return SQLITE_OK;
  5221        }
  5222        /* If the requested key is one more than the previous key, then
  5223        ** try to get there using sqlite3BtreeNext() rather than a full
  5224        ** binary search.  This is an optimization only.  The correct answer
  5225        ** is still obtained without this case, only a little more slowly */
  5226        if( pCur->info.nKey+1==intKey && !pCur->skipNext ){
  5227          *pRes = 0;
  5228          rc = sqlite3BtreeNext(pCur, 0);
  5229          if( rc==SQLITE_OK ){
  5230            getCellInfo(pCur);
  5231            if( pCur->info.nKey==intKey ){
  5232              return SQLITE_OK;
  5233            }
  5234          }else if( rc==SQLITE_DONE ){
  5235            rc = SQLITE_OK;
  5236          }else{
  5237            return rc;
  5238          }
  5239        }
  5240      }
  5241    }
  5242  
  5243    if( pIdxKey ){
  5244      xRecordCompare = sqlite3VdbeFindCompare(pIdxKey);
  5245      pIdxKey->errCode = 0;
  5246      assert( pIdxKey->default_rc==1 
  5247           || pIdxKey->default_rc==0 
  5248           || pIdxKey->default_rc==-1
  5249      );
  5250    }else{
  5251      xRecordCompare = 0; /* All keys are integers */
  5252    }
  5253  
  5254    rc = moveToRoot(pCur);
  5255    if( rc ){
  5256      if( rc==SQLITE_EMPTY ){
  5257        assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 );
  5258        *pRes = -1;
  5259        return SQLITE_OK;
  5260      }
  5261      return rc;
  5262    }
  5263    assert( pCur->pPage );
  5264    assert( pCur->pPage->isInit );
  5265    assert( pCur->eState==CURSOR_VALID );
  5266    assert( pCur->pPage->nCell > 0 );
  5267    assert( pCur->iPage==0 || pCur->apPage[0]->intKey==pCur->curIntKey );
  5268    assert( pCur->curIntKey || pIdxKey );
  5269    for(;;){
  5270      int lwr, upr, idx, c;
  5271      Pgno chldPg;
  5272      MemPage *pPage = pCur->pPage;
  5273      u8 *pCell;                          /* Pointer to current cell in pPage */
  5274  
  5275      /* pPage->nCell must be greater than zero. If this is the root-page
  5276      ** the cursor would have been INVALID above and this for(;;) loop
  5277      ** not run. If this is not the root-page, then the moveToChild() routine
  5278      ** would have already detected db corruption. Similarly, pPage must
  5279      ** be the right kind (index or table) of b-tree page. Otherwise
  5280      ** a moveToChild() or moveToRoot() call would have detected corruption.  */
  5281      assert( pPage->nCell>0 );
  5282      assert( pPage->intKey==(pIdxKey==0) );
  5283      lwr = 0;
  5284      upr = pPage->nCell-1;
  5285      assert( biasRight==0 || biasRight==1 );
  5286      idx = upr>>(1-biasRight); /* idx = biasRight ? upr : (lwr+upr)/2; */
  5287      pCur->ix = (u16)idx;
  5288      if( xRecordCompare==0 ){
  5289        for(;;){
  5290          i64 nCellKey;
  5291          pCell = findCellPastPtr(pPage, idx);
  5292          if( pPage->intKeyLeaf ){
  5293            while( 0x80 <= *(pCell++) ){
  5294              if( pCell>=pPage->aDataEnd ){
  5295                return SQLITE_CORRUPT_PGNO(pPage->pgno);
  5296              }
  5297            }
  5298          }
  5299          getVarint(pCell, (u64*)&nCellKey);
  5300          if( nCellKey<intKey ){
  5301            lwr = idx+1;
  5302            if( lwr>upr ){ c = -1; break; }
  5303          }else if( nCellKey>intKey ){
  5304            upr = idx-1;
  5305            if( lwr>upr ){ c = +1; break; }
  5306          }else{
  5307            assert( nCellKey==intKey );
  5308            pCur->ix = (u16)idx;
  5309            if( !pPage->leaf ){
  5310              lwr = idx;
  5311              goto moveto_next_layer;
  5312            }else{
  5313              pCur->curFlags |= BTCF_ValidNKey;
  5314              pCur->info.nKey = nCellKey;
  5315              pCur->info.nSize = 0;
  5316              *pRes = 0;
  5317              return SQLITE_OK;
  5318            }
  5319          }
  5320          assert( lwr+upr>=0 );
  5321          idx = (lwr+upr)>>1;  /* idx = (lwr+upr)/2; */
  5322        }
  5323      }else{
  5324        for(;;){
  5325          int nCell;  /* Size of the pCell cell in bytes */
  5326          pCell = findCellPastPtr(pPage, idx);
  5327  
  5328          /* The maximum supported page-size is 65536 bytes. This means that
  5329          ** the maximum number of record bytes stored on an index B-Tree
  5330          ** page is less than 16384 bytes and may be stored as a 2-byte
  5331          ** varint. This information is used to attempt to avoid parsing 
  5332          ** the entire cell by checking for the cases where the record is 
  5333          ** stored entirely within the b-tree page by inspecting the first 
  5334          ** 2 bytes of the cell.
  5335          */
  5336          nCell = pCell[0];
  5337          if( nCell<=pPage->max1bytePayload ){
  5338            /* This branch runs if the record-size field of the cell is a
  5339            ** single byte varint and the record fits entirely on the main
  5340            ** b-tree page.  */
  5341            testcase( pCell+nCell+1==pPage->aDataEnd );
  5342            c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey);
  5343          }else if( !(pCell[1] & 0x80) 
  5344            && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal
  5345          ){
  5346            /* The record-size field is a 2 byte varint and the record 
  5347            ** fits entirely on the main b-tree page.  */
  5348            testcase( pCell+nCell+2==pPage->aDataEnd );
  5349            c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey);
  5350          }else{
  5351            /* The record flows over onto one or more overflow pages. In
  5352            ** this case the whole cell needs to be parsed, a buffer allocated
  5353            ** and accessPayload() used to retrieve the record into the
  5354            ** buffer before VdbeRecordCompare() can be called. 
  5355            **
  5356            ** If the record is corrupt, the xRecordCompare routine may read
  5357            ** up to two varints past the end of the buffer. An extra 18 
  5358            ** bytes of padding is allocated at the end of the buffer in
  5359            ** case this happens.  */
  5360            void *pCellKey;
  5361            u8 * const pCellBody = pCell - pPage->childPtrSize;
  5362            pPage->xParseCell(pPage, pCellBody, &pCur->info);
  5363            nCell = (int)pCur->info.nKey;
  5364            testcase( nCell<0 );   /* True if key size is 2^32 or more */
  5365            testcase( nCell==0 );  /* Invalid key size:  0x80 0x80 0x00 */
  5366            testcase( nCell==1 );  /* Invalid key size:  0x80 0x80 0x01 */
  5367            testcase( nCell==2 );  /* Minimum legal index key size */
  5368            if( nCell<2 ){
  5369              rc = SQLITE_CORRUPT_PGNO(pPage->pgno);
  5370              goto moveto_finish;
  5371            }
  5372            pCellKey = sqlite3Malloc( nCell+18 );
  5373            if( pCellKey==0 ){
  5374              rc = SQLITE_NOMEM_BKPT;
  5375              goto moveto_finish;
  5376            }
  5377            pCur->ix = (u16)idx;
  5378            rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0);
  5379            pCur->curFlags &= ~BTCF_ValidOvfl;
  5380            if( rc ){
  5381              sqlite3_free(pCellKey);
  5382              goto moveto_finish;
  5383            }
  5384            c = xRecordCompare(nCell, pCellKey, pIdxKey);
  5385            sqlite3_free(pCellKey);
  5386          }
  5387          assert( 
  5388              (pIdxKey->errCode!=SQLITE_CORRUPT || c==0)
  5389           && (pIdxKey->errCode!=SQLITE_NOMEM || pCur->pBtree->db->mallocFailed)
  5390          );
  5391          if( c<0 ){
  5392            lwr = idx+1;
  5393          }else if( c>0 ){
  5394            upr = idx-1;
  5395          }else{
  5396            assert( c==0 );
  5397            *pRes = 0;
  5398            rc = SQLITE_OK;
  5399            pCur->ix = (u16)idx;
  5400            if( pIdxKey->errCode ) rc = SQLITE_CORRUPT_BKPT;
  5401            goto moveto_finish;
  5402          }
  5403          if( lwr>upr ) break;
  5404          assert( lwr+upr>=0 );
  5405          idx = (lwr+upr)>>1;  /* idx = (lwr+upr)/2 */
  5406        }
  5407      }
  5408      assert( lwr==upr+1 || (pPage->intKey && !pPage->leaf) );
  5409      assert( pPage->isInit );
  5410      if( pPage->leaf ){
  5411        assert( pCur->ix<pCur->pPage->nCell );
  5412        pCur->ix = (u16)idx;
  5413        *pRes = c;
  5414        rc = SQLITE_OK;
  5415        goto moveto_finish;
  5416      }
  5417  moveto_next_layer:
  5418      if( lwr>=pPage->nCell ){
  5419        chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
  5420      }else{
  5421        chldPg = get4byte(findCell(pPage, lwr));
  5422      }
  5423      pCur->ix = (u16)lwr;
  5424      rc = moveToChild(pCur, chldPg);
  5425      if( rc ) break;
  5426    }
  5427  moveto_finish:
  5428    pCur->info.nSize = 0;
  5429    assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
  5430    return rc;
  5431  }
  5432  
  5433  
  5434  /*
  5435  ** Return TRUE if the cursor is not pointing at an entry of the table.
  5436  **
  5437  ** TRUE will be returned after a call to sqlite3BtreeNext() moves
  5438  ** past the last entry in the table or sqlite3BtreePrev() moves past
  5439  ** the first entry.  TRUE is also returned if the table is empty.
  5440  */
  5441  int sqlite3BtreeEof(BtCursor *pCur){
  5442    /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
  5443    ** have been deleted? This API will need to change to return an error code
  5444    ** as well as the boolean result value.
  5445    */
  5446    return (CURSOR_VALID!=pCur->eState);
  5447  }
  5448  
  5449  /*
  5450  ** Return an estimate for the number of rows in the table that pCur is
  5451  ** pointing to.  Return a negative number if no estimate is currently 
  5452  ** available.
  5453  */
  5454  i64 sqlite3BtreeRowCountEst(BtCursor *pCur){
  5455    i64 n;
  5456    u8 i;
  5457  
  5458    assert( cursorOwnsBtShared(pCur) );
  5459    assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
  5460  
  5461    /* Currently this interface is only called by the OP_IfSmaller
  5462    ** opcode, and it that case the cursor will always be valid and
  5463    ** will always point to a leaf node. */
  5464    if( NEVER(pCur->eState!=CURSOR_VALID) ) return -1;
  5465    if( NEVER(pCur->pPage->leaf==0) ) return -1;
  5466  
  5467    n = pCur->pPage->nCell;
  5468    for(i=0; i<pCur->iPage; i++){
  5469      n *= pCur->apPage[i]->nCell;
  5470    }
  5471    return n;
  5472  }
  5473  
  5474  /*
  5475  ** Advance the cursor to the next entry in the database. 
  5476  ** Return value:
  5477  **
  5478  **    SQLITE_OK        success
  5479  **    SQLITE_DONE      cursor is already pointing at the last element
  5480  **    otherwise        some kind of error occurred
  5481  **
  5482  ** The main entry point is sqlite3BtreeNext().  That routine is optimized
  5483  ** for the common case of merely incrementing the cell counter BtCursor.aiIdx
  5484  ** to the next cell on the current page.  The (slower) btreeNext() helper
  5485  ** routine is called when it is necessary to move to a different page or
  5486  ** to restore the cursor.
  5487  **
  5488  ** If bit 0x01 of the F argument in sqlite3BtreeNext(C,F) is 1, then the
  5489  ** cursor corresponds to an SQL index and this routine could have been
  5490  ** skipped if the SQL index had been a unique index.  The F argument
  5491  ** is a hint to the implement.  SQLite btree implementation does not use
  5492  ** this hint, but COMDB2 does.
  5493  */
  5494  static SQLITE_NOINLINE int btreeNext(BtCursor *pCur){
  5495    int rc;
  5496    int idx;
  5497    MemPage *pPage;
  5498  
  5499    assert( cursorOwnsBtShared(pCur) );
  5500    assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
  5501    if( pCur->eState!=CURSOR_VALID ){
  5502      assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
  5503      rc = restoreCursorPosition(pCur);
  5504      if( rc!=SQLITE_OK ){
  5505        return rc;
  5506      }
  5507      if( CURSOR_INVALID==pCur->eState ){
  5508        return SQLITE_DONE;
  5509      }
  5510      if( pCur->skipNext ){
  5511        assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_SKIPNEXT );
  5512        pCur->eState = CURSOR_VALID;
  5513        if( pCur->skipNext>0 ){
  5514          pCur->skipNext = 0;
  5515          return SQLITE_OK;
  5516        }
  5517        pCur->skipNext = 0;
  5518      }
  5519    }
  5520  
  5521    pPage = pCur->pPage;
  5522    idx = ++pCur->ix;
  5523    assert( pPage->isInit );
  5524  
  5525    /* If the database file is corrupt, it is possible for the value of idx 
  5526    ** to be invalid here. This can only occur if a second cursor modifies
  5527    ** the page while cursor pCur is holding a reference to it. Which can
  5528    ** only happen if the database is corrupt in such a way as to link the
  5529    ** page into more than one b-tree structure. */
  5530    testcase( idx>pPage->nCell );
  5531  
  5532    if( idx>=pPage->nCell ){
  5533      if( !pPage->leaf ){
  5534        rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
  5535        if( rc ) return rc;
  5536        return moveToLeftmost(pCur);
  5537      }
  5538      do{
  5539        if( pCur->iPage==0 ){
  5540          pCur->eState = CURSOR_INVALID;
  5541          return SQLITE_DONE;
  5542        }
  5543        moveToParent(pCur);
  5544        pPage = pCur->pPage;
  5545      }while( pCur->ix>=pPage->nCell );
  5546      if( pPage->intKey ){
  5547        return sqlite3BtreeNext(pCur, 0);
  5548      }else{
  5549        return SQLITE_OK;
  5550      }
  5551    }
  5552    if( pPage->leaf ){
  5553      return SQLITE_OK;
  5554    }else{
  5555      return moveToLeftmost(pCur);
  5556    }
  5557  }
  5558  int sqlite3BtreeNext(BtCursor *pCur, int flags){
  5559    MemPage *pPage;
  5560    UNUSED_PARAMETER( flags );  /* Used in COMDB2 but not native SQLite */
  5561    assert( cursorOwnsBtShared(pCur) );
  5562    assert( flags==0 || flags==1 );
  5563    assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
  5564    pCur->info.nSize = 0;
  5565    pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
  5566    if( pCur->eState!=CURSOR_VALID ) return btreeNext(pCur);
  5567    pPage = pCur->pPage;
  5568    if( (++pCur->ix)>=pPage->nCell ){
  5569      pCur->ix--;
  5570      return btreeNext(pCur);
  5571    }
  5572    if( pPage->leaf ){
  5573      return SQLITE_OK;
  5574    }else{
  5575      return moveToLeftmost(pCur);
  5576    }
  5577  }
  5578  
  5579  /*
  5580  ** Step the cursor to the back to the previous entry in the database.
  5581  ** Return values:
  5582  **
  5583  **     SQLITE_OK     success
  5584  **     SQLITE_DONE   the cursor is already on the first element of the table
  5585  **     otherwise     some kind of error occurred
  5586  **
  5587  ** The main entry point is sqlite3BtreePrevious().  That routine is optimized
  5588  ** for the common case of merely decrementing the cell counter BtCursor.aiIdx
  5589  ** to the previous cell on the current page.  The (slower) btreePrevious()
  5590  ** helper routine is called when it is necessary to move to a different page
  5591  ** or to restore the cursor.
  5592  **
  5593  ** If bit 0x01 of the F argument to sqlite3BtreePrevious(C,F) is 1, then
  5594  ** the cursor corresponds to an SQL index and this routine could have been
  5595  ** skipped if the SQL index had been a unique index.  The F argument is a
  5596  ** hint to the implement.  The native SQLite btree implementation does not
  5597  ** use this hint, but COMDB2 does.
  5598  */
  5599  static SQLITE_NOINLINE int btreePrevious(BtCursor *pCur){
  5600    int rc;
  5601    MemPage *pPage;
  5602  
  5603    assert( cursorOwnsBtShared(pCur) );
  5604    assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
  5605    assert( (pCur->curFlags & (BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey))==0 );
  5606    assert( pCur->info.nSize==0 );
  5607    if( pCur->eState!=CURSOR_VALID ){
  5608      rc = restoreCursorPosition(pCur);
  5609      if( rc!=SQLITE_OK ){
  5610        return rc;
  5611      }
  5612      if( CURSOR_INVALID==pCur->eState ){
  5613        return SQLITE_DONE;
  5614      }
  5615      if( pCur->skipNext ){
  5616        assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_SKIPNEXT );
  5617        pCur->eState = CURSOR_VALID;
  5618        if( pCur->skipNext<0 ){
  5619          pCur->skipNext = 0;
  5620          return SQLITE_OK;
  5621        }
  5622        pCur->skipNext = 0;
  5623      }
  5624    }
  5625  
  5626    pPage = pCur->pPage;
  5627    assert( pPage->isInit );
  5628    if( !pPage->leaf ){
  5629      int idx = pCur->ix;
  5630      rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
  5631      if( rc ) return rc;
  5632      rc = moveToRightmost(pCur);
  5633    }else{
  5634      while( pCur->ix==0 ){
  5635        if( pCur->iPage==0 ){
  5636          pCur->eState = CURSOR_INVALID;
  5637          return SQLITE_DONE;
  5638        }
  5639        moveToParent(pCur);
  5640      }
  5641      assert( pCur->info.nSize==0 );
  5642      assert( (pCur->curFlags & (BTCF_ValidOvfl))==0 );
  5643  
  5644      pCur->ix--;
  5645      pPage = pCur->pPage;
  5646      if( pPage->intKey && !pPage->leaf ){
  5647        rc = sqlite3BtreePrevious(pCur, 0);
  5648      }else{
  5649        rc = SQLITE_OK;
  5650      }
  5651    }
  5652    return rc;
  5653  }
  5654  int sqlite3BtreePrevious(BtCursor *pCur, int flags){
  5655    assert( cursorOwnsBtShared(pCur) );
  5656    assert( flags==0 || flags==1 );
  5657    assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
  5658    UNUSED_PARAMETER( flags );  /* Used in COMDB2 but not native SQLite */
  5659    pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey);
  5660    pCur->info.nSize = 0;
  5661    if( pCur->eState!=CURSOR_VALID
  5662     || pCur->ix==0
  5663     || pCur->pPage->leaf==0
  5664    ){
  5665      return btreePrevious(pCur);
  5666    }
  5667    pCur->ix--;
  5668    return SQLITE_OK;
  5669  }
  5670  
  5671  /*
  5672  ** Allocate a new page from the database file.
  5673  **
  5674  ** The new page is marked as dirty.  (In other words, sqlite3PagerWrite()
  5675  ** has already been called on the new page.)  The new page has also
  5676  ** been referenced and the calling routine is responsible for calling
  5677  ** sqlite3PagerUnref() on the new page when it is done.
  5678  **
  5679  ** SQLITE_OK is returned on success.  Any other return value indicates
  5680  ** an error.  *ppPage is set to NULL in the event of an error.
  5681  **
  5682  ** If the "nearby" parameter is not 0, then an effort is made to
  5683  ** locate a page close to the page number "nearby".  This can be used in an
  5684  ** attempt to keep related pages close to each other in the database file,
  5685  ** which in turn can make database access faster.
  5686  **
  5687  ** If the eMode parameter is BTALLOC_EXACT and the nearby page exists
  5688  ** anywhere on the free-list, then it is guaranteed to be returned.  If
  5689  ** eMode is BTALLOC_LE then the page returned will be less than or equal
  5690  ** to nearby if any such page exists.  If eMode is BTALLOC_ANY then there
  5691  ** are no restrictions on which page is returned.
  5692  */
  5693  static int allocateBtreePage(
  5694    BtShared *pBt,         /* The btree */
  5695    MemPage **ppPage,      /* Store pointer to the allocated page here */
  5696    Pgno *pPgno,           /* Store the page number here */
  5697    Pgno nearby,           /* Search for a page near this one */
  5698    u8 eMode               /* BTALLOC_EXACT, BTALLOC_LE, or BTALLOC_ANY */
  5699  ){
  5700    MemPage *pPage1;
  5701    int rc;
  5702    u32 n;     /* Number of pages on the freelist */
  5703    u32 k;     /* Number of leaves on the trunk of the freelist */
  5704    MemPage *pTrunk = 0;
  5705    MemPage *pPrevTrunk = 0;
  5706    Pgno mxPage;     /* Total size of the database file */
  5707  
  5708    assert( sqlite3_mutex_held(pBt->mutex) );
  5709    assert( eMode==BTALLOC_ANY || (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) );
  5710    pPage1 = pBt->pPage1;
  5711    mxPage = btreePagecount(pBt);
  5712    /* EVIDENCE-OF: R-05119-02637 The 4-byte big-endian integer at offset 36
  5713    ** stores the total number of pages on the freelist. */
  5714    n = get4byte(&pPage1->aData[36]);
  5715    testcase( n==mxPage-1 );
  5716    if( n>=mxPage ){
  5717      return SQLITE_CORRUPT_BKPT;
  5718    }
  5719    if( n>0 ){
  5720      /* There are pages on the freelist.  Reuse one of those pages. */
  5721      Pgno iTrunk;
  5722      u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
  5723      u32 nSearch = 0;   /* Count of the number of search attempts */
  5724      
  5725      /* If eMode==BTALLOC_EXACT and a query of the pointer-map
  5726      ** shows that the page 'nearby' is somewhere on the free-list, then
  5727      ** the entire-list will be searched for that page.
  5728      */
  5729  #ifndef SQLITE_OMIT_AUTOVACUUM
  5730      if( eMode==BTALLOC_EXACT ){
  5731        if( nearby<=mxPage ){
  5732          u8 eType;
  5733          assert( nearby>0 );
  5734          assert( pBt->autoVacuum );
  5735          rc = ptrmapGet(pBt, nearby, &eType, 0);
  5736          if( rc ) return rc;
  5737          if( eType==PTRMAP_FREEPAGE ){
  5738            searchList = 1;
  5739          }
  5740        }
  5741      }else if( eMode==BTALLOC_LE ){
  5742        searchList = 1;
  5743      }
  5744  #endif
  5745  
  5746      /* Decrement the free-list count by 1. The loop below sets iTrunk to the
  5747      ** first free-list trunk page on its first pass (pPrevTrunk is NULL).
  5748      */
  5749      rc = sqlite3PagerWrite(pPage1->pDbPage);
  5750      if( rc ) return rc;
  5751      put4byte(&pPage1->aData[36], n-1);
  5752  
  5753      /* The code within this loop is run only once if the 'searchList' variable
  5754      ** is not true. Otherwise, it runs once for each trunk-page on the
  5755      ** free-list until the page 'nearby' is located (eMode==BTALLOC_EXACT)
  5756      ** or until a page not greater than 'nearby' is located (eMode==BTALLOC_LE)
  5757      */
  5758      do {
  5759        pPrevTrunk = pTrunk;
  5760        if( pPrevTrunk ){
  5761          /* EVIDENCE-OF: R-01506-11053 The first integer on a freelist trunk page
  5762          ** is the page number of the next freelist trunk page in the list or
  5763          ** zero if this is the last freelist trunk page. */
  5764          iTrunk = get4byte(&pPrevTrunk->aData[0]);
  5765        }else{
  5766          /* EVIDENCE-OF: R-59841-13798 The 4-byte big-endian integer at offset 32
  5767          ** stores the page number of the first page of the freelist, or zero if
  5768          ** the freelist is empty. */
  5769          iTrunk = get4byte(&pPage1->aData[32]);
  5770        }
  5771        testcase( iTrunk==mxPage );
  5772        if( iTrunk>mxPage || nSearch++ > n ){
  5773          rc = SQLITE_CORRUPT_PGNO(pPrevTrunk ? pPrevTrunk->pgno : 1);
  5774        }else{
  5775          rc = btreeGetUnusedPage(pBt, iTrunk, &pTrunk, 0);
  5776        }
  5777        if( rc ){
  5778          pTrunk = 0;
  5779          goto end_allocate_page;
  5780        }
  5781        assert( pTrunk!=0 );
  5782        assert( pTrunk->aData!=0 );
  5783        /* EVIDENCE-OF: R-13523-04394 The second integer on a freelist trunk page
  5784        ** is the number of leaf page pointers to follow. */
  5785        k = get4byte(&pTrunk->aData[4]);
  5786        if( k==0 && !searchList ){
  5787          /* The trunk has no leaves and the list is not being searched. 
  5788          ** So extract the trunk page itself and use it as the newly 
  5789          ** allocated page */
  5790          assert( pPrevTrunk==0 );
  5791          rc = sqlite3PagerWrite(pTrunk->pDbPage);
  5792          if( rc ){
  5793            goto end_allocate_page;
  5794          }
  5795          *pPgno = iTrunk;
  5796          memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
  5797          *ppPage = pTrunk;
  5798          pTrunk = 0;
  5799          TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
  5800        }else if( k>(u32)(pBt->usableSize/4 - 2) ){
  5801          /* Value of k is out of range.  Database corruption */
  5802          rc = SQLITE_CORRUPT_PGNO(iTrunk);
  5803          goto end_allocate_page;
  5804  #ifndef SQLITE_OMIT_AUTOVACUUM
  5805        }else if( searchList 
  5806              && (nearby==iTrunk || (iTrunk<nearby && eMode==BTALLOC_LE)) 
  5807        ){
  5808          /* The list is being searched and this trunk page is the page
  5809          ** to allocate, regardless of whether it has leaves.
                **
                ** NOTE(review): *ppPage is assigned before the fallible calls
                ** below.  On their error paths control jumps to end_allocate_page,
                ** which releases pTrunk while *ppPage still refers to it; confirm
                ** that callers ignore *ppPage whenever rc!=SQLITE_OK.
  5810          */
  5811          *pPgno = iTrunk;
  5812          *ppPage = pTrunk;
  5813          searchList = 0;
  5814          rc = sqlite3PagerWrite(pTrunk->pDbPage);
  5815          if( rc ){
  5816            goto end_allocate_page;
  5817          }
  5818          if( k==0 ){
  5819            if( !pPrevTrunk ){
  5820              memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
  5821            }else{
  5822              rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
  5823              if( rc!=SQLITE_OK ){
  5824                goto end_allocate_page;
  5825              }
  5826              memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
  5827            }
  5828          }else{
  5829            /* The trunk page is required by the caller but it contains 
  5830            ** pointers to free-list leaves. The first leaf becomes a trunk
  5831            ** page in this case.
  5832            */
  5833            MemPage *pNewTrunk;
  5834            Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
  5835            if( iNewTrunk>mxPage ){ 
  5836              rc = SQLITE_CORRUPT_PGNO(iTrunk);
  5837              goto end_allocate_page;
  5838            }
  5839            testcase( iNewTrunk==mxPage );
  5840            rc = btreeGetUnusedPage(pBt, iNewTrunk, &pNewTrunk, 0);
  5841            if( rc!=SQLITE_OK ){
  5842              goto end_allocate_page;
  5843            }
  5844            rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
  5845            if( rc!=SQLITE_OK ){
  5846              releasePage(pNewTrunk);
  5847              goto end_allocate_page;
  5848            }
  5849            memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
  5850            put4byte(&pNewTrunk->aData[4], k-1);
  5851            memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
  5852            releasePage(pNewTrunk);
  5853            if( !pPrevTrunk ){
  5854              assert( sqlite3PagerIswriteable(pPage1->pDbPage) );
  5855              put4byte(&pPage1->aData[32], iNewTrunk);
  5856            }else{
  5857              rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
  5858              if( rc ){
  5859                goto end_allocate_page;
  5860              }
  5861              put4byte(&pPrevTrunk->aData[0], iNewTrunk);
  5862            }
  5863          }
  5864          pTrunk = 0;
  5865          TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
  5866  #endif
  5867        }else if( k>0 ){
  5868          /* Extract a leaf from the trunk */
  5869          u32 closest;
  5870          Pgno iPage;
  5871          unsigned char *aData = pTrunk->aData;
  5872          if( nearby>0 ){
  5873            u32 i;
  5874            closest = 0;
  5875            if( eMode==BTALLOC_LE ){
  5876              for(i=0; i<k; i++){
  5877                iPage = get4byte(&aData[8+i*4]);
  5878                if( iPage<=nearby ){
  5879                  closest = i;
  5880                  break;
  5881                }
  5882              }
  5883            }else{
  5884              int dist;
  5885              dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby);
  5886              for(i=1; i<k; i++){
  5887                int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby);
  5888                if( d2<dist ){
  5889                  closest = i;
  5890                  dist = d2;
  5891                }
  5892              }
  5893            }
  5894          }else{
  5895            closest = 0;
  5896          }
  5897  
  5898          iPage = get4byte(&aData[8+closest*4]);
  5899          testcase( iPage==mxPage );
  5900          if( iPage>mxPage ){
  5901            rc = SQLITE_CORRUPT_PGNO(iTrunk);
  5902            goto end_allocate_page;
  5903          }
  5904          testcase( iPage==mxPage );
  5905          if( !searchList 
  5906           || (iPage==nearby || (iPage<nearby && eMode==BTALLOC_LE)) 
  5907          ){
  5908            int noContent;
  5909            *pPgno = iPage;
  5910            TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
  5911                   ": %d more free pages\n",
  5912                   *pPgno, closest+1, k, pTrunk->pgno, n-1));
  5913            rc = sqlite3PagerWrite(pTrunk->pDbPage);
  5914            if( rc ) goto end_allocate_page;
  5915            if( closest<k-1 ){
  5916              memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
  5917            }
  5918            put4byte(&aData[4], k-1);
  5919            noContent = !btreeGetHasContent(pBt, *pPgno)? PAGER_GET_NOCONTENT : 0;
  5920            rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, noContent);
  5921            if( rc==SQLITE_OK ){
  5922              rc = sqlite3PagerWrite((*ppPage)->pDbPage);
  5923              if( rc!=SQLITE_OK ){
  5924                releasePage(*ppPage);
  5925                *ppPage = 0;
  5926              }
  5927            }
  5928            searchList = 0;
  5929          }
  5930        }
  5931        releasePage(pPrevTrunk);
  5932        pPrevTrunk = 0;
  5933      }while( searchList );
  5934    }else{
  5935      /* There are no pages on the freelist, so append a new page to the
  5936      ** database image.
  5937      **
  5938      ** Normally, new pages allocated by this block can be requested from the
  5939      ** pager layer with the 'no-content' flag set. This prevents the pager
  5940      ** from trying to read the pages content from disk. However, if the
  5941      ** current transaction has already run one or more incremental-vacuum
  5942      ** steps, then the page we are about to allocate may contain content
  5943      ** that is required in the event of a rollback. In this case, do
  5944      ** not set the no-content flag. This causes the pager to load and journal
  5945      ** the current page content before overwriting it.
  5946      **
  5947      ** Note that the pager will not actually attempt to load or journal 
  5948      ** content for any page that really does lie past the end of the database
  5949      ** file on disk. So the effects of disabling the no-content optimization
  5950      ** here are confined to those pages that lie between the end of the
  5951      ** database image and the end of the database file.
  5952      */
  5953      int bNoContent = (0==IfNotOmitAV(pBt->bDoTruncate))? PAGER_GET_NOCONTENT:0;
  5954  
  5955      rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
  5956      if( rc ) return rc;
  5957      pBt->nPage++;
  5958      if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++;
  5959  
  5960  #ifndef SQLITE_OMIT_AUTOVACUUM
  5961      if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){
  5962        /* If pBt->nPage is now a pointer-map page, allocate two new pages
  5963        ** at the end of the file instead of one. The first allocated page
  5964        ** becomes a new pointer-map page, the second is used by the caller.
  5965        */
  5966        MemPage *pPg = 0;
  5967        TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", pBt->nPage));
  5968        assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) );
  5969        rc = btreeGetUnusedPage(pBt, pBt->nPage, &pPg, bNoContent);
  5970        if( rc==SQLITE_OK ){
  5971          rc = sqlite3PagerWrite(pPg->pDbPage);
  5972          releasePage(pPg);
  5973        }
  5974        if( rc ) return rc;
  5975        pBt->nPage++;
  5976        if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; }
  5977      }
  5978  #endif
  5979      put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage);
  5980      *pPgno = pBt->nPage;
  5981  
  5982      assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
  5983      rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, bNoContent);
  5984      if( rc ) return rc;
  5985      rc = sqlite3PagerWrite((*ppPage)->pDbPage);
  5986      if( rc!=SQLITE_OK ){
  5987        releasePage(*ppPage);
  5988        *ppPage = 0;
  5989      }
  5990      TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
  5991    }
  5992  
  5993    assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
  5994  
  5995  end_allocate_page:
  5996    releasePage(pTrunk);
  5997    releasePage(pPrevTrunk);
  5998    assert( rc!=SQLITE_OK || sqlite3PagerPageRefcount((*ppPage)->pDbPage)<=1 );
  5999    assert( rc!=SQLITE_OK || (*ppPage)->isInit==0 );
  6000    return rc;
  6001  }
  6002  
  6003  /*
  6004  ** This function is used to add page iPage to the database file free-list. 
  6005  ** It is assumed that the page is not already a part of the free-list.
  6006  **
  6007  ** The value passed as the second argument to this function is optional.
  6008  ** If the caller happens to have a pointer to the MemPage object 
  6009  ** corresponding to page iPage handy, it may pass it as the second value. 
  6010  ** Otherwise, it may pass NULL.
  6011  **
  6012  ** If a pointer to a MemPage object is passed as the second argument,
  6013  ** its reference count is not altered by this function.
  6014  */
  6015  static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){
  6016    MemPage *pTrunk = 0;                /* Free-list trunk page */
  6017    Pgno iTrunk = 0;                    /* Page number of free-list trunk page */ 
  6018    MemPage *pPage1 = pBt->pPage1;      /* Local reference to page 1 */
  6019    MemPage *pPage;                     /* Page being freed. May be NULL. */
  6020    int rc;                             /* Return Code */
  6021    int nFree;                          /* Initial number of pages on free-list */
  6022  
  6023    assert( sqlite3_mutex_held(pBt->mutex) );
  6024    assert( CORRUPT_DB || iPage>1 );
  6025    assert( !pMemPage || pMemPage->pgno==iPage );
  6026  
  6027    if( iPage<2 ) return SQLITE_CORRUPT_BKPT;
  6028    if( pMemPage ){
  6029      pPage = pMemPage;
  6030      sqlite3PagerRef(pPage->pDbPage);
  6031    }else{
  6032      pPage = btreePageLookup(pBt, iPage);
  6033    }
  6034  
  6035    /* Increment the free page count on pPage1 */
  6036    rc = sqlite3PagerWrite(pPage1->pDbPage);
  6037    if( rc ) goto freepage_out;
  6038    nFree = get4byte(&pPage1->aData[36]);
  6039    put4byte(&pPage1->aData[36], nFree+1);
  6040  
  6041    if( pBt->btsFlags & BTS_SECURE_DELETE ){
  6042      /* If the secure_delete option is enabled, then
  6043      ** always fully overwrite deleted information with zeros.
  6044      */
  6045      if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) )
  6046       ||            ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0)
  6047      ){
  6048        goto freepage_out;
  6049      }
  6050      memset(pPage->aData, 0, pPage->pBt->pageSize);
  6051    }
  6052  
  6053    /* If the database supports auto-vacuum, write an entry in the pointer-map
  6054    ** to indicate that the page is free.
  6055    */
  6056    if( ISAUTOVACUUM ){
  6057      ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc);
  6058      if( rc ) goto freepage_out;
  6059    }
  6060  
  6061    /* Now manipulate the actual database free-list structure. There are two
  6062    ** possibilities. If the free-list is currently empty, or if the first
  6063    ** trunk page in the free-list is full, then this page will become a
  6064    ** new free-list trunk page. Otherwise, it will become a leaf of the
  6065    ** first trunk page in the current free-list. This block tests if it
  6066    ** is possible to add the page as a new free-list leaf.
  6067    */
  6068    if( nFree!=0 ){
  6069      u32 nLeaf;                /* Initial number of leaf cells on trunk page */
  6070  
  6071      iTrunk = get4byte(&pPage1->aData[32]);
  6072      rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
  6073      if( rc!=SQLITE_OK ){
  6074        goto freepage_out;
  6075      }
  6076  
  6077      nLeaf = get4byte(&pTrunk->aData[4]);
  6078      assert( pBt->usableSize>32 );
  6079      if( nLeaf > (u32)pBt->usableSize/4 - 2 ){
  6080        rc = SQLITE_CORRUPT_BKPT;
  6081        goto freepage_out;
  6082      }
  6083      if( nLeaf < (u32)pBt->usableSize/4 - 8 ){
  6084        /* In this case there is room on the trunk page to insert the page
  6085        ** being freed as a new leaf.
  6086        **
  6087        ** Note that the trunk page is not really full until it contains
  6088        ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
  6089        ** coded.  But due to a coding error in versions of SQLite prior to
  6090        ** 3.6.0, databases with freelist trunk pages holding more than
  6091        ** usableSize/4 - 8 entries will be reported as corrupt.  In order
  6092        ** to maintain backwards compatibility with older versions of SQLite,
  6093        ** we will continue to restrict the number of entries to usableSize/4 - 8
  6094        ** for now.  At some point in the future (once everyone has upgraded
  6095        ** to 3.6.0 or later) we should consider fixing the conditional above
  6096        ** to read "usableSize/4-2" instead of "usableSize/4-8".
  6097        **
  6098        ** EVIDENCE-OF: R-19920-11576 However, newer versions of SQLite still
  6099        ** avoid using the last six entries in the freelist trunk page array in
  6100        ** order that database files created by newer versions of SQLite can be
  6101        ** read by older versions of SQLite.
  6102        */
  6103        rc = sqlite3PagerWrite(pTrunk->pDbPage);
  6104        if( rc==SQLITE_OK ){
  6105          put4byte(&pTrunk->aData[4], nLeaf+1);
  6106          put4byte(&pTrunk->aData[8+nLeaf*4], iPage);
  6107          if( pPage && (pBt->btsFlags & BTS_SECURE_DELETE)==0 ){
  6108            sqlite3PagerDontWrite(pPage->pDbPage);
  6109          }
  6110          rc = btreeSetHasContent(pBt, iPage);
  6111        }
  6112        TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
  6113        goto freepage_out;
  6114      }
  6115    }
  6116  
  6117    /* If control flows to this point, then it was not possible to add the
  6118    ** the page being freed as a leaf page of the first trunk in the free-list.
  6119    ** Possibly because the free-list is empty, or possibly because the 
  6120    ** first trunk in the free-list is full. Either way, the page being freed
  6121    ** will become the new first trunk page in the free-list.
  6122    */
  6123    if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){
  6124      goto freepage_out;
  6125    }
  6126    rc = sqlite3PagerWrite(pPage->pDbPage);
  6127    if( rc!=SQLITE_OK ){
  6128      goto freepage_out;
  6129    }
  6130    put4byte(pPage->aData, iTrunk);
  6131    put4byte(&pPage->aData[4], 0);
  6132    put4byte(&pPage1->aData[32], iPage);
  6133    TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk));
  6134  
  6135  freepage_out:
  6136    if( pPage ){
  6137      pPage->isInit = 0;
  6138    }
  6139    releasePage(pPage);
  6140    releasePage(pTrunk);
  6141    return rc;
  6142  }
  6143  static void freePage(MemPage *pPage, int *pRC){
  6144    if( (*pRC)==SQLITE_OK ){
  6145      *pRC = freePage2(pPage->pBt, pPage, pPage->pgno);
  6146    }
  6147  }
  6148  
  6149  /*
  6150  ** Free any overflow pages associated with the given Cell.  Write the
  6151  ** local Cell size (the number of bytes on the original page, omitting
  6152  ** overflow) into *pnSize.
  6153  */
  6154  static int clearCell(
  6155    MemPage *pPage,          /* The page that contains the Cell */
  6156    unsigned char *pCell,    /* First byte of the Cell */
  6157    CellInfo *pInfo          /* Size information about the cell */
  6158  ){
  6159    BtShared *pBt;
  6160    Pgno ovflPgno;
  6161    int rc;
  6162    int nOvfl;
  6163    u32 ovflPageSize;
  6164  
  6165    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  6166    pPage->xParseCell(pPage, pCell, pInfo);
  6167    if( pInfo->nLocal==pInfo->nPayload ){
  6168      return SQLITE_OK;  /* No overflow pages. Return without doing anything */
  6169    }
  6170    if( pCell+pInfo->nSize-1 > pPage->aData+pPage->maskPage ){
  6171      /* Cell extends past end of page */
  6172      return SQLITE_CORRUPT_PGNO(pPage->pgno);
  6173    }
  6174    ovflPgno = get4byte(pCell + pInfo->nSize - 4);
  6175    pBt = pPage->pBt;
  6176    assert( pBt->usableSize > 4 );
  6177    ovflPageSize = pBt->usableSize - 4;
  6178    nOvfl = (pInfo->nPayload - pInfo->nLocal + ovflPageSize - 1)/ovflPageSize;
  6179    assert( nOvfl>0 || 
  6180      (CORRUPT_DB && (pInfo->nPayload + ovflPageSize)<ovflPageSize)
  6181    );
  6182    while( nOvfl-- ){
  6183      Pgno iNext = 0;
  6184      MemPage *pOvfl = 0;
  6185      if( ovflPgno<2 || ovflPgno>btreePagecount(pBt) ){
  6186        /* 0 is not a legal page number and page 1 cannot be an 
  6187        ** overflow page. Therefore if ovflPgno<2 or past the end of the 
  6188        ** file the database must be corrupt. */
  6189        return SQLITE_CORRUPT_BKPT;
  6190      }
  6191      if( nOvfl ){
  6192        rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext);
  6193        if( rc ) return rc;
  6194      }
  6195  
  6196      if( ( pOvfl || ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) )
  6197       && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1
  6198      ){
  6199        /* There is no reason any cursor should have an outstanding reference 
  6200        ** to an overflow page belonging to a cell that is being deleted/updated.
  6201        ** So if there exists more than one reference to this page, then it 
  6202        ** must not really be an overflow page and the database must be corrupt. 
  6203        ** It is helpful to detect this before calling freePage2(), as 
  6204        ** freePage2() may zero the page contents if secure-delete mode is
  6205        ** enabled. If this 'overflow' page happens to be a page that the
  6206        ** caller is iterating through or using in some other way, this
  6207        ** can be problematic.
  6208        */
  6209        rc = SQLITE_CORRUPT_BKPT;
  6210      }else{
  6211        rc = freePage2(pBt, pOvfl, ovflPgno);
  6212      }
  6213  
  6214      if( pOvfl ){
  6215        sqlite3PagerUnref(pOvfl->pDbPage);
  6216      }
  6217      if( rc ) return rc;
  6218      ovflPgno = iNext;
  6219    }
  6220    return SQLITE_OK;
  6221  }
  6222  
/*
** Create the byte sequence used to represent a cell on page pPage
** and write that byte sequence into pCell[].  Overflow pages are
** allocated and filled in as necessary.  The calling procedure
** is responsible for making sure sufficient space has been allocated
** for pCell[].
**
** Note that pCell does not necessarily need to point to the pPage->aData
** area.  pCell might point to some temporary storage.  The cell will
** be constructed in this temporary area then copied into pPage->aData
** later.
**
** On success SQLITE_OK is returned and *pnSize is set to the number of
** bytes of pCell[] that were used (header, local payload and, if the
** payload spilled, the 4-byte overflow page number).  If allocating an
** overflow page or writing its pointer-map entry fails, that error code
** is returned instead.
*/
static int fillInCell(
  MemPage *pPage,                /* The page that contains the cell */
  unsigned char *pCell,          /* Complete text of the cell */
  const BtreePayload *pX,        /* Payload with which to construct the cell */
  int *pnSize                    /* Write cell size here */
){
  int nPayload;                  /* Payload bytes still to be written */
  const u8 *pSrc;                /* Next byte of source content to copy */
  int nSrc, n, rc, mn;
  int spaceLeft;                 /* Bytes still available at pPayload */
  MemPage *pToRelease;           /* Overflow page currently being filled */
  unsigned char *pPrior;         /* Where to write the next overflow pgno */
  unsigned char *pPayload;       /* Where to write the next payload byte */
  BtShared *pBt;
  Pgno pgnoOvfl;                 /* Most recently allocated overflow page */
  int nHeader;                   /* Size of the cell header in bytes */

  assert( sqlite3_mutex_held(pPage->pBt->mutex) );

  /* pPage is not necessarily writeable since pCell might be auxiliary
  ** buffer space that is separate from the pPage buffer area */
  assert( pCell<pPage->aData || pCell>=&pPage->aData[pPage->pBt->pageSize]
            || sqlite3PagerIswriteable(pPage->pDbPage) );

  /* Fill in the header. */
  nHeader = pPage->childPtrSize;
  if( pPage->intKey ){
    /* Payload is pX->nData bytes of data followed by pX->nZero zero bytes */
    nPayload = pX->nData + pX->nZero;
    pSrc = pX->pData;
    nSrc = pX->nData;
    assert( pPage->intKeyLeaf ); /* fillInCell() only called for leaves */
    nHeader += putVarint32(&pCell[nHeader], nPayload);
    nHeader += putVarint(&pCell[nHeader], *(u64*)&pX->nKey);
  }else{
    assert( pX->nKey<=0x7fffffff && pX->pKey!=0 );
    nSrc = nPayload = (int)pX->nKey;
    pSrc = pX->pKey;
    nHeader += putVarint32(&pCell[nHeader], nPayload);
  }

  /* Fill in the payload */
  pPayload = &pCell[nHeader];
  if( nPayload<=pPage->maxLocal ){
    /* This is the common case where everything fits on the btree page
    ** and no overflow pages are required. */
    n = nHeader + nPayload;
    testcase( n==3 );
    testcase( n==4 );
    if( n<4 ) n = 4;             /* Reported cell size is never below 4 */
    *pnSize = n;
    assert( nSrc<=nPayload );
    testcase( nSrc<nPayload );
    memcpy(pPayload, pSrc, nSrc);
    memset(pPayload+nSrc, 0, nPayload-nSrc);
    return SQLITE_OK;
  }

  /* If we reach this point, it means that some of the content will need
  ** to spill onto overflow pages.
  */
  mn = pPage->minLocal;
  n = mn + (nPayload - mn) % (pPage->pBt->usableSize - 4);
  testcase( n==pPage->maxLocal );
  testcase( n==pPage->maxLocal+1 );
  if( n > pPage->maxLocal ) n = mn;
  spaceLeft = n;
  *pnSize = n + nHeader + 4;
  pPrior = &pCell[nHeader+n];
  pToRelease = 0;
  pgnoOvfl = 0;
  pBt = pPage->pBt;

  /* At this point variables should be set as follows:
  **
  **   nPayload           Total payload size in bytes
  **   pPayload           Begin writing payload here
  **   spaceLeft          Space available at pPayload.  If nPayload>spaceLeft,
  **                      that means content must spill into overflow pages.
  **   *pnSize            Size of the local cell (not counting overflow pages)
  **   pPrior             Where to write the pgno of the first overflow page
  **
  ** Use a call to btreeParseCellPtr() to verify that the values above
  ** were computed correctly.
  */
#ifdef SQLITE_DEBUG
  {
    CellInfo info;
    pPage->xParseCell(pPage, pCell, &info);
    assert( nHeader==(int)(info.pPayload - pCell) );
    assert( info.nKey==pX->nKey );
    assert( *pnSize == info.nSize );
    assert( spaceLeft == info.nLocal );
  }
#endif

  /* Write the payload into the local Cell and any extra into overflow pages */
  while( 1 ){
    n = nPayload;
    if( n>spaceLeft ) n = spaceLeft;

    /* If pToRelease is not zero then pPayload points into the data area
    ** of pToRelease.  Make sure pToRelease is still writeable. */
    assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );

    /* If pPayload is part of the data area of pPage, then make sure pPage
    ** is still writeable */
    assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize]
            || sqlite3PagerIswriteable(pPage->pDbPage) );

    /* Copy from the source while source bytes remain.  Once the source
    ** is exhausted, the remainder of the payload is written as zeros. */
    if( nSrc>=n ){
      memcpy(pPayload, pSrc, n);
    }else if( nSrc>0 ){
      n = nSrc;
      memcpy(pPayload, pSrc, n);
    }else{
      memset(pPayload, 0, n);
    }
    nPayload -= n;
    if( nPayload<=0 ) break;
    pPayload += n;
    pSrc += n;
    nSrc -= n;
    spaceLeft -= n;
    if( spaceLeft==0 ){
      /* The current destination is full: chain on a new overflow page */
      MemPage *pOvfl = 0;
#ifndef SQLITE_OMIT_AUTOVACUUM
      Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
      if( pBt->autoVacuum ){
        /* Advance past pointer-map pages and the pending-byte page */
        do{
          pgnoOvfl++;
        } while( 
          PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt) 
        );
      }
#endif
      rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
#ifndef SQLITE_OMIT_AUTOVACUUM
      /* If the database supports auto-vacuum, and the second or subsequent
      ** overflow page is being allocated, add an entry to the pointer-map
      ** for that page now. 
      **
      ** If this is the first overflow page, then write a partial entry 
      ** to the pointer-map. If we write nothing to this pointer-map slot,
      ** then the optimistic overflow chain processing in clearCell()
      ** may misinterpret the uninitialized values and delete the
      ** wrong pages from the database.
      */
      if( pBt->autoVacuum && rc==SQLITE_OK ){
        u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
        ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc);
        if( rc ){
          releasePage(pOvfl);
        }
      }
#endif
      if( rc ){
        releasePage(pToRelease);
        return rc;
      }

      /* If pToRelease is not zero then pPrior points into the data area
      ** of pToRelease.  Make sure pToRelease is still writeable. */
      assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );

      /* If pPrior is part of the data area of pPage, then make sure pPage
      ** is still writeable */
      assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize]
            || sqlite3PagerIswriteable(pPage->pDbPage) );

      /* Link the new page into the chain, then make it the destination:
      ** its first 4 bytes hold the next-page pointer (zero for now) and
      ** payload begins at byte 4. */
      put4byte(pPrior, pgnoOvfl);
      releasePage(pToRelease);
      pToRelease = pOvfl;
      pPrior = pOvfl->aData;
      put4byte(pPrior, 0);
      pPayload = &pOvfl->aData[4];
      spaceLeft = pBt->usableSize - 4;
    }
  }
  releasePage(pToRelease);
  return SQLITE_OK;
}
  6416  
  6417  /*
  6418  ** Remove the i-th cell from pPage.  This routine effects pPage only.
  6419  ** The cell content is not freed or deallocated.  It is assumed that
  6420  ** the cell content has been copied someplace else.  This routine just
  6421  ** removes the reference to the cell from pPage.
  6422  **
  6423  ** "sz" must be the number of bytes in the cell.
  6424  */
  6425  static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){
  6426    u32 pc;         /* Offset to cell content of cell being deleted */
  6427    u8 *data;       /* pPage->aData */
  6428    u8 *ptr;        /* Used to move bytes around within data[] */
  6429    int rc;         /* The return code */
  6430    int hdr;        /* Beginning of the header.  0 most pages.  100 page 1 */
  6431  
  6432    if( *pRC ) return;
  6433    assert( idx>=0 && idx<pPage->nCell );
  6434    assert( CORRUPT_DB || sz==cellSize(pPage, idx) );
  6435    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  6436    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  6437    data = pPage->aData;
  6438    ptr = &pPage->aCellIdx[2*idx];
  6439    pc = get2byte(ptr);
  6440    hdr = pPage->hdrOffset;
  6441    testcase( pc==get2byte(&data[hdr+5]) );
  6442    testcase( pc+sz==pPage->pBt->usableSize );
  6443    if( pc+sz > pPage->pBt->usableSize ){
  6444      *pRC = SQLITE_CORRUPT_BKPT;
  6445      return;
  6446    }
  6447    rc = freeSpace(pPage, pc, sz);
  6448    if( rc ){
  6449      *pRC = rc;
  6450      return;
  6451    }
  6452    pPage->nCell--;
  6453    if( pPage->nCell==0 ){
  6454      memset(&data[hdr+1], 0, 4);
  6455      data[hdr+7] = 0;
  6456      put2byte(&data[hdr+5], pPage->pBt->usableSize);
  6457      pPage->nFree = pPage->pBt->usableSize - pPage->hdrOffset
  6458                         - pPage->childPtrSize - 8;
  6459    }else{
  6460      memmove(ptr, ptr+2, 2*(pPage->nCell - idx));
  6461      put2byte(&data[hdr+3], pPage->nCell);
  6462      pPage->nFree += 2;
  6463    }
  6464  }
  6465  
  6466  /*
  6467  ** Insert a new cell on pPage at cell index "i".  pCell points to the
  6468  ** content of the cell.
  6469  **
  6470  ** If the cell content will fit on the page, then put it there.  If it
  6471  ** will not fit, then make a copy of the cell content into pTemp if
  6472  ** pTemp is not null.  Regardless of pTemp, allocate a new entry
  6473  ** in pPage->apOvfl[] and make it point to the cell content (either
  6474  ** in pTemp or the original pCell) and also record its index. 
  6475  ** Allocating a new entry in pPage->aCell[] implies that 
  6476  ** pPage->nOverflow is incremented.
  6477  **
  6478  ** *pRC must be SQLITE_OK when this routine is called.
  6479  */
  6480  static void insertCell(
  6481    MemPage *pPage,   /* Page into which we are copying */
  6482    int i,            /* New cell becomes the i-th cell of the page */
  6483    u8 *pCell,        /* Content of the new cell */
  6484    int sz,           /* Bytes of content in pCell */
  6485    u8 *pTemp,        /* Temp storage space for pCell, if needed */
  6486    Pgno iChild,      /* If non-zero, replace first 4 bytes with this value */
  6487    int *pRC          /* Read and write return code from here */
  6488  ){
  6489    int idx = 0;      /* Where to write new cell content in data[] */
  6490    int j;            /* Loop counter */
  6491    u8 *data;         /* The content of the whole page */
  6492    u8 *pIns;         /* The point in pPage->aCellIdx[] where no cell inserted */
  6493  
  6494    assert( *pRC==SQLITE_OK );
  6495    assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
  6496    assert( MX_CELL(pPage->pBt)<=10921 );
  6497    assert( pPage->nCell<=MX_CELL(pPage->pBt) || CORRUPT_DB );
  6498    assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) );
  6499    assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) );
  6500    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  6501    /* The cell should normally be sized correctly.  However, when moving a
  6502    ** malformed cell from a leaf page to an interior page, if the cell size
  6503    ** wanted to be less than 4 but got rounded up to 4 on the leaf, then size
  6504    ** might be less than 8 (leaf-size + pointer) on the interior node.  Hence
  6505    ** the term after the || in the following assert(). */
  6506    assert( sz==pPage->xCellSize(pPage, pCell) || (sz==8 && iChild>0) );
  6507    if( pPage->nOverflow || sz+2>pPage->nFree ){
  6508      if( pTemp ){
  6509        memcpy(pTemp, pCell, sz);
  6510        pCell = pTemp;
  6511      }
  6512      if( iChild ){
  6513        put4byte(pCell, iChild);
  6514      }
  6515      j = pPage->nOverflow++;
  6516      /* Comparison against ArraySize-1 since we hold back one extra slot
  6517      ** as a contingency.  In other words, never need more than 3 overflow
  6518      ** slots but 4 are allocated, just to be safe. */
  6519      assert( j < ArraySize(pPage->apOvfl)-1 );
  6520      pPage->apOvfl[j] = pCell;
  6521      pPage->aiOvfl[j] = (u16)i;
  6522  
  6523      /* When multiple overflows occur, they are always sequential and in
  6524      ** sorted order.  This invariants arise because multiple overflows can
  6525      ** only occur when inserting divider cells into the parent page during
  6526      ** balancing, and the dividers are adjacent and sorted.
  6527      */
  6528      assert( j==0 || pPage->aiOvfl[j-1]<(u16)i ); /* Overflows in sorted order */
  6529      assert( j==0 || i==pPage->aiOvfl[j-1]+1 );   /* Overflows are sequential */
  6530    }else{
  6531      int rc = sqlite3PagerWrite(pPage->pDbPage);
  6532      if( rc!=SQLITE_OK ){
  6533        *pRC = rc;
  6534        return;
  6535      }
  6536      assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  6537      data = pPage->aData;
  6538      assert( &data[pPage->cellOffset]==pPage->aCellIdx );
  6539      rc = allocateSpace(pPage, sz, &idx);
  6540      if( rc ){ *pRC = rc; return; }
  6541      /* The allocateSpace() routine guarantees the following properties
  6542      ** if it returns successfully */
  6543      assert( idx >= 0 );
  6544      assert( idx >= pPage->cellOffset+2*pPage->nCell+2 || CORRUPT_DB );
  6545      assert( idx+sz <= (int)pPage->pBt->usableSize );
  6546      pPage->nFree -= (u16)(2 + sz);
  6547      memcpy(&data[idx], pCell, sz);
  6548      if( iChild ){
  6549        put4byte(&data[idx], iChild);
  6550      }
  6551      pIns = pPage->aCellIdx + i*2;
  6552      memmove(pIns+2, pIns, 2*(pPage->nCell - i));
  6553      put2byte(pIns, idx);
  6554      pPage->nCell++;
  6555      /* increment the cell count */
  6556      if( (++data[pPage->hdrOffset+4])==0 ) data[pPage->hdrOffset+3]++;
  6557      assert( get2byte(&data[pPage->hdrOffset+3])==pPage->nCell );
  6558  #ifndef SQLITE_OMIT_AUTOVACUUM
  6559      if( pPage->pBt->autoVacuum ){
  6560        /* The cell may contain a pointer to an overflow page. If so, write
  6561        ** the entry for the overflow page into the pointer map.
  6562        */
  6563        ptrmapPutOvflPtr(pPage, pCell, pRC);
  6564      }
  6565  #endif
  6566    }
  6567  }
  6568  
/*
** A CellArray object contains a cache of pointers and sizes for a
** consecutive sequence of cells that might be held on multiple pages.
**
** An entry of zero in szCell[] means the size of that cell has not been
** computed yet; populateCellCache() and cachedCellSize() fill such
** entries in on demand using pRef->xCellSize().
*/
typedef struct CellArray CellArray;
struct CellArray {
  int nCell;              /* Number of cells in apCell[] */
  MemPage *pRef;          /* Reference page; its xCellSize() sizes the cells */
  u8 **apCell;            /* All cells being balanced */
  u16 *szCell;            /* Local size of all cells in apCell[]; 0 if unknown */
};
  6580  
  6581  /*
  6582  ** Make sure the cell sizes at idx, idx+1, ..., idx+N-1 have been
  6583  ** computed.
  6584  */
  6585  static void populateCellCache(CellArray *p, int idx, int N){
  6586    assert( idx>=0 && idx+N<=p->nCell );
  6587    while( N>0 ){
  6588      assert( p->apCell[idx]!=0 );
  6589      if( p->szCell[idx]==0 ){
  6590        p->szCell[idx] = p->pRef->xCellSize(p->pRef, p->apCell[idx]);
  6591      }else{
  6592        assert( CORRUPT_DB ||
  6593                p->szCell[idx]==p->pRef->xCellSize(p->pRef, p->apCell[idx]) );
  6594      }
  6595      idx++;
  6596      N--;
  6597    }
  6598  }
  6599  
  6600  /*
  6601  ** Return the size of the Nth element of the cell array
  6602  */
  6603  static SQLITE_NOINLINE u16 computeCellSize(CellArray *p, int N){
  6604    assert( N>=0 && N<p->nCell );
  6605    assert( p->szCell[N]==0 );
  6606    p->szCell[N] = p->pRef->xCellSize(p->pRef, p->apCell[N]);
  6607    return p->szCell[N];
  6608  }
  6609  static u16 cachedCellSize(CellArray *p, int N){
  6610    assert( N>=0 && N<p->nCell );
  6611    if( p->szCell[N] ) return p->szCell[N];
  6612    return computeCellSize(p, N);
  6613  }
  6614  
  6615  /*
  6616  ** Array apCell[] contains pointers to nCell b-tree page cells. The 
  6617  ** szCell[] array contains the size in bytes of each cell. This function
  6618  ** replaces the current contents of page pPg with the contents of the cell
  6619  ** array.
  6620  **
  6621  ** Some of the cells in apCell[] may currently be stored in pPg. This
  6622  ** function works around problems caused by this by making a copy of any 
  6623  ** such cells before overwriting the page data.
  6624  **
  6625  ** The MemPage.nFree field is invalidated by this function. It is the 
  6626  ** responsibility of the caller to set it correctly.
  6627  */
  6628  static int rebuildPage(
  6629    MemPage *pPg,                   /* Edit this page */
  6630    int nCell,                      /* Final number of cells on page */
  6631    u8 **apCell,                    /* Array of cells */
  6632    u16 *szCell                     /* Array of cell sizes */
  6633  ){
  6634    const int hdr = pPg->hdrOffset;          /* Offset of header on pPg */
  6635    u8 * const aData = pPg->aData;           /* Pointer to data for pPg */
  6636    const int usableSize = pPg->pBt->usableSize;
  6637    u8 * const pEnd = &aData[usableSize];
  6638    int i;
  6639    u8 *pCellptr = pPg->aCellIdx;
  6640    u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
  6641    u8 *pData;
  6642  
  6643    i = get2byte(&aData[hdr+5]);
  6644    memcpy(&pTmp[i], &aData[i], usableSize - i);
  6645  
  6646    pData = pEnd;
  6647    for(i=0; i<nCell; i++){
  6648      u8 *pCell = apCell[i];
  6649      if( SQLITE_WITHIN(pCell,aData,pEnd) ){
  6650        pCell = &pTmp[pCell - aData];
  6651      }
  6652      pData -= szCell[i];
  6653      put2byte(pCellptr, (pData - aData));
  6654      pCellptr += 2;
  6655      if( pData < pCellptr ) return SQLITE_CORRUPT_BKPT;
  6656      memcpy(pData, pCell, szCell[i]);
  6657      assert( szCell[i]==pPg->xCellSize(pPg, pCell) || CORRUPT_DB );
  6658      testcase( szCell[i]!=pPg->xCellSize(pPg,pCell) );
  6659    }
  6660  
  6661    /* The pPg->nFree field is now set incorrectly. The caller will fix it. */
  6662    pPg->nCell = nCell;
  6663    pPg->nOverflow = 0;
  6664  
  6665    put2byte(&aData[hdr+1], 0);
  6666    put2byte(&aData[hdr+3], pPg->nCell);
  6667    put2byte(&aData[hdr+5], pData - aData);
  6668    aData[hdr+7] = 0x00;
  6669    return SQLITE_OK;
  6670  }
  6671  
  6672  /*
  6673  ** Array apCell[] contains nCell pointers to b-tree cells. Array szCell
  6674  ** contains the size in bytes of each such cell. This function attempts to 
  6675  ** add the cells stored in the array to page pPg. If it cannot (because 
  6676  ** the page needs to be defragmented before the cells will fit), non-zero
  6677  ** is returned. Otherwise, if the cells are added successfully, zero is
  6678  ** returned.
  6679  **
  6680  ** Argument pCellptr points to the first entry in the cell-pointer array
  6681  ** (part of page pPg) to populate. After cell apCell[0] is written to the
  6682  ** page body, a 16-bit offset is written to pCellptr. And so on, for each
  6683  ** cell in the array. It is the responsibility of the caller to ensure
  6684  ** that it is safe to overwrite this part of the cell-pointer array.
  6685  **
  6686  ** When this function is called, *ppData points to the start of the 
  6687  ** content area on page pPg. If the size of the content area is extended,
  6688  ** *ppData is updated to point to the new start of the content area
  6689  ** before returning.
  6690  **
  6691  ** Finally, argument pBegin points to the byte immediately following the
  6692  ** end of the space required by this page for the cell-pointer area (for
  6693  ** all cells - not just those inserted by the current call). If the content
  6694  ** area must be extended to before this point in order to accomodate all
  6695  ** cells in apCell[], then the cells do not fit and non-zero is returned.
  6696  */
  6697  static int pageInsertArray(
  6698    MemPage *pPg,                   /* Page to add cells to */
  6699    u8 *pBegin,                     /* End of cell-pointer array */
  6700    u8 **ppData,                    /* IN/OUT: Page content -area pointer */
  6701    u8 *pCellptr,                   /* Pointer to cell-pointer area */
  6702    int iFirst,                     /* Index of first cell to add */
  6703    int nCell,                      /* Number of cells to add to pPg */
  6704    CellArray *pCArray              /* Array of cells */
  6705  ){
  6706    int i;
  6707    u8 *aData = pPg->aData;
  6708    u8 *pData = *ppData;
  6709    int iEnd = iFirst + nCell;
  6710    assert( CORRUPT_DB || pPg->hdrOffset==0 );    /* Never called on page 1 */
  6711    for(i=iFirst; i<iEnd; i++){
  6712      int sz, rc;
  6713      u8 *pSlot;
  6714      sz = cachedCellSize(pCArray, i);
  6715      if( (aData[1]==0 && aData[2]==0) || (pSlot = pageFindSlot(pPg,sz,&rc))==0 ){
  6716        if( (pData - pBegin)<sz ) return 1;
  6717        pData -= sz;
  6718        pSlot = pData;
  6719      }
  6720      /* pSlot and pCArray->apCell[i] will never overlap on a well-formed
  6721      ** database.  But they might for a corrupt database.  Hence use memmove()
  6722      ** since memcpy() sends SIGABORT with overlapping buffers on OpenBSD */
  6723      assert( (pSlot+sz)<=pCArray->apCell[i]
  6724           || pSlot>=(pCArray->apCell[i]+sz)
  6725           || CORRUPT_DB );
  6726      memmove(pSlot, pCArray->apCell[i], sz);
  6727      put2byte(pCellptr, (pSlot - aData));
  6728      pCellptr += 2;
  6729    }
  6730    *ppData = pData;
  6731    return 0;
  6732  }
  6733  
  6734  /*
  6735  ** Array apCell[] contains nCell pointers to b-tree cells. Array szCell 
  6736  ** contains the size in bytes of each such cell. This function adds the
  6737  ** space associated with each cell in the array that is currently stored 
  6738  ** within the body of pPg to the pPg free-list. The cell-pointers and other
  6739  ** fields of the page are not updated.
  6740  **
  6741  ** This function returns the total number of cells added to the free-list.
  6742  */
  6743  static int pageFreeArray(
  6744    MemPage *pPg,                   /* Page to edit */
  6745    int iFirst,                     /* First cell to delete */
  6746    int nCell,                      /* Cells to delete */
  6747    CellArray *pCArray              /* Array of cells */
  6748  ){
  6749    u8 * const aData = pPg->aData;
  6750    u8 * const pEnd = &aData[pPg->pBt->usableSize];
  6751    u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize];
  6752    int nRet = 0;
  6753    int i;
  6754    int iEnd = iFirst + nCell;
  6755    u8 *pFree = 0;
  6756    int szFree = 0;
  6757  
  6758    for(i=iFirst; i<iEnd; i++){
  6759      u8 *pCell = pCArray->apCell[i];
  6760      if( SQLITE_WITHIN(pCell, pStart, pEnd) ){
  6761        int sz;
  6762        /* No need to use cachedCellSize() here.  The sizes of all cells that
  6763        ** are to be freed have already been computing while deciding which
  6764        ** cells need freeing */
  6765        sz = pCArray->szCell[i];  assert( sz>0 );
  6766        if( pFree!=(pCell + sz) ){
  6767          if( pFree ){
  6768            assert( pFree>aData && (pFree - aData)<65536 );
  6769            freeSpace(pPg, (u16)(pFree - aData), szFree);
  6770          }
  6771          pFree = pCell;
  6772          szFree = sz;
  6773          if( pFree+sz>pEnd ) return 0;
  6774        }else{
  6775          pFree = pCell;
  6776          szFree += sz;
  6777        }
  6778        nRet++;
  6779      }
  6780    }
  6781    if( pFree ){
  6782      assert( pFree>aData && (pFree - aData)<65536 );
  6783      freeSpace(pPg, (u16)(pFree - aData), szFree);
  6784    }
  6785    return nRet;
  6786  }
  6787  
  6788  /*
  6789  ** apCell[] and szCell[] contains pointers to and sizes of all cells in the
  6790  ** pages being balanced.  The current page, pPg, has pPg->nCell cells starting
  6791  ** with apCell[iOld].  After balancing, this page should hold nNew cells
  6792  ** starting at apCell[iNew].
  6793  **
  6794  ** This routine makes the necessary adjustments to pPg so that it contains
  6795  ** the correct cells after being balanced.
  6796  **
  6797  ** The pPg->nFree field is invalid when this function returns. It is the
  6798  ** responsibility of the caller to set it correctly.
  6799  */
  6800  static int editPage(
  6801    MemPage *pPg,                   /* Edit this page */
  6802    int iOld,                       /* Index of first cell currently on page */
  6803    int iNew,                       /* Index of new first cell on page */
  6804    int nNew,                       /* Final number of cells on page */
  6805    CellArray *pCArray              /* Array of cells and sizes */
  6806  ){
  6807    u8 * const aData = pPg->aData;
  6808    const int hdr = pPg->hdrOffset;
  6809    u8 *pBegin = &pPg->aCellIdx[nNew * 2];
  6810    int nCell = pPg->nCell;       /* Cells stored on pPg */
  6811    u8 *pData;
  6812    u8 *pCellptr;
  6813    int i;
  6814    int iOldEnd = iOld + pPg->nCell + pPg->nOverflow;
  6815    int iNewEnd = iNew + nNew;
  6816  
  6817  #ifdef SQLITE_DEBUG
  6818    u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
  6819    memcpy(pTmp, aData, pPg->pBt->usableSize);
  6820  #endif
  6821  
  6822    /* Remove cells from the start and end of the page */
  6823    if( iOld<iNew ){
  6824      int nShift = pageFreeArray(pPg, iOld, iNew-iOld, pCArray);
  6825      memmove(pPg->aCellIdx, &pPg->aCellIdx[nShift*2], nCell*2);
  6826      nCell -= nShift;
  6827    }
  6828    if( iNewEnd < iOldEnd ){
  6829      nCell -= pageFreeArray(pPg, iNewEnd, iOldEnd - iNewEnd, pCArray);
  6830    }
  6831  
  6832    pData = &aData[get2byteNotZero(&aData[hdr+5])];
  6833    if( pData<pBegin ) goto editpage_fail;
  6834  
  6835    /* Add cells to the start of the page */
  6836    if( iNew<iOld ){
  6837      int nAdd = MIN(nNew,iOld-iNew);
  6838      assert( (iOld-iNew)<nNew || nCell==0 || CORRUPT_DB );
  6839      pCellptr = pPg->aCellIdx;
  6840      memmove(&pCellptr[nAdd*2], pCellptr, nCell*2);
  6841      if( pageInsertArray(
  6842            pPg, pBegin, &pData, pCellptr,
  6843            iNew, nAdd, pCArray
  6844      ) ) goto editpage_fail;
  6845      nCell += nAdd;
  6846    }
  6847  
  6848    /* Add any overflow cells */
  6849    for(i=0; i<pPg->nOverflow; i++){
  6850      int iCell = (iOld + pPg->aiOvfl[i]) - iNew;
  6851      if( iCell>=0 && iCell<nNew ){
  6852        pCellptr = &pPg->aCellIdx[iCell * 2];
  6853        memmove(&pCellptr[2], pCellptr, (nCell - iCell) * 2);
  6854        nCell++;
  6855        if( pageInsertArray(
  6856              pPg, pBegin, &pData, pCellptr,
  6857              iCell+iNew, 1, pCArray
  6858        ) ) goto editpage_fail;
  6859      }
  6860    }
  6861  
  6862    /* Append cells to the end of the page */
  6863    pCellptr = &pPg->aCellIdx[nCell*2];
  6864    if( pageInsertArray(
  6865          pPg, pBegin, &pData, pCellptr,
  6866          iNew+nCell, nNew-nCell, pCArray
  6867    ) ) goto editpage_fail;
  6868  
  6869    pPg->nCell = nNew;
  6870    pPg->nOverflow = 0;
  6871  
  6872    put2byte(&aData[hdr+3], pPg->nCell);
  6873    put2byte(&aData[hdr+5], pData - aData);
  6874  
  6875  #ifdef SQLITE_DEBUG
  6876    for(i=0; i<nNew && !CORRUPT_DB; i++){
  6877      u8 *pCell = pCArray->apCell[i+iNew];
  6878      int iOff = get2byteAligned(&pPg->aCellIdx[i*2]);
  6879      if( SQLITE_WITHIN(pCell, aData, &aData[pPg->pBt->usableSize]) ){
  6880        pCell = &pTmp[pCell - aData];
  6881      }
  6882      assert( 0==memcmp(pCell, &aData[iOff],
  6883              pCArray->pRef->xCellSize(pCArray->pRef, pCArray->apCell[i+iNew])) );
  6884    }
  6885  #endif
  6886  
  6887    return SQLITE_OK;
  6888   editpage_fail:
  6889    /* Unable to edit this page. Rebuild it from scratch instead. */
  6890    populateCellCache(pCArray, iNew, nNew);
  6891    return rebuildPage(pPg, nNew, &pCArray->apCell[iNew], &pCArray->szCell[iNew]);
  6892  }
  6893  
/*
** The following parameters determine how many adjacent pages get involved
** in a balancing operation.  NN is the number of neighbors on either side
** of the page that participate in the balancing operation.  NB is the
** total number of pages that participate, including the target page and
** NN neighbors on either side.
**
** The minimum value of NN is 1 (of course).  Increasing NN above 1
** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
** in exchange for a larger degradation in INSERT and UPDATE performance.
** A value of 1 for NN (presumably the value this note was written about,
** since it is the setting used below) appears to give the best results
** overall.
*/
#define NN 1             /* Number of neighbors on either side of pPage */
#define NB (NN*2+1)      /* Total pages involved in the balance */
  6908  
  6909  
  6910  #ifndef SQLITE_OMIT_QUICKBALANCE
  6911  /*
  6912  ** This version of balance() handles the common special case where
  6913  ** a new entry is being inserted on the extreme right-end of the
  6914  ** tree, in other words, when the new entry will become the largest
  6915  ** entry in the tree.
  6916  **
  6917  ** Instead of trying to balance the 3 right-most leaf pages, just add
  6918  ** a new page to the right-hand side and put the one new entry in
  6919  ** that page.  This leaves the right side of the tree somewhat
  6920  ** unbalanced.  But odds are that we will be inserting new entries
  6921  ** at the end soon afterwards so the nearly empty page will quickly
  6922  ** fill up.  On average.
  6923  **
  6924  ** pPage is the leaf page which is the right-most page in the tree.
  6925  ** pParent is its parent.  pPage must have a single overflow entry
  6926  ** which is also the right-most entry on the page.
  6927  **
  6928  ** The pSpace buffer is used to store a temporary copy of the divider
  6929  ** cell that will be inserted into pParent. Such a cell consists of a 4
  6930  ** byte page number followed by a variable length integer. In other
  6931  ** words, at most 13 bytes. Hence the pSpace buffer must be at
  6932  ** least 13 bytes in size.
  6933  */
  6934  static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){
  6935    BtShared *const pBt = pPage->pBt;    /* B-Tree Database */
  6936    MemPage *pNew;                       /* Newly allocated page */
  6937    int rc;                              /* Return Code */
  6938    Pgno pgnoNew;                        /* Page number of pNew */
  6939  
  6940    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  6941    assert( sqlite3PagerIswriteable(pParent->pDbPage) );
  6942    assert( pPage->nOverflow==1 );
  6943  
  6944    /* This error condition is now caught prior to reaching this function */
  6945    if( NEVER(pPage->nCell==0) ) return SQLITE_CORRUPT_BKPT;
  6946  
  6947    /* Allocate a new page. This page will become the right-sibling of 
  6948    ** pPage. Make the parent page writable, so that the new divider cell
  6949    ** may be inserted. If both these operations are successful, proceed.
  6950    */
  6951    rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
  6952  
  6953    if( rc==SQLITE_OK ){
  6954  
  6955      u8 *pOut = &pSpace[4];
  6956      u8 *pCell = pPage->apOvfl[0];
  6957      u16 szCell = pPage->xCellSize(pPage, pCell);
  6958      u8 *pStop;
  6959  
  6960      assert( sqlite3PagerIswriteable(pNew->pDbPage) );
  6961      assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) );
  6962      zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF);
  6963      rc = rebuildPage(pNew, 1, &pCell, &szCell);
  6964      if( NEVER(rc) ) return rc;
  6965      pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell;
  6966  
  6967      /* If this is an auto-vacuum database, update the pointer map
  6968      ** with entries for the new page, and any pointer from the 
  6969      ** cell on the page to an overflow page. If either of these
  6970      ** operations fails, the return code is set, but the contents
  6971      ** of the parent page are still manipulated by thh code below.
  6972      ** That is Ok, at this point the parent page is guaranteed to
  6973      ** be marked as dirty. Returning an error code will cause a
  6974      ** rollback, undoing any changes made to the parent page.
  6975      */
  6976      if( ISAUTOVACUUM ){
  6977        ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc);
  6978        if( szCell>pNew->minLocal ){
  6979          ptrmapPutOvflPtr(pNew, pCell, &rc);
  6980        }
  6981      }
  6982    
  6983      /* Create a divider cell to insert into pParent. The divider cell
  6984      ** consists of a 4-byte page number (the page number of pPage) and
  6985      ** a variable length key value (which must be the same value as the
  6986      ** largest key on pPage).
  6987      **
  6988      ** To find the largest key value on pPage, first find the right-most 
  6989      ** cell on pPage. The first two fields of this cell are the 
  6990      ** record-length (a variable length integer at most 32-bits in size)
  6991      ** and the key value (a variable length integer, may have any value).
  6992      ** The first of the while(...) loops below skips over the record-length
  6993      ** field. The second while(...) loop copies the key value from the
  6994      ** cell on pPage into the pSpace buffer.
  6995      */
  6996      pCell = findCell(pPage, pPage->nCell-1);
  6997      pStop = &pCell[9];
  6998      while( (*(pCell++)&0x80) && pCell<pStop );
  6999      pStop = &pCell[9];
  7000      while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop );
  7001  
  7002      /* Insert the new divider cell into pParent. */
  7003      if( rc==SQLITE_OK ){
  7004        insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace),
  7005                     0, pPage->pgno, &rc);
  7006      }
  7007  
  7008      /* Set the right-child pointer of pParent to point to the new page. */
  7009      put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
  7010    
  7011      /* Release the reference to the new page. */
  7012      releasePage(pNew);
  7013    }
  7014  
  7015    return rc;
  7016  }
  7017  #endif /* SQLITE_OMIT_QUICKBALANCE */
  7018  
  7019  #if 0
  7020  /*
  7021  ** This function does not contribute anything to the operation of SQLite.
  7022  ** it is sometimes activated temporarily while debugging code responsible 
  7023  ** for setting pointer-map entries.
  7024  */
  7025  static int ptrmapCheckPages(MemPage **apPage, int nPage){
  7026    int i, j;
  7027    for(i=0; i<nPage; i++){
  7028      Pgno n;
  7029      u8 e;
  7030      MemPage *pPage = apPage[i];
  7031      BtShared *pBt = pPage->pBt;
  7032      assert( pPage->isInit );
  7033  
  7034      for(j=0; j<pPage->nCell; j++){
  7035        CellInfo info;
  7036        u8 *z;
  7037       
  7038        z = findCell(pPage, j);
  7039        pPage->xParseCell(pPage, z, &info);
  7040        if( info.nLocal<info.nPayload ){
  7041          Pgno ovfl = get4byte(&z[info.nSize-4]);
  7042          ptrmapGet(pBt, ovfl, &e, &n);
  7043          assert( n==pPage->pgno && e==PTRMAP_OVERFLOW1 );
  7044        }
  7045        if( !pPage->leaf ){
  7046          Pgno child = get4byte(z);
  7047          ptrmapGet(pBt, child, &e, &n);
  7048          assert( n==pPage->pgno && e==PTRMAP_BTREE );
  7049        }
  7050      }
  7051      if( !pPage->leaf ){
  7052        Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]);
  7053        ptrmapGet(pBt, child, &e, &n);
  7054        assert( n==pPage->pgno && e==PTRMAP_BTREE );
  7055      }
  7056    }
  7057    return 1;
  7058  }
  7059  #endif
  7060  
  7061  /*
  7062  ** This function is used to copy the contents of the b-tree node stored 
  7063  ** on page pFrom to page pTo. If page pFrom was not a leaf page, then
  7064  ** the pointer-map entries for each child page are updated so that the
  7065  ** parent page stored in the pointer map is page pTo. If pFrom contained
  7066  ** any cells with overflow page pointers, then the corresponding pointer
  7067  ** map entries are also updated so that the parent page is page pTo.
  7068  **
  7069  ** If pFrom is currently carrying any overflow cells (entries in the
  7070  ** MemPage.apOvfl[] array), they are not copied to pTo. 
  7071  **
  7072  ** Before returning, page pTo is reinitialized using btreeInitPage().
  7073  **
  7074  ** The performance of this function is not critical. It is only used by 
  7075  ** the balance_shallower() and balance_deeper() procedures, neither of
  7076  ** which are called often under normal circumstances.
  7077  */
  7078  static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){
  7079    if( (*pRC)==SQLITE_OK ){
  7080      BtShared * const pBt = pFrom->pBt;
  7081      u8 * const aFrom = pFrom->aData;
  7082      u8 * const aTo = pTo->aData;
  7083      int const iFromHdr = pFrom->hdrOffset;
  7084      int const iToHdr = ((pTo->pgno==1) ? 100 : 0);
  7085      int rc;
  7086      int iData;
  7087    
  7088    
  7089      assert( pFrom->isInit );
  7090      assert( pFrom->nFree>=iToHdr );
  7091      assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize );
  7092    
  7093      /* Copy the b-tree node content from page pFrom to page pTo. */
  7094      iData = get2byte(&aFrom[iFromHdr+5]);
  7095      memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData);
  7096      memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell);
  7097    
  7098      /* Reinitialize page pTo so that the contents of the MemPage structure
  7099      ** match the new data. The initialization of pTo can actually fail under
  7100      ** fairly obscure circumstances, even though it is a copy of initialized 
  7101      ** page pFrom.
  7102      */
  7103      pTo->isInit = 0;
  7104      rc = btreeInitPage(pTo);
  7105      if( rc!=SQLITE_OK ){
  7106        *pRC = rc;
  7107        return;
  7108      }
  7109    
  7110      /* If this is an auto-vacuum database, update the pointer-map entries
  7111      ** for any b-tree or overflow pages that pTo now contains the pointers to.
  7112      */
  7113      if( ISAUTOVACUUM ){
  7114        *pRC = setChildPtrmaps(pTo);
  7115      }
  7116    }
  7117  }
  7118  
  7119  /*
  7120  ** This routine redistributes cells on the iParentIdx'th child of pParent
  7121  ** (hereafter "the page") and up to 2 siblings so that all pages have about the
  7122  ** same amount of free space. Usually a single sibling on either side of the
  7123  ** page is used in the balancing, though both siblings might come from one
  7124  ** side if the page is the first or last child of its parent. If the page 
  7125  ** has fewer than 2 siblings (something which can only happen if the page
  7126  ** is a root page or a child of a root page) then all available siblings
  7127  ** participate in the balancing.
  7128  **
  7129  ** The number of siblings of the page might be increased or decreased by 
  7130  ** one or two in an effort to keep pages nearly full but not over full. 
  7131  **
  7132  ** Note that when this routine is called, some of the cells on the page
  7133  ** might not actually be stored in MemPage.aData[]. This can happen
  7134  ** if the page is overfull. This routine ensures that all cells allocated
  7135  ** to the page and its siblings fit into MemPage.aData[] before returning.
  7136  **
  7137  ** In the course of balancing the page and its siblings, cells may be
  7138  ** inserted into or removed from the parent page (pParent). Doing so
  7139  ** may cause the parent page to become overfull or underfull. If this
  7140  ** happens, it is the responsibility of the caller to invoke the correct
  7141  ** balancing routine to fix this problem (see the balance() routine). 
  7142  **
  7143  ** If this routine fails for any reason, it might leave the database
  7144  ** in a corrupted state. So if this routine fails, the database should
  7145  ** be rolled back.
  7146  **
  7147  ** The third argument to this function, aOvflSpace, is a pointer to a
  7148  ** buffer big enough to hold one page. If while inserting cells into the parent
  7149  ** page (pParent) the parent page becomes overfull, this buffer is
  7150  ** used to store the parent's overflow cells. Because this function inserts
  7151  ** a maximum of four divider cells into the parent page, and the maximum
  7152  ** size of a cell stored within an internal node is always less than 1/4
  7153  ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large
  7154  ** enough for all overflow cells.
  7155  **
  7156  ** If aOvflSpace is set to a null pointer, this function returns 
  7157  ** SQLITE_NOMEM.
  7158  */
  7159  static int balance_nonroot(
  7160    MemPage *pParent,               /* Parent page of siblings being balanced */
  7161    int iParentIdx,                 /* Index of "the page" in pParent */
  7162    u8 *aOvflSpace,                 /* page-size bytes of space for parent ovfl */
  7163    int isRoot,                     /* True if pParent is a root-page */
  7164    int bBulk                       /* True if this call is part of a bulk load */
  7165  ){
  7166    BtShared *pBt;               /* The whole database */
  7167    int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
  7168    int nNew = 0;                /* Number of pages in apNew[] */
  7169    int nOld;                    /* Number of pages in apOld[] */
  7170    int i, j, k;                 /* Loop counters */
  7171    int nxDiv;                   /* Next divider slot in pParent->aCell[] */
  7172    int rc = SQLITE_OK;          /* The return code */
  7173    u16 leafCorrection;          /* 4 if pPage is a leaf.  0 if not */
  7174    int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */
  7175    int usableSpace;             /* Bytes in pPage beyond the header */
  7176    int pageFlags;               /* Value of pPage->aData[0] */
  7177    int iSpace1 = 0;             /* First unused byte of aSpace1[] */
  7178    int iOvflSpace = 0;          /* First unused byte of aOvflSpace[] */
  7179    int szScratch;               /* Size of scratch memory requested */
  7180    MemPage *apOld[NB];          /* pPage and up to two siblings */
  7181    MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
  7182    u8 *pRight;                  /* Location in parent of right-sibling pointer */
  7183    u8 *apDiv[NB-1];             /* Divider cells in pParent */
  7184    int cntNew[NB+2];            /* Index in b.paCell[] of cell after i-th page */
  7185    int cntOld[NB+2];            /* Old index in b.apCell[] */
  7186    int szNew[NB+2];             /* Combined size of cells placed on i-th page */
  7187    u8 *aSpace1;                 /* Space for copies of dividers cells */
  7188    Pgno pgno;                   /* Temp var to store a page number in */
  7189    u8 abDone[NB+2];             /* True after i'th new page is populated */
  7190    Pgno aPgno[NB+2];            /* Page numbers of new pages before shuffling */
  7191    Pgno aPgOrder[NB+2];         /* Copy of aPgno[] used for sorting pages */
  7192    u16 aPgFlags[NB+2];          /* flags field of new pages before shuffling */
  7193    CellArray b;                  /* Parsed information on cells being balanced */
  7194  
  7195    memset(abDone, 0, sizeof(abDone));
  7196    b.nCell = 0;
  7197    b.apCell = 0;
  7198    pBt = pParent->pBt;
  7199    assert( sqlite3_mutex_held(pBt->mutex) );
  7200    assert( sqlite3PagerIswriteable(pParent->pDbPage) );
  7201  
  7202  #if 0
  7203    TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
  7204  #endif
  7205  
  7206    /* At this point pParent may have at most one overflow cell. And if
  7207    ** this overflow cell is present, it must be the cell with 
  7208    ** index iParentIdx. This scenario comes about when this function
  7209    ** is called (indirectly) from sqlite3BtreeDelete().
  7210    */
  7211    assert( pParent->nOverflow==0 || pParent->nOverflow==1 );
  7212    assert( pParent->nOverflow==0 || pParent->aiOvfl[0]==iParentIdx );
  7213  
  7214    if( !aOvflSpace ){
  7215      return SQLITE_NOMEM_BKPT;
  7216    }
  7217  
  7218    /* Find the sibling pages to balance. Also locate the cells in pParent 
  7219    ** that divide the siblings. An attempt is made to find NN siblings on 
  7220    ** either side of pPage. More siblings are taken from one side, however, 
  7221    ** if there are fewer than NN siblings on the other side. If pParent
  7222    ** has NB or fewer children then all children of pParent are taken.  
  7223    **
  7224    ** This loop also drops the divider cells from the parent page. This
  7225    ** way, the remainder of the function does not have to deal with any
  7226    ** overflow cells in the parent page, since if any existed they will
  7227    ** have already been removed.
  7228    */
  7229    i = pParent->nOverflow + pParent->nCell;
  7230    if( i<2 ){
  7231      nxDiv = 0;
  7232    }else{
  7233      assert( bBulk==0 || bBulk==1 );
  7234      if( iParentIdx==0 ){                 
  7235        nxDiv = 0;
  7236      }else if( iParentIdx==i ){
  7237        nxDiv = i-2+bBulk;
  7238      }else{
  7239        nxDiv = iParentIdx-1;
  7240      }
  7241      i = 2-bBulk;
  7242    }
  7243    nOld = i+1;
  7244    if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){
  7245      pRight = &pParent->aData[pParent->hdrOffset+8];
  7246    }else{
  7247      pRight = findCell(pParent, i+nxDiv-pParent->nOverflow);
  7248    }
  7249    pgno = get4byte(pRight);
  7250    while( 1 ){
  7251      rc = getAndInitPage(pBt, pgno, &apOld[i], 0, 0);
  7252      if( rc ){
  7253        memset(apOld, 0, (i+1)*sizeof(MemPage*));
  7254        goto balance_cleanup;
  7255      }
  7256      nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
  7257      if( (i--)==0 ) break;
  7258  
  7259      if( pParent->nOverflow && i+nxDiv==pParent->aiOvfl[0] ){
  7260        apDiv[i] = pParent->apOvfl[0];
  7261        pgno = get4byte(apDiv[i]);
  7262        szNew[i] = pParent->xCellSize(pParent, apDiv[i]);
  7263        pParent->nOverflow = 0;
  7264      }else{
  7265        apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow);
  7266        pgno = get4byte(apDiv[i]);
  7267        szNew[i] = pParent->xCellSize(pParent, apDiv[i]);
  7268  
  7269        /* Drop the cell from the parent page. apDiv[i] still points to
  7270        ** the cell within the parent, even though it has been dropped.
  7271        ** This is safe because dropping a cell only overwrites the first
  7272        ** four bytes of it, and this function does not need the first
  7273        ** four bytes of the divider cell. So the pointer is safe to use
  7274        ** later on.  
  7275        **
  7276        ** But not if we are in secure-delete mode. In secure-delete mode,
  7277        ** the dropCell() routine will overwrite the entire cell with zeroes.
  7278        ** In this case, temporarily copy the cell into the aOvflSpace[]
  7279        ** buffer. It will be copied out again as soon as the aSpace[] buffer
  7280        ** is allocated.  */
  7281        if( pBt->btsFlags & BTS_FAST_SECURE ){
  7282          int iOff;
  7283  
  7284          iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData);
  7285          if( (iOff+szNew[i])>(int)pBt->usableSize ){
  7286            rc = SQLITE_CORRUPT_BKPT;
  7287            memset(apOld, 0, (i+1)*sizeof(MemPage*));
  7288            goto balance_cleanup;
  7289          }else{
  7290            memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]);
  7291            apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData];
  7292          }
  7293        }
  7294        dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc);
  7295      }
  7296    }
  7297  
  7298    /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
  7299    ** alignment */
  7300    nMaxCells = (nMaxCells + 3)&~3;
  7301  
  7302    /*
  7303    ** Allocate space for memory structures
  7304    */
  7305    szScratch =
  7306         nMaxCells*sizeof(u8*)                       /* b.apCell */
  7307       + nMaxCells*sizeof(u16)                       /* b.szCell */
  7308       + pBt->pageSize;                              /* aSpace1 */
  7309  
  7310    assert( szScratch<=6*(int)pBt->pageSize );
  7311    b.apCell = sqlite3StackAllocRaw(0, szScratch );
  7312    if( b.apCell==0 ){
  7313      rc = SQLITE_NOMEM_BKPT;
  7314      goto balance_cleanup;
  7315    }
  7316    b.szCell = (u16*)&b.apCell[nMaxCells];
  7317    aSpace1 = (u8*)&b.szCell[nMaxCells];
  7318    assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );
  7319  
  7320    /*
  7321    ** Load pointers to all cells on sibling pages and the divider cells
  7322    ** into the local b.apCell[] array.  Make copies of the divider cells
  7323    ** into space obtained from aSpace1[]. The divider cells have already
  7324    ** been removed from pParent.
  7325    **
  7326    ** If the siblings are on leaf pages, then the child pointers of the
  7327    ** divider cells are stripped from the cells before they are copied
  7328    ** into aSpace1[].  In this way, all cells in b.apCell[] are without
  7329    ** child pointers.  If siblings are not leaves, then all cell in
  7330    ** b.apCell[] include child pointers.  Either way, all cells in b.apCell[]
  7331    ** are alike.
  7332    **
  7333    ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
  7334    **       leafData:  1 if pPage holds key+data and pParent holds only keys.
  7335    */
  7336    b.pRef = apOld[0];
  7337    leafCorrection = b.pRef->leaf*4;
  7338    leafData = b.pRef->intKeyLeaf;
  7339    for(i=0; i<nOld; i++){
  7340      MemPage *pOld = apOld[i];
  7341      int limit = pOld->nCell;
  7342      u8 *aData = pOld->aData;
  7343      u16 maskPage = pOld->maskPage;
  7344      u8 *piCell = aData + pOld->cellOffset;
  7345      u8 *piEnd;
  7346  
  7347      /* Verify that all sibling pages are of the same "type" (table-leaf,
  7348      ** table-interior, index-leaf, or index-interior).
  7349      */
  7350      if( pOld->aData[0]!=apOld[0]->aData[0] ){
  7351        rc = SQLITE_CORRUPT_BKPT;
  7352        goto balance_cleanup;
  7353      }
  7354  
  7355      /* Load b.apCell[] with pointers to all cells in pOld.  If pOld
  7356      ** constains overflow cells, include them in the b.apCell[] array
  7357      ** in the correct spot.
  7358      **
  7359      ** Note that when there are multiple overflow cells, it is always the
  7360      ** case that they are sequential and adjacent.  This invariant arises
  7361      ** because multiple overflows can only occurs when inserting divider
  7362      ** cells into a parent on a prior balance, and divider cells are always
  7363      ** adjacent and are inserted in order.  There is an assert() tagged
  7364      ** with "NOTE 1" in the overflow cell insertion loop to prove this
  7365      ** invariant.
  7366      **
  7367      ** This must be done in advance.  Once the balance starts, the cell
  7368      ** offset section of the btree page will be overwritten and we will no
  7369      ** long be able to find the cells if a pointer to each cell is not saved
  7370      ** first.
  7371      */
  7372      memset(&b.szCell[b.nCell], 0, sizeof(b.szCell[0])*(limit+pOld->nOverflow));
  7373      if( pOld->nOverflow>0 ){
  7374        limit = pOld->aiOvfl[0];
  7375        for(j=0; j<limit; j++){
  7376          b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));
  7377          piCell += 2;
  7378          b.nCell++;
  7379        }
  7380        for(k=0; k<pOld->nOverflow; k++){
  7381          assert( k==0 || pOld->aiOvfl[k-1]+1==pOld->aiOvfl[k] );/* NOTE 1 */
  7382          b.apCell[b.nCell] = pOld->apOvfl[k];
  7383          b.nCell++;
  7384        }
  7385      }
  7386      piEnd = aData + pOld->cellOffset + 2*pOld->nCell;
  7387      while( piCell<piEnd ){
  7388        assert( b.nCell<nMaxCells );
  7389        b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));
  7390        piCell += 2;
  7391        b.nCell++;
  7392      }
  7393  
  7394      cntOld[i] = b.nCell;
  7395      if( i<nOld-1 && !leafData){
  7396        u16 sz = (u16)szNew[i];
  7397        u8 *pTemp;
  7398        assert( b.nCell<nMaxCells );
  7399        b.szCell[b.nCell] = sz;
  7400        pTemp = &aSpace1[iSpace1];
  7401        iSpace1 += sz;
  7402        assert( sz<=pBt->maxLocal+23 );
  7403        assert( iSpace1 <= (int)pBt->pageSize );
  7404        memcpy(pTemp, apDiv[i], sz);
  7405        b.apCell[b.nCell] = pTemp+leafCorrection;
  7406        assert( leafCorrection==0 || leafCorrection==4 );
  7407        b.szCell[b.nCell] = b.szCell[b.nCell] - leafCorrection;
  7408        if( !pOld->leaf ){
  7409          assert( leafCorrection==0 );
  7410          assert( pOld->hdrOffset==0 );
  7411          /* The right pointer of the child page pOld becomes the left
  7412          ** pointer of the divider cell */
  7413          memcpy(b.apCell[b.nCell], &pOld->aData[8], 4);
  7414        }else{
  7415          assert( leafCorrection==4 );
  7416          while( b.szCell[b.nCell]<4 ){
  7417            /* Do not allow any cells smaller than 4 bytes. If a smaller cell
  7418            ** does exist, pad it with 0x00 bytes. */
  7419            assert( b.szCell[b.nCell]==3 || CORRUPT_DB );
  7420            assert( b.apCell[b.nCell]==&aSpace1[iSpace1-3] || CORRUPT_DB );
  7421            aSpace1[iSpace1++] = 0x00;
  7422            b.szCell[b.nCell]++;
  7423          }
  7424        }
  7425        b.nCell++;
  7426      }
  7427    }
  7428  
  7429    /*
  7430    ** Figure out the number of pages needed to hold all b.nCell cells.
  7431    ** Store this number in "k".  Also compute szNew[] which is the total
  7432    ** size of all cells on the i-th page and cntNew[] which is the index
  7433    ** in b.apCell[] of the cell that divides page i from page i+1.  
  7434    ** cntNew[k] should equal b.nCell.
  7435    **
  7436    ** Values computed by this block:
  7437    **
  7438    **           k: The total number of sibling pages
  7439    **    szNew[i]: Space used on the i-th sibling page.
  7440    **   cntNew[i]: Index in b.apCell[] and b.szCell[] for the first cell to
  7441    **              the right of the i-th sibling page.
  7442    ** usableSpace: Number of bytes of space available on each sibling.
  7443    ** 
  7444    */
  7445    usableSpace = pBt->usableSize - 12 + leafCorrection;
  7446    for(i=0; i<nOld; i++){
  7447      MemPage *p = apOld[i];
  7448      szNew[i] = usableSpace - p->nFree;
  7449      for(j=0; j<p->nOverflow; j++){
  7450        szNew[i] += 2 + p->xCellSize(p, p->apOvfl[j]);
  7451      }
  7452      cntNew[i] = cntOld[i];
  7453    }
  7454    k = nOld;
  7455    for(i=0; i<k; i++){
  7456      int sz;
  7457      while( szNew[i]>usableSpace ){
  7458        if( i+1>=k ){
  7459          k = i+2;
  7460          if( k>NB+2 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }
  7461          szNew[k-1] = 0;
  7462          cntNew[k-1] = b.nCell;
  7463        }
  7464        sz = 2 + cachedCellSize(&b, cntNew[i]-1);
  7465        szNew[i] -= sz;
  7466        if( !leafData ){
  7467          if( cntNew[i]<b.nCell ){
  7468            sz = 2 + cachedCellSize(&b, cntNew[i]);
  7469          }else{
  7470            sz = 0;
  7471          }
  7472        }
  7473        szNew[i+1] += sz;
  7474        cntNew[i]--;
  7475      }
  7476      while( cntNew[i]<b.nCell ){
  7477        sz = 2 + cachedCellSize(&b, cntNew[i]);
  7478        if( szNew[i]+sz>usableSpace ) break;
  7479        szNew[i] += sz;
  7480        cntNew[i]++;
  7481        if( !leafData ){
  7482          if( cntNew[i]<b.nCell ){
  7483            sz = 2 + cachedCellSize(&b, cntNew[i]);
  7484          }else{
  7485            sz = 0;
  7486          }
  7487        }
  7488        szNew[i+1] -= sz;
  7489      }
  7490      if( cntNew[i]>=b.nCell ){
  7491        k = i+1;
  7492      }else if( cntNew[i] <= (i>0 ? cntNew[i-1] : 0) ){
  7493        rc = SQLITE_CORRUPT_BKPT;
  7494        goto balance_cleanup;
  7495      }
  7496    }
  7497  
  7498    /*
  7499    ** The packing computed by the previous block is biased toward the siblings
  7500    ** on the left side (siblings with smaller keys). The left siblings are
  7501    ** always nearly full, while the right-most sibling might be nearly empty.
  7502    ** The next block of code attempts to adjust the packing of siblings to
  7503    ** get a better balance.
  7504    **
  7505    ** This adjustment is more than an optimization.  The packing above might
  7506    ** be so out of balance as to be illegal.  For example, the right-most
  7507    ** sibling might be completely empty.  This adjustment is not optional.
  7508    */
  7509    for(i=k-1; i>0; i--){
  7510      int szRight = szNew[i];  /* Size of sibling on the right */
  7511      int szLeft = szNew[i-1]; /* Size of sibling on the left */
  7512      int r;              /* Index of right-most cell in left sibling */
  7513      int d;              /* Index of first cell to the left of right sibling */
  7514  
  7515      r = cntNew[i-1] - 1;
  7516      d = r + 1 - leafData;
  7517      (void)cachedCellSize(&b, d);
  7518      do{
  7519        assert( d<nMaxCells );
  7520        assert( r<nMaxCells );
  7521        (void)cachedCellSize(&b, r);
  7522        if( szRight!=0
  7523         && (bBulk || szRight+b.szCell[d]+2 > szLeft-(b.szCell[r]+(i==k-1?0:2)))){
  7524          break;
  7525        }
  7526        szRight += b.szCell[d] + 2;
  7527        szLeft -= b.szCell[r] + 2;
  7528        cntNew[i-1] = r;
  7529        r--;
  7530        d--;
  7531      }while( r>=0 );
  7532      szNew[i] = szRight;
  7533      szNew[i-1] = szLeft;
  7534      if( cntNew[i-1] <= (i>1 ? cntNew[i-2] : 0) ){
  7535        rc = SQLITE_CORRUPT_BKPT;
  7536        goto balance_cleanup;
  7537      }
  7538    }
  7539  
  7540    /* Sanity check:  For a non-corrupt database file one of the following
  7541    ** must be true:
  7542    **    (1) We found one or more cells (cntNew[0]>0), or
  7543    **    (2) pPage is a virtual root page.  A virtual root page is when
  7544    **        the real root page is page 1 and we are the only child of
  7545    **        that page.
  7546    */
  7547    assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) || CORRUPT_DB);
  7548    TRACE(("BALANCE: old: %d(nc=%d) %d(nc=%d) %d(nc=%d)\n",
  7549      apOld[0]->pgno, apOld[0]->nCell,
  7550      nOld>=2 ? apOld[1]->pgno : 0, nOld>=2 ? apOld[1]->nCell : 0,
  7551      nOld>=3 ? apOld[2]->pgno : 0, nOld>=3 ? apOld[2]->nCell : 0
  7552    ));
  7553  
  7554    /*
  7555    ** Allocate k new pages.  Reuse old pages where possible.
  7556    */
  7557    pageFlags = apOld[0]->aData[0];
  7558    for(i=0; i<k; i++){
  7559      MemPage *pNew;
  7560      if( i<nOld ){
  7561        pNew = apNew[i] = apOld[i];
  7562        apOld[i] = 0;
  7563        rc = sqlite3PagerWrite(pNew->pDbPage);
  7564        nNew++;
  7565        if( rc ) goto balance_cleanup;
  7566      }else{
  7567        assert( i>0 );
  7568        rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);
  7569        if( rc ) goto balance_cleanup;
  7570        zeroPage(pNew, pageFlags);
  7571        apNew[i] = pNew;
  7572        nNew++;
  7573        cntOld[i] = b.nCell;
  7574  
  7575        /* Set the pointer-map entry for the new sibling page. */
  7576        if( ISAUTOVACUUM ){
  7577          ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
  7578          if( rc!=SQLITE_OK ){
  7579            goto balance_cleanup;
  7580          }
  7581        }
  7582      }
  7583    }
  7584  
  7585    /*
  7586    ** Reassign page numbers so that the new pages are in ascending order. 
  7587    ** This helps to keep entries in the disk file in order so that a scan
  7588    ** of the table is closer to a linear scan through the file. That in turn 
  7589    ** helps the operating system to deliver pages from the disk more rapidly.
  7590    **
  7591    ** An O(n^2) insertion sort algorithm is used, but since n is never more 
  7592    ** than (NB+2) (a small constant), that should not be a problem.
  7593    **
  7594    ** When NB==3, this one optimization makes the database about 25% faster 
  7595    ** for large insertions and deletions.
  7596    */
  7597    for(i=0; i<nNew; i++){
  7598      aPgOrder[i] = aPgno[i] = apNew[i]->pgno;
  7599      aPgFlags[i] = apNew[i]->pDbPage->flags;
  7600      for(j=0; j<i; j++){
  7601        if( aPgno[j]==aPgno[i] ){
  7602          /* This branch is taken if the set of sibling pages somehow contains
  7603          ** duplicate entries. This can happen if the database is corrupt. 
  7604          ** It would be simpler to detect this as part of the loop below, but
  7605          ** we do the detection here in order to avoid populating the pager
  7606          ** cache with two separate objects associated with the same
  7607          ** page number.  */
  7608          assert( CORRUPT_DB );
  7609          rc = SQLITE_CORRUPT_BKPT;
  7610          goto balance_cleanup;
  7611        }
  7612      }
  7613    }
  7614    for(i=0; i<nNew; i++){
  7615      int iBest = 0;                /* aPgno[] index of page number to use */
  7616      for(j=1; j<nNew; j++){
  7617        if( aPgOrder[j]<aPgOrder[iBest] ) iBest = j;
  7618      }
  7619      pgno = aPgOrder[iBest];
  7620      aPgOrder[iBest] = 0xffffffff;
  7621      if( iBest!=i ){
  7622        if( iBest>i ){
  7623          sqlite3PagerRekey(apNew[iBest]->pDbPage, pBt->nPage+iBest+1, 0);
  7624        }
  7625        sqlite3PagerRekey(apNew[i]->pDbPage, pgno, aPgFlags[iBest]);
  7626        apNew[i]->pgno = pgno;
  7627      }
  7628    }
  7629  
  7630    TRACE(("BALANCE: new: %d(%d nc=%d) %d(%d nc=%d) %d(%d nc=%d) "
  7631           "%d(%d nc=%d) %d(%d nc=%d)\n",
  7632      apNew[0]->pgno, szNew[0], cntNew[0],
  7633      nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,
  7634      nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0,
  7635      nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,
  7636      nNew>=3 ? cntNew[2] - cntNew[1] - !leafData : 0,
  7637      nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,
  7638      nNew>=4 ? cntNew[3] - cntNew[2] - !leafData : 0,
  7639      nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0,
  7640      nNew>=5 ? cntNew[4] - cntNew[3] - !leafData : 0
  7641    ));
  7642  
  7643    assert( sqlite3PagerIswriteable(pParent->pDbPage) );
  7644    put4byte(pRight, apNew[nNew-1]->pgno);
  7645  
  7646    /* If the sibling pages are not leaves, ensure that the right-child pointer
  7647    ** of the right-most new sibling page is set to the value that was 
  7648    ** originally in the same field of the right-most old sibling page. */
  7649    if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){
  7650      MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1];
  7651      memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4);
  7652    }
  7653  
  7654    /* Make any required updates to pointer map entries associated with 
  7655    ** cells stored on sibling pages following the balance operation. Pointer
  7656    ** map entries associated with divider cells are set by the insertCell()
  7657    ** routine. The associated pointer map entries are:
  7658    **
  7659    **   a) if the cell contains a reference to an overflow chain, the
  7660    **      entry associated with the first page in the overflow chain, and
  7661    **
  7662    **   b) if the sibling pages are not leaves, the child page associated
  7663    **      with the cell.
  7664    **
  7665    ** If the sibling pages are not leaves, then the pointer map entry 
  7666    ** associated with the right-child of each sibling may also need to be 
  7667    ** updated. This happens below, after the sibling pages have been 
  7668    ** populated, not here.
  7669    */
  7670    if( ISAUTOVACUUM ){
  7671      MemPage *pNew = apNew[0];
  7672      u8 *aOld = pNew->aData;
  7673      int cntOldNext = pNew->nCell + pNew->nOverflow;
  7674      int usableSize = pBt->usableSize;
  7675      int iNew = 0;
  7676      int iOld = 0;
  7677  
  7678      for(i=0; i<b.nCell; i++){
  7679        u8 *pCell = b.apCell[i];
  7680        if( i==cntOldNext ){
  7681          MemPage *pOld = (++iOld)<nNew ? apNew[iOld] : apOld[iOld];
  7682          cntOldNext += pOld->nCell + pOld->nOverflow + !leafData;
  7683          aOld = pOld->aData;
  7684        }
  7685        if( i==cntNew[iNew] ){
  7686          pNew = apNew[++iNew];
  7687          if( !leafData ) continue;
  7688        }
  7689  
  7690        /* Cell pCell is destined for new sibling page pNew. Originally, it
  7691        ** was either part of sibling page iOld (possibly an overflow cell), 
  7692        ** or else the divider cell to the left of sibling page iOld. So,
  7693        ** if sibling page iOld had the same page number as pNew, and if
  7694        ** pCell really was a part of sibling page iOld (not a divider or
  7695        ** overflow cell), we can skip updating the pointer map entries.  */
  7696        if( iOld>=nNew
  7697         || pNew->pgno!=aPgno[iOld]
  7698         || !SQLITE_WITHIN(pCell,aOld,&aOld[usableSize])
  7699        ){
  7700          if( !leafCorrection ){
  7701            ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc);
  7702          }
  7703          if( cachedCellSize(&b,i)>pNew->minLocal ){
  7704            ptrmapPutOvflPtr(pNew, pCell, &rc);
  7705          }
  7706          if( rc ) goto balance_cleanup;
  7707        }
  7708      }
  7709    }
  7710  
  7711    /* Insert new divider cells into pParent. */
  7712    for(i=0; i<nNew-1; i++){
  7713      u8 *pCell;
  7714      u8 *pTemp;
  7715      int sz;
  7716      MemPage *pNew = apNew[i];
  7717      j = cntNew[i];
  7718  
  7719      assert( j<nMaxCells );
  7720      assert( b.apCell[j]!=0 );
  7721      pCell = b.apCell[j];
  7722      sz = b.szCell[j] + leafCorrection;
  7723      pTemp = &aOvflSpace[iOvflSpace];
  7724      if( !pNew->leaf ){
  7725        memcpy(&pNew->aData[8], pCell, 4);
  7726      }else if( leafData ){
  7727        /* If the tree is a leaf-data tree, and the siblings are leaves, 
  7728        ** then there is no divider cell in b.apCell[]. Instead, the divider 
  7729        ** cell consists of the integer key for the right-most cell of 
  7730        ** the sibling-page assembled above only.
  7731        */
  7732        CellInfo info;
  7733        j--;
  7734        pNew->xParseCell(pNew, b.apCell[j], &info);
  7735        pCell = pTemp;
  7736        sz = 4 + putVarint(&pCell[4], info.nKey);
  7737        pTemp = 0;
  7738      }else{
  7739        pCell -= 4;
  7740        /* Obscure case for non-leaf-data trees: If the cell at pCell was
  7741        ** previously stored on a leaf node, and its reported size was 4
  7742        ** bytes, then it may actually be smaller than this 
  7743        ** (see btreeParseCellPtr(), 4 bytes is the minimum size of
  7744        ** any cell). But it is important to pass the correct size to 
  7745        ** insertCell(), so reparse the cell now.
  7746        **
  7747        ** This can only happen for b-trees used to evaluate "IN (SELECT ...)"
  7748        ** and WITHOUT ROWID tables with exactly one column which is the
  7749        ** primary key.
  7750        */
  7751        if( b.szCell[j]==4 ){
  7752          assert(leafCorrection==4);
  7753          sz = pParent->xCellSize(pParent, pCell);
  7754        }
  7755      }
  7756      iOvflSpace += sz;
  7757      assert( sz<=pBt->maxLocal+23 );
  7758      assert( iOvflSpace <= (int)pBt->pageSize );
  7759      insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno, &rc);
  7760      if( rc!=SQLITE_OK ) goto balance_cleanup;
  7761      assert( sqlite3PagerIswriteable(pParent->pDbPage) );
  7762    }
  7763  
  7764    /* Now update the actual sibling pages. The order in which they are updated
  7765    ** is important, as this code needs to avoid disrupting any page from which
  7766    ** cells may still to be read. In practice, this means:
  7767    **
  7768    **  (1) If cells are moving left (from apNew[iPg] to apNew[iPg-1])
  7769    **      then it is not safe to update page apNew[iPg] until after
  7770    **      the left-hand sibling apNew[iPg-1] has been updated.
  7771    **
  7772    **  (2) If cells are moving right (from apNew[iPg] to apNew[iPg+1])
  7773    **      then it is not safe to update page apNew[iPg] until after
  7774    **      the right-hand sibling apNew[iPg+1] has been updated.
  7775    **
  7776    ** If neither of the above apply, the page is safe to update.
  7777    **
  7778    ** The iPg value in the following loop starts at nNew-1, goes down
  7779    ** to 0, then back up to nNew-1 again, thus making two passes over
  7780    ** the pages.  On the initial downward pass, only condition (1) above
  7781    ** needs to be tested because (2) will always be true from the previous
  7782    ** step.  On the upward pass, both conditions are always true, so the
  7783    ** upwards pass simply processes pages that were missed on the downward
  7784    ** pass.
  7785    */
  7786    for(i=1-nNew; i<nNew; i++){
  7787      int iPg = i<0 ? -i : i;
  7788      assert( iPg>=0 && iPg<nNew );
  7789      if( abDone[iPg] ) continue;         /* Skip pages already processed */
  7790      if( i>=0                            /* On the upwards pass, or... */
  7791       || cntOld[iPg-1]>=cntNew[iPg-1]    /* Condition (1) is true */
  7792      ){
  7793        int iNew;
  7794        int iOld;
  7795        int nNewCell;
  7796  
  7797        /* Verify condition (1):  If cells are moving left, update iPg
  7798        ** only after iPg-1 has already been updated. */
  7799        assert( iPg==0 || cntOld[iPg-1]>=cntNew[iPg-1] || abDone[iPg-1] );
  7800  
  7801        /* Verify condition (2):  If cells are moving right, update iPg
  7802        ** only after iPg+1 has already been updated. */
  7803        assert( cntNew[iPg]>=cntOld[iPg] || abDone[iPg+1] );
  7804  
  7805        if( iPg==0 ){
  7806          iNew = iOld = 0;
  7807          nNewCell = cntNew[0];
  7808        }else{
  7809          iOld = iPg<nOld ? (cntOld[iPg-1] + !leafData) : b.nCell;
  7810          iNew = cntNew[iPg-1] + !leafData;
  7811          nNewCell = cntNew[iPg] - iNew;
  7812        }
  7813  
  7814        rc = editPage(apNew[iPg], iOld, iNew, nNewCell, &b);
  7815        if( rc ) goto balance_cleanup;
  7816        abDone[iPg]++;
  7817        apNew[iPg]->nFree = usableSpace-szNew[iPg];
  7818        assert( apNew[iPg]->nOverflow==0 );
  7819        assert( apNew[iPg]->nCell==nNewCell );
  7820      }
  7821    }
  7822  
  7823    /* All pages have been processed exactly once */
  7824    assert( memcmp(abDone, "\01\01\01\01\01", nNew)==0 );
  7825  
  7826    assert( nOld>0 );
  7827    assert( nNew>0 );
  7828  
  7829    if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){
  7830      /* The root page of the b-tree now contains no cells. The only sibling
  7831      ** page is the right-child of the parent. Copy the contents of the
  7832      ** child page into the parent, decreasing the overall height of the
  7833      ** b-tree structure by one. This is described as the "balance-shallower"
  7834      ** sub-algorithm in some documentation.
  7835      **
  7836      ** If this is an auto-vacuum database, the call to copyNodeContent() 
  7837      ** sets all pointer-map entries corresponding to database image pages 
  7838      ** for which the pointer is stored within the content being copied.
  7839      **
  7840      ** It is critical that the child page be defragmented before being
  7841      ** copied into the parent, because if the parent is page 1 then it will
  7842      ** be smaller than the child due to the database header, and so all the
  7843      ** free space needs to be up front.
  7844      */
  7845      assert( nNew==1 || CORRUPT_DB );
  7846      rc = defragmentPage(apNew[0], -1);
  7847      testcase( rc!=SQLITE_OK );
  7848      assert( apNew[0]->nFree == 
  7849          (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2)
  7850        || rc!=SQLITE_OK
  7851      );
  7852      copyNodeContent(apNew[0], pParent, &rc);
  7853      freePage(apNew[0], &rc);
  7854    }else if( ISAUTOVACUUM && !leafCorrection ){
  7855      /* Fix the pointer map entries associated with the right-child of each
  7856      ** sibling page. All other pointer map entries have already been taken
  7857      ** care of.  */
  7858      for(i=0; i<nNew; i++){
  7859        u32 key = get4byte(&apNew[i]->aData[8]);
  7860        ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
  7861      }
  7862    }
  7863  
  7864    assert( pParent->isInit );
  7865    TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
  7866            nOld, nNew, b.nCell));
  7867  
  7868    /* Free any old pages that were not reused as new pages.
  7869    */
  7870    for(i=nNew; i<nOld; i++){
  7871      freePage(apOld[i], &rc);
  7872    }
  7873  
  7874  #if 0
  7875    if( ISAUTOVACUUM && rc==SQLITE_OK && apNew[0]->isInit ){
  7876      /* The ptrmapCheckPages() contains assert() statements that verify that
  7877      ** all pointer map pages are set correctly. This is helpful while 
  7878      ** debugging. This is usually disabled because a corrupt database may
  7879      ** cause an assert() statement to fail.  */
  7880      ptrmapCheckPages(apNew, nNew);
  7881      ptrmapCheckPages(&pParent, 1);
  7882    }
  7883  #endif
  7884  
  7885    /*
  7886    ** Cleanup before returning.
  7887    */
  7888  balance_cleanup:
  7889    sqlite3StackFree(0, b.apCell);
  7890    for(i=0; i<nOld; i++){
  7891      releasePage(apOld[i]);
  7892    }
  7893    for(i=0; i<nNew; i++){
  7894      releasePage(apNew[i]);
  7895    }
  7896  
  7897    return rc;
  7898  }
  7899  
  7900  
  7901  /*
  7902  ** This function is called when the root page of a b-tree structure is
  7903  ** overfull, i.e. it holds one or more overflow cells (pRoot->nOverflow>0).
  7904  **
  7905  ** A new child page is allocated and the contents of the current root
  7906  ** page, including overflow cells, are copied into the child. The root
  7907  ** page is then overwritten to make it an empty non-leaf page whose
  7908  ** right-child pointer refers to the new page.
  7909  **
  7910  ** Before returning, all pointer-map entries corresponding to pages
  7911  ** that the new child-page now contains pointers to are updated. The
  7912  ** entry corresponding to the new right-child pointer of the root
  7913  ** page is also updated (only when ISAUTOVACUUM is true).
  7914  **
  7915  ** If successful, *ppChild is set to contain a reference to the child
  7916  ** page and SQLITE_OK is returned. In this case the caller is required
  7917  ** to call releasePage() on *ppChild exactly once. If an error occurs,
  7918  ** an error code is returned and *ppChild is set to 0.
  7919  */
  7920  static int balance_deeper(MemPage *pRoot, MemPage **ppChild){
  7921    int rc;                        /* Return value from subprocedures */
  7922    MemPage *pChild = 0;           /* Pointer to a new child page */
  7923    Pgno pgnoChild = 0;            /* Page number of the new child page */
  7924    BtShared *pBt = pRoot->pBt;    /* Shared b-tree that owns pRoot */
  7925  
  7926    assert( pRoot->nOverflow>0 );
  7927    assert( sqlite3_mutex_held(pBt->mutex) );
  7928  
  7929    /* Make pRoot, the root page of the b-tree, writable. Allocate a new
  7930    ** page that will become the new right-child of pRoot and copy the node
  7931    ** stored on pRoot into it. rc is only tested after this block; presumably
  7932    ** the helpers that take &rc do nothing once it is set - TODO confirm. */
  7933    rc = sqlite3PagerWrite(pRoot->pDbPage);
  7934    if( rc==SQLITE_OK ){
  7935      rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0);
  7936      copyNodeContent(pRoot, pChild, &rc);
  7937      if( ISAUTOVACUUM ){
  7938        ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc);
  7939      }
  7940    }
  7941    if( rc ){
  7942      *ppChild = 0;
  7943      releasePage(pChild);
  7944      return rc;
  7945    }
  7946    assert( sqlite3PagerIswriteable(pChild->pDbPage) );
  7947    assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
  7948    assert( pChild->nCell==pRoot->nCell );
  7949  
  7950    TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno));
  7951  
  7952    /* Copy the overflow cells (aiOvfl[] and apOvfl[]) from pRoot to pChild */
  7953    memcpy(pChild->aiOvfl, pRoot->aiOvfl,
  7954           pRoot->nOverflow*sizeof(pRoot->aiOvfl[0]));
  7955    memcpy(pChild->apOvfl, pRoot->apOvfl,
  7956           pRoot->nOverflow*sizeof(pRoot->apOvfl[0]));
  7957    pChild->nOverflow = pRoot->nOverflow;
  7958  
  7959    /* Zero pRoot as a non-leaf page. Then install pChild as the right-child. */
  7960    zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF);
  7961    put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild);
  7962  
  7963    *ppChild = pChild;
  7964    return SQLITE_OK;
  7965  }
  7966  
  7967  /*
  7968  ** The page that pCur currently points to has just been modified in
  7969  ** some way. Beginning with that page, this function rebalances each page
  7970  ** that is overfull (nOverflow>0) or, for non-root pages, has more than 2/3
  7971  ** of the usable page size free, then moves on to its parent. It stops at
  7972  ** the first page needing no work or on the first error, and returns rc:
  7973  **   balance_quick()   - one overflow cell at the end of an intkey leaf
  7974  **   balance_deeper()  - overfull root page; contents move to a new child
  7975  **   balance_nonroot() - every other non-root page that needs balancing
  7976  */
  7977  static int balance(BtCursor *pCur){
  7978    int rc = SQLITE_OK;
  7979    const int nMin = pCur->pBt->usableSize * 2 / 3;
  7980    u8 aBalanceQuickSpace[13];
  7981    u8 *pFree = 0;
  7982  
  7983    VVA_ONLY( int balance_quick_called = 0 );
  7984    VVA_ONLY( int balance_deeper_called = 0 );
  7985  
  7986    do {
  7987      int iPage = pCur->iPage;
  7988      MemPage *pPage = pCur->pPage;
  7989  
  7990      if( iPage==0 ){
  7991        if( pPage->nOverflow ){
  7992          /* The root page of the b-tree is overfull. In this case call the
  7993          ** balance_deeper() function to create a new child for the root-page
  7994          ** and copy the current contents of the root-page to it. On success
  7995          ** the cursor is moved down onto that child (level 1), so the next
  7996          ** iteration of the do-loop will balance the child page. */
  7997          assert( balance_deeper_called==0 );
  7998          VVA_ONLY( balance_deeper_called++ );
  7999          rc = balance_deeper(pPage, &pCur->apPage[1]);
  8000          if( rc==SQLITE_OK ){
  8001            pCur->iPage = 1;
  8002            pCur->ix = 0;
  8003            pCur->aiIdx[0] = 0;
  8004            pCur->apPage[0] = pPage;
  8005            pCur->pPage = pCur->apPage[1];
  8006            assert( pCur->pPage->nOverflow );
  8007          }
  8008        }else{
  8009          break;
  8010        }
  8011      }else if( pPage->nOverflow==0 && pPage->nFree<=nMin ){
  8012        break;
  8013      }else{
  8014        MemPage * const pParent = pCur->apPage[iPage-1];
  8015        int const iIdx = pCur->aiIdx[iPage-1];
  8016  
  8017        rc = sqlite3PagerWrite(pParent->pDbPage);
  8018        if( rc==SQLITE_OK ){
  8019  #ifndef SQLITE_OMIT_QUICKBALANCE
  8020          if( pPage->intKeyLeaf
  8021           && pPage->nOverflow==1
  8022           && pPage->aiOvfl[0]==pPage->nCell
  8023           && pParent->pgno!=1
  8024           && pParent->nCell==iIdx
  8025          ){
  8026            /* Call balance_quick() to create a new sibling of pPage on which
  8027            ** to store the overflow cell. balance_quick() inserts a new cell
  8028            ** into pParent, which may cause pParent to overflow. If this
  8029            ** happens, the next iteration of the do-loop will balance pParent
  8030            ** using either balance_nonroot() or balance_deeper(). Until this
  8031            ** happens, the overflow cell is stored in the aBalanceQuickSpace[]
  8032            ** buffer.
  8033            **
  8034            ** The purpose of the following assert() is to check that only a
  8035            ** single call to balance_quick() is made for each call to this
  8036            ** function. If this were not verified, a subtle bug involving reuse
  8037            ** of the aBalanceQuickSpace[] might sneak in.
  8038            */
  8039            assert( balance_quick_called==0 ); 
  8040            VVA_ONLY( balance_quick_called++ );
  8041            rc = balance_quick(pParent, pPage, aBalanceQuickSpace);
  8042          }else
  8043  #endif
  8044          {
  8045            /* In this case, call balance_nonroot() to redistribute cells
  8046            ** between pPage and up to 2 of its sibling pages. This involves
  8047            ** modifying the contents of pParent, which may cause pParent to
  8048            ** become overfull or underfull. The next iteration of the do-loop
  8049            ** will balance the parent page to correct this.
  8050            **
  8051            ** If the parent page becomes overfull, the overflow cell or cells
  8052            ** are stored in the pSpace buffer allocated immediately below.
  8053            ** A subsequent iteration of the do-loop will deal with this by
  8054            ** calling balance_nonroot() (balance_deeper() may be called first,
  8055            ** but it doesn't deal with overflow cells - just moves them to a
  8056            ** different page). Once this subsequent call to balance_nonroot()
  8057            ** has completed, it is safe to release the pSpace buffer used by
  8058            ** the previous call, as the overflow cell data will have been
  8059            ** copied either into the body of a database page or into the new
  8060            ** pSpace buffer passed to the latter call to balance_nonroot().
  8061            */
  8062            u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);
  8063            rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1,
  8064                                 pCur->hints&BTREE_BULKLOAD);
  8065            if( pFree ){
  8066              /* If pFree is not NULL, it points to the pSpace buffer used
  8067              ** by a previous call to balance_nonroot(). Its contents are
  8068              ** now stored either on real database pages or within the
  8069              ** new pSpace buffer, so it may be safely freed here. */
  8070              sqlite3PageFree(pFree);
  8071            }
  8072  
  8073            /* The pSpace buffer will be freed after the next call to
  8074            ** balance_nonroot(), or just before this function returns, whichever
  8075            ** comes first. */
  8076            pFree = pSpace;
  8077          }
  8078        }
  8079  
  8080        pPage->nOverflow = 0;
  8081  
  8082        /* Release pPage and move the cursor up one level. The next iteration
  8083        ** of the do-loop balances the parent page. */
  8084        releasePage(pPage);
  8085        pCur->iPage--;
  8086        assert( pCur->iPage>=0 );
  8087        pCur->pPage = pCur->apPage[pCur->iPage];
  8088      }
  8089    }while( rc==SQLITE_OK );
  8090  
  8091    if( pFree ){
  8092      sqlite3PageFree(pFree);
  8093    }
  8094    return rc;
  8095  }
  8095  
  8096  
  8097  /*
  8098  ** Insert a new record into the BTree.  The content of the new record
  8099  ** is described by the pX object.  The pCur cursor is used only to
  8100  ** define what table the record should be inserted into, and is left
  8101  ** pointing at a random location.
  8102  **
  8103  ** For a table btree (used for rowid tables), only the pX.nKey value of
  8104  ** the key is used. The pX.pKey value must be NULL.  The pX.nKey is the
  8105  ** rowid or INTEGER PRIMARY KEY of the row.  The pX.nData,pData,nZero fields
  8106  ** hold the content of the row.
  8107  **
  8108  ** For an index btree (used for indexes and WITHOUT ROWID tables), the
  8109  ** key is an arbitrary byte sequence stored in pX.pKey,nKey.  The 
  8110  ** pX.pData,nData,nZero fields must be zero.
  8111  **
  8112  ** If the seekResult parameter is non-zero, then a successful call to
  8113  ** MovetoUnpacked() to seek cursor pCur to (pKey,nKey) has already
  8114  ** been performed.  In other words, if seekResult!=0 then the cursor
  8115  ** is currently pointing to a cell that will be adjacent to the cell
  8116  ** to be inserted.  If seekResult<0 then pCur points to a cell that is
  8117  ** smaller than (pKey,nKey).  If seekResult>0 then pCur points to a cell
  8118  ** that is larger than (pKey,nKey).
  8119  **
  8120  ** If seekResult==0, that means pCur is pointing at some unknown location.
  8121  ** In that case, this routine must seek the cursor to the correct insertion
  8122  ** point for (pKey,nKey) before doing the insertion.  For index btrees,
  8123  ** if pX->nMem is non-zero, then pX->aMem contains pointers to the unpacked
  8124  ** key values and pX->aMem can be used instead of pX->pKey to avoid having
  8125  ** to decode the key.
  8126  */
  8127  int sqlite3BtreeInsert(
  8128    BtCursor *pCur,                /* Insert data into the table of this cursor */
  8129    const BtreePayload *pX,        /* Content of the row to be inserted */
  8130    int flags,                     /* True if this is likely an append */
  8131    int seekResult                 /* Result of prior MovetoUnpacked() call */
  8132  ){
  8133    int rc;
  8134    int loc = seekResult;          /* -1: before desired location  +1: after */
  8135    int szNew = 0;
  8136    int idx;
  8137    MemPage *pPage;
  8138    Btree *p = pCur->pBtree;
  8139    BtShared *pBt = p->pBt;
  8140    unsigned char *oldCell;
  8141    unsigned char *newCell = 0;
  8142  
  8143    assert( (flags & (BTREE_SAVEPOSITION|BTREE_APPEND))==flags );
  8144  
  8145    if( pCur->eState==CURSOR_FAULT ){
  8146      assert( pCur->skipNext!=SQLITE_OK );
  8147      return pCur->skipNext;
  8148    }
  8149  
  8150    assert( cursorOwnsBtShared(pCur) );
  8151    assert( (pCur->curFlags & BTCF_WriteFlag)!=0
  8152                && pBt->inTransaction==TRANS_WRITE
  8153                && (pBt->btsFlags & BTS_READ_ONLY)==0 );
  8154    assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
  8155  
  8156    /* Assert that the caller has been consistent. If this cursor was opened
  8157    ** expecting an index b-tree, then the caller should be inserting blob
  8158    ** keys with no associated data. If the cursor was opened expecting an
  8159    ** intkey table, the caller should be inserting integer keys with a
  8160    ** blob of associated data.  */
  8161    assert( (pX->pKey==0)==(pCur->pKeyInfo==0) );
  8162  
  8163    /* Save the positions of any other cursors open on this table.
  8164    **
  8165    ** In some cases, the call to btreeMoveto() below is a no-op. For
  8166    ** example, when inserting data into a table with auto-generated integer
  8167    ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the 
  8168    ** integer key to use. It then calls this function to actually insert the 
  8169    ** data into the intkey B-Tree. In this case btreeMoveto() recognizes
  8170    ** that the cursor is already where it needs to be and returns without
  8171    ** doing any work. To avoid thwarting these optimizations, it is important
  8172    ** not to clear the cursor here.
  8173    */
  8174    if( pCur->curFlags & BTCF_Multiple ){
  8175      rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
  8176      if( rc ) return rc;
  8177    }
  8178  
  8179    if( pCur->pKeyInfo==0 ){
  8180      assert( pX->pKey==0 );
  8181      /* If this is an insert into a table b-tree, invalidate any incrblob 
  8182      ** cursors open on the row being replaced */
  8183      invalidateIncrblobCursors(p, pCur->pgnoRoot, pX->nKey, 0);
  8184  
  8185      /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing 
  8186      ** to a row with the same key as the new entry being inserted.  */
  8187      assert( (flags & BTREE_SAVEPOSITION)==0 || 
  8188              ((pCur->curFlags&BTCF_ValidNKey)!=0 && pX->nKey==pCur->info.nKey) );
  8189  
  8190      /* If the cursor is currently on the last row and we are appending a
  8191      ** new row onto the end, set the "loc" to avoid an unnecessary
  8192      ** btreeMoveto() call */
  8193      if( (pCur->curFlags&BTCF_ValidNKey)!=0 && pX->nKey==pCur->info.nKey ){
  8194        loc = 0;
  8195      }else if( loc==0 ){
  8196        rc = sqlite3BtreeMovetoUnpacked(pCur, 0, pX->nKey, flags!=0, &loc);
  8197        if( rc ) return rc;
  8198      }
  8199    }else if( loc==0 && (flags & BTREE_SAVEPOSITION)==0 ){
  8200      if( pX->nMem ){
  8201        UnpackedRecord r;
  8202        r.pKeyInfo = pCur->pKeyInfo;
  8203        r.aMem = pX->aMem;
  8204        r.nField = pX->nMem;
  8205        r.default_rc = 0;
  8206        r.errCode = 0;
  8207        r.r1 = 0;
  8208        r.r2 = 0;
  8209        r.eqSeen = 0;
  8210        rc = sqlite3BtreeMovetoUnpacked(pCur, &r, 0, flags!=0, &loc);
  8211      }else{
  8212        rc = btreeMoveto(pCur, pX->pKey, pX->nKey, flags!=0, &loc);
  8213      }
  8214      if( rc ) return rc;
  8215    }
  8216    assert( pCur->eState==CURSOR_VALID || (pCur->eState==CURSOR_INVALID && loc) );
  8217  
  8218    pPage = pCur->pPage;
  8219    assert( pPage->intKey || pX->nKey>=0 );
  8220    assert( pPage->leaf || !pPage->intKey );
  8221  
  8222    TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
  8223            pCur->pgnoRoot, pX->nKey, pX->nData, pPage->pgno,
  8224            loc==0 ? "overwrite" : "new entry"));
  8225    assert( pPage->isInit );
  8226    newCell = pBt->pTmpSpace;
  8227    assert( newCell!=0 );
  8228    rc = fillInCell(pPage, newCell, pX, &szNew);
  8229    if( rc ) goto end_insert;
  8230    assert( szNew==pPage->xCellSize(pPage, newCell) );
  8231    assert( szNew <= MX_CELL_SIZE(pBt) );
  8232    idx = pCur->ix;
  8233    if( loc==0 ){
  8234      CellInfo info;
  8235      assert( idx<pPage->nCell );
  8236      rc = sqlite3PagerWrite(pPage->pDbPage);
  8237      if( rc ){
  8238        goto end_insert;
  8239      }
  8240      oldCell = findCell(pPage, idx);
  8241      if( !pPage->leaf ){
  8242        memcpy(newCell, oldCell, 4);
  8243      }
  8244      rc = clearCell(pPage, oldCell, &info);
  8245      if( info.nSize==szNew && info.nLocal==info.nPayload 
  8246       && (!ISAUTOVACUUM || szNew<pPage->minLocal)
  8247      ){
  8248        /* Overwrite the old cell with the new if they are the same size.
  8249        ** We could also try to do this if the old cell is smaller, then add
  8250        ** the leftover space to the free list.  But experiments show that
  8251        ** doing that is no faster then skipping this optimization and just
  8252        ** calling dropCell() and insertCell(). 
  8253        **
  8254        ** This optimization cannot be used on an autovacuum database if the
  8255        ** new entry uses overflow pages, as the insertCell() call below is
  8256        ** necessary to add the PTRMAP_OVERFLOW1 pointer-map entry.  */
  8257        assert( rc==SQLITE_OK ); /* clearCell never fails when nLocal==nPayload */
  8258        if( oldCell+szNew > pPage->aDataEnd ) return SQLITE_CORRUPT_BKPT;
  8259        memcpy(oldCell, newCell, szNew);
  8260        return SQLITE_OK;
  8261      }
  8262      dropCell(pPage, idx, info.nSize, &rc);
  8263      if( rc ) goto end_insert;
  8264    }else if( loc<0 && pPage->nCell>0 ){
  8265      assert( pPage->leaf );
  8266      idx = ++pCur->ix;
  8267      pCur->curFlags &= ~BTCF_ValidNKey;
  8268    }else{
  8269      assert( pPage->leaf );
  8270    }
  8271    insertCell(pPage, idx, newCell, szNew, 0, 0, &rc);
  8272    assert( pPage->nOverflow==0 || rc==SQLITE_OK );
  8273    assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 );
  8274  
  8275    /* If no error has occurred and pPage has an overflow cell, call balance() 
  8276    ** to redistribute the cells within the tree. Since balance() may move
  8277    ** the cursor, zero the BtCursor.info.nSize and BTCF_ValidNKey
  8278    ** variables.
  8279    **
  8280    ** Previous versions of SQLite called moveToRoot() to move the cursor
  8281    ** back to the root page as balance() used to invalidate the contents
  8282    ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that,
  8283    ** set the cursor state to "invalid". This makes common insert operations
  8284    ** slightly faster.
  8285    **
  8286    ** There is a subtle but important optimization here too. When inserting
  8287    ** multiple records into an intkey b-tree using a single cursor (as can
  8288    ** happen while processing an "INSERT INTO ... SELECT" statement), it
  8289    ** is advantageous to leave the cursor pointing to the last entry in
  8290    ** the b-tree if possible. If the cursor is left pointing to the last
  8291    ** entry in the table, and the next row inserted has an integer key
  8292    ** larger than the largest existing key, it is possible to insert the
  8293    ** row without seeking the cursor. This can be a big performance boost.
  8294    */
  8295    pCur->info.nSize = 0;
  8296    if( pPage->nOverflow ){
  8297      assert( rc==SQLITE_OK );
  8298      pCur->curFlags &= ~(BTCF_ValidNKey);
  8299      rc = balance(pCur);
  8300  
  8301      /* Must make sure nOverflow is reset to zero even if the balance()
  8302      ** fails. Internal data structure corruption will result otherwise. 
  8303      ** Also, set the cursor state to invalid. This stops saveCursorPosition()
  8304      ** from trying to save the current position of the cursor.  */
  8305      pCur->pPage->nOverflow = 0;
  8306      pCur->eState = CURSOR_INVALID;
  8307      if( (flags & BTREE_SAVEPOSITION) && rc==SQLITE_OK ){
  8308        btreeReleaseAllCursorPages(pCur);
  8309        if( pCur->pKeyInfo ){
  8310          assert( pCur->pKey==0 );
  8311          pCur->pKey = sqlite3Malloc( pX->nKey );
  8312          if( pCur->pKey==0 ){
  8313            rc = SQLITE_NOMEM;
  8314          }else{
  8315            memcpy(pCur->pKey, pX->pKey, pX->nKey);
  8316          }
  8317        }
  8318        pCur->eState = CURSOR_REQUIRESEEK;
  8319        pCur->nKey = pX->nKey;
  8320      }
  8321    }
  8322    assert( pCur->iPage<0 || pCur->pPage->nOverflow==0 );
  8323  
  8324  end_insert:
  8325    return rc;
  8326  }
  8327  
  8328  /*
  8329  ** Delete the entry that the cursor is pointing to. 
  8330  **
  8331  ** If the BTREE_SAVEPOSITION bit of the flags parameter is zero, then
  8332  ** the cursor is left pointing at an arbitrary location after the delete.
  8333  ** But if that bit is set, then the cursor is left in a state such that
  8334  ** the next call to BtreeNext() or BtreePrev() moves it to the same row
  8335  ** as it would have been on if the call to BtreeDelete() had been omitted.
  8336  **
  8337  ** The BTREE_AUXDELETE bit of flags indicates that this is one of several
  8338  ** deletes associated with a single table entry and its indexes.  Only one
  8339  ** of those deletes is considered the "primary" delete.  The primary delete
  8340  ** occurs on a cursor that is not a BTREE_FORDELETE cursor.  All but one delete
  8341  ** operation on non-FORDELETE cursors is tagged with the AUXDELETE flag.
  8342  ** The BTREE_AUXDELETE bit is a hint that is not used by this implementation,
  8343  ** but which might be used by alternative storage engines.
        **
        ** As the assert() statements below spell out, the cursor must be in the
        ** CURSOR_VALID state on an existing cell, must be a write cursor, and
        ** the shared b-tree must be in a write transaction and not read-only.
        **
        ** Returns SQLITE_OK on success or an error code otherwise.
  8344  */
  8345  int sqlite3BtreeDelete(BtCursor *pCur, u8 flags){
  8346    Btree *p = pCur->pBtree;
  8347    BtShared *pBt = p->pBt;              
  8348    int rc;                              /* Return code */
  8349    MemPage *pPage;                      /* Page to delete cell from */
  8350    unsigned char *pCell;                /* Pointer to cell to delete */
  8351    int iCellIdx;                        /* Index of cell to delete */
  8352    int iCellDepth;                      /* Depth of node containing pCell */ 
  8353    CellInfo info;                       /* Size of the cell being deleted */
  8354    int bSkipnext = 0;                   /* Leaf cursor in SKIPNEXT state */
  8355    u8 bPreserve = flags & BTREE_SAVEPOSITION;  /* Keep cursor valid */
  8356  
  8357    assert( cursorOwnsBtShared(pCur) );
  8358    assert( pBt->inTransaction==TRANS_WRITE );
  8359    assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
  8360    assert( pCur->curFlags & BTCF_WriteFlag );
  8361    assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
  8362    assert( !hasReadConflicts(p, pCur->pgnoRoot) );
  8363    assert( pCur->ix<pCur->pPage->nCell );
  8364    assert( pCur->eState==CURSOR_VALID );
  8365    assert( (flags & ~(BTREE_SAVEPOSITION | BTREE_AUXDELETE))==0 );
  8366  
          /* Remember where the doomed cell lives; the cursor itself may be
          ** moved to a different page further down. */
  8367    iCellDepth = pCur->iPage;
  8368    iCellIdx = pCur->ix;
  8369    pPage = pCur->pPage;
  8370    pCell = findCell(pPage, iCellIdx);
  8371  
  8372    /* If the bPreserve flag is set to true, then the cursor position must
  8373    ** be preserved following this delete operation. If the current delete
  8374    ** will cause a b-tree rebalance, then this is done by saving the cursor
  8375    ** key and leaving the cursor in CURSOR_REQUIRESEEK state before 
  8376    ** returning. 
  8377    **
  8378    ** Or, if the current delete will not cause a rebalance, then the cursor
  8379    ** will be left in CURSOR_SKIPNEXT state pointing to the entry immediately
  8380    ** before or after the deleted entry. In this case set bSkipnext to true.  */
  8381    if( bPreserve ){
  8382      if( !pPage->leaf 
  8383       || (pPage->nFree+cellSizePtr(pPage,pCell)+2)>(int)(pBt->usableSize*2/3)
  8384      ){
  8385        /* A b-tree rebalance will be required after deleting this entry.
  8386        ** Save the cursor key.  */
  8387        rc = saveCursorKey(pCur);
  8388        if( rc ) return rc;
  8389      }else{
  8390        bSkipnext = 1;
  8391      }
  8392    }
  8393  
  8394    /* If the page containing the entry to delete is not a leaf page, move
  8395    ** the cursor to the largest entry in the tree that is smaller than
  8396    ** the entry being deleted. This cell will replace the cell being deleted
  8397    ** from the internal node. The 'previous' entry is used for this instead
  8398    ** of the 'next' entry, as the previous entry is always a part of the
  8399    ** sub-tree headed by the child page of the cell being deleted. This makes
  8400    ** balancing the tree following the delete operation easier.  */
  8401    if( !pPage->leaf ){
  8402      rc = sqlite3BtreePrevious(pCur, 0);
  8403      assert( rc!=SQLITE_DONE );
  8404      if( rc ) return rc;
  8405    }
  8406  
  8407    /* Save the positions of any other cursors open on this table before
  8408    ** making any modifications.  */
  8409    if( pCur->curFlags & BTCF_Multiple ){
  8410      rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
  8411      if( rc ) return rc;
  8412    }
  8413  
  8414    /* If this is a delete operation to remove a row from a table b-tree,
  8415    ** invalidate any incrblob cursors open on the row being deleted.  */
  8416    if( pCur->pKeyInfo==0 ){
  8417      invalidateIncrblobCursors(p, pCur->pgnoRoot, pCur->info.nKey, 0);
  8418    }
  8419  
  8420    /* Make the page containing the entry to be deleted writable. Then free any
  8421    ** overflow pages associated with the entry and finally remove the cell
  8422    ** itself from within the page.  */
  8423    rc = sqlite3PagerWrite(pPage->pDbPage);
  8424    if( rc ) return rc;
          /* NOTE(review): rc from clearCell() is not tested on its own; it is
          ** passed by reference to dropCell() and checked once afterwards.
          ** This presumably relies on dropCell() leaving a non-zero *pRC
          ** untouched - confirm. */
  8425    rc = clearCell(pPage, pCell, &info);
  8426    dropCell(pPage, iCellIdx, info.nSize, &rc);
  8427    if( rc ) return rc;
  8428  
  8429    /* If the cell deleted was not located on a leaf page, then the cursor
  8430    ** is currently pointing to the largest entry in the sub-tree headed
  8431    ** by the child-page of the cell that was just deleted from an internal
  8432    ** node. The cell from the leaf node needs to be moved to the internal
  8433    ** node to replace the deleted cell.  */
  8434    if( !pPage->leaf ){
  8435      MemPage *pLeaf = pCur->pPage;
  8436      int nCell;
  8437      Pgno n;
  8438      unsigned char *pTmp;
  8439  
            /* n is the page number of the page at depth iCellDepth+1 on the
            ** cursor's path: taken from the cursor's page stack when the leaf
            ** lies deeper than that, otherwise the leaf page itself. */
  8440      if( iCellDepth<pCur->iPage-1 ){
  8441        n = pCur->apPage[iCellDepth+1]->pgno;
  8442      }else{
  8443        n = pCur->pPage->pgno;
  8444      }
  8445      pCell = findCell(pLeaf, pLeaf->nCell-1);
            /* The cell passed to insertCell() below starts 4 bytes before
            ** pCell, so pCell must lie at least 4 bytes into the page. */
  8446      if( pCell<&pLeaf->aData[4] ) return SQLITE_CORRUPT_BKPT;
  8447      nCell = pLeaf->xCellSize(pLeaf, pCell);
  8448      assert( MX_CELL_SIZE(pBt) >= nCell );
  8449      pTmp = pBt->pTmpSpace;
  8450      assert( pTmp!=0 );
  8451      rc = sqlite3PagerWrite(pLeaf->pDbPage);
  8452      if( rc==SQLITE_OK ){
              /* NOTE(review): presumably insertCell() writes page number n
              ** into the 4 extra leading bytes as the child pointer of the
              ** replacement cell - confirm in insertCell(). */
  8453        insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc);
  8454      }
  8455      dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc);
  8456      if( rc ) return rc;
  8457    }
  8458  
  8459    /* Balance the tree. If the entry deleted was located on a leaf page,
  8460    ** then the cursor still points to that page. In this case the first
  8461    ** call to balance() repairs the tree, and the if(...) condition is
  8462    ** never true.
  8463    **
  8464    ** Otherwise, if the entry deleted was on an internal node page, then
  8465    ** pCur is pointing to the leaf page from which a cell was removed to
  8466    ** replace the cell deleted from the internal node. This is slightly
  8467    ** tricky as the leaf node may be underfull, and the internal node may
  8468    ** be either under or overfull. In this case run the balancing algorithm
  8469    ** on the leaf node first. If the balance proceeds far enough up the
  8470    ** tree that we can be sure that any problem in the internal node has
  8471    ** been corrected, so be it. Otherwise, after balancing the leaf node,
  8472    ** walk the cursor up the tree to the internal node and balance it as 
  8473    ** well.  */
  8474    rc = balance(pCur);
  8475    if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){
            /* Release pages until the cursor is back at depth iCellDepth. */
  8476      releasePageNotNull(pCur->pPage);
  8477      pCur->iPage--;
  8478      while( pCur->iPage>iCellDepth ){
  8479        releasePage(pCur->apPage[pCur->iPage--]);
  8480      }
  8481      pCur->pPage = pCur->apPage[pCur->iPage];
  8482      rc = balance(pCur);
  8483    }
  8484  
  8485    if( rc==SQLITE_OK ){
  8486      if( bSkipnext ){
              /* No rebalance was expected: leave the cursor on the same page in
              ** the CURSOR_SKIPNEXT state.  If the deleted cell was the last on
              ** the page, point at the new last cell with skipNext=-1;
              ** otherwise keep the index and use skipNext=1. */
  8487        assert( bPreserve && (pCur->iPage==iCellDepth || CORRUPT_DB) );
  8488        assert( pPage==pCur->pPage || CORRUPT_DB );
  8489        assert( (pPage->nCell>0 || CORRUPT_DB) && iCellIdx<=pPage->nCell );
  8490        pCur->eState = CURSOR_SKIPNEXT;
  8491        if( iCellIdx>=pPage->nCell ){
  8492          pCur->skipNext = -1;
  8493          pCur->ix = pPage->nCell-1;
  8494        }else{
  8495          pCur->skipNext = 1;
  8496        }
  8497      }else{
              /* Otherwise move the cursor to the root.  When the position must
              ** be preserved (the key was saved above), release the cursor's
              ** pages and mark it CURSOR_REQUIRESEEK.  SQLITE_EMPTY from
              ** moveToRoot() is not treated as an error. */
  8498        rc = moveToRoot(pCur);
  8499        if( bPreserve ){
  8500          btreeReleaseAllCursorPages(pCur);
  8501          pCur->eState = CURSOR_REQUIRESEEK;
  8502        }
  8503        if( rc==SQLITE_EMPTY ) rc = SQLITE_OK;
  8504      }
  8505    }
  8506    return rc;
  8507  }
  8508  
  8509  /*
  8510  ** Create a new BTree table.  Write into *piTable the page
  8511  ** number for the root page of the new table.
  8512  **
  8513  ** The type of table is determined by the flags parameter.  Only the
  8514  ** following values of flags are currently in use.  Other values for
  8515  ** flags might not work:
  8516  **
  8517  **     BTREE_INTKEY|BTREE_LEAFDATA     Used for SQL tables with rowid keys
  8518  **     BTREE_ZERODATA                  Used for SQL indices
        **
        ** Returns SQLITE_OK after the new root page has been zeroed and its
        ** page number stored in *piTable.  On any failure an error code is
        ** returned and *piTable is not written.
  8519  */
  8520  static int btreeCreateTable(Btree *p, int *piTable, int createTabFlags){
  8521    BtShared *pBt = p->pBt;
  8522    MemPage *pRoot;
  8523    Pgno pgnoRoot;
  8524    int rc;
  8525    int ptfFlags;          /* Page-type flags for the root page of new table */
  8526  
  8527    assert( sqlite3BtreeHoldsMutex(p) );
  8528    assert( pBt->inTransaction==TRANS_WRITE );
  8529    assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
  8530  
  8531  #ifdef SQLITE_OMIT_AUTOVACUUM
  8532    rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
  8533    if( rc ){
  8534      return rc;
  8535    }
  8536  #else
  8537    if( pBt->autoVacuum ){
  8538      Pgno pgnoMove;      /* Move a page here to make room for the root-page */
  8539      MemPage *pPageMove; /* The page to move to. */
  8540  
  8541      /* Creating a new table may probably require moving an existing database
  8542      ** page to make room for the new table's root page. In case this page
  8543      ** turns out to be an overflow page, delete all overflow page-map caches
  8544      ** held by open cursors.
  8545      */
  8546      invalidateAllOverflowCache(pBt);
  8547  
  8548      /* Read the value of meta[3] from the database to determine where the
  8549      ** root page of the new table should go. meta[3] is the largest root-page
  8550      ** created so far, so the new root-page is (meta[3]+1).
  8551      */
  8552      sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot);
  8553      pgnoRoot++;
  8554  
  8555      /* The new root-page may not be allocated on a pointer-map page, or the
  8556      ** PENDING_BYTE page.
  8557      */
  8558      while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
  8559          pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
  8560        pgnoRoot++;
  8561      }
  8562      assert( pgnoRoot>=3 || CORRUPT_DB );
  8563      testcase( pgnoRoot<3 );
  8564  
  8565      /* Allocate a page. The page that currently resides at pgnoRoot will
  8566      ** be moved to the allocated page (unless the allocated page happens
  8567      ** to reside at pgnoRoot).
  8568      */
  8569      rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, BTALLOC_EXACT);
  8570      if( rc!=SQLITE_OK ){
  8571        return rc;
  8572      }
  8573  
  8574      if( pgnoMove!=pgnoRoot ){
  8575        /* pgnoRoot is the page that will be used for the root-page of
  8576        ** the new table (assuming an error did not occur). But we were
  8577        ** allocated pgnoMove. If required (i.e. if it was not allocated
  8578        ** by extending the file), the current page at position pgnoMove
  8579        ** is already journaled.
  8580        */
  8581        u8 eType = 0;
  8582        Pgno iPtrPage = 0;
  8583  
  8584        /* Save the positions of any open cursors. This is required in
  8585        ** case they are holding a reference to an xFetch reference
  8586        ** corresponding to page pgnoRoot.  */
  8587        rc = saveAllCursors(pBt, 0, 0);
              /* The reference to the allocated page is dropped whether or not
              ** saving the cursors succeeded; only pgnoMove is used from here. */
  8588        releasePage(pPageMove);
  8589        if( rc!=SQLITE_OK ){
  8590          return rc;
  8591        }
  8592  
  8593        /* Move the page currently at pgnoRoot to pgnoMove. */
  8594        rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
  8595        if( rc!=SQLITE_OK ){
  8596          return rc;
  8597        }
  8598        rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
              /* The page being displaced must not be recorded as a root page
              ** or a free page; either is reported as corruption. */
  8599        if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
  8600          rc = SQLITE_CORRUPT_BKPT;
  8601        }
  8602        if( rc!=SQLITE_OK ){
  8603          releasePage(pRoot);
  8604          return rc;
  8605        }
  8606        assert( eType!=PTRMAP_ROOTPAGE );
  8607        assert( eType!=PTRMAP_FREEPAGE );
  8608        rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
  8609        releasePage(pRoot);
  8610  
  8611        /* Obtain the page at pgnoRoot */
  8612        if( rc!=SQLITE_OK ){
  8613          return rc;
  8614        }
  8615        rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
  8616        if( rc!=SQLITE_OK ){
  8617          return rc;
  8618        }
  8619        rc = sqlite3PagerWrite(pRoot->pDbPage);
  8620        if( rc!=SQLITE_OK ){
  8621          releasePage(pRoot);
  8622          return rc;
  8623        }
  8624      }else{
              /* The allocator returned exactly pgnoRoot: use that page as is. */
  8625        pRoot = pPageMove;
  8626      } 
  8627  
  8628      /* Update the pointer-map and meta-data with the new root-page number. */
  8629      ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc);
  8630      if( rc ){
  8631        releasePage(pRoot);
  8632        return rc;
  8633      }
  8634  
  8635      /* When the new root page was allocated, page 1 was made writable in
  8636      ** order either to increase the database filesize, or to decrement the
  8637      ** freelist count.  Hence, the sqlite3BtreeUpdateMeta() call cannot fail.
  8638      */
  8639      assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) );
            /* NOTE(review): the literal 4 is presumably the same meta slot that
            ** was read above as BTREE_LARGEST_ROOT_PAGE - confirm. */
  8640      rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
  8641      if( NEVER(rc) ){
  8642        releasePage(pRoot);
  8643        return rc;
  8644      }
  8645  
  8646    }else{
  8647      rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
  8648      if( rc ) return rc;
  8649    }
  8650  #endif
          /* pRoot/pgnoRoot now identify a writable page: initialize it as an
          ** empty leaf of the requested kind, then drop the page reference. */
  8651    assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
  8652    if( createTabFlags & BTREE_INTKEY ){
  8653      ptfFlags = PTF_INTKEY | PTF_LEAFDATA | PTF_LEAF;
  8654    }else{
  8655      ptfFlags = PTF_ZERODATA | PTF_LEAF;
  8656    }
  8657    zeroPage(pRoot, ptfFlags);
  8658    sqlite3PagerUnref(pRoot->pDbPage);
  8659    assert( (pBt->openFlags & BTREE_SINGLE)==0 || pgnoRoot==2 );
  8660    *piTable = (int)pgnoRoot;
  8661    return SQLITE_OK;
  8662  }
  8663  int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
          /* Externally visible form of btreeCreateTable(): the helper call is
          ** bracketed by sqlite3BtreeEnter()/sqlite3BtreeLeave() and its result
          ** code is handed back to the caller unchanged. */
  8664    int rcCreate;
  8665    sqlite3BtreeEnter(p);
  8666    rcCreate = btreeCreateTable(p, piTable, flags);
  8667    sqlite3BtreeLeave(p);
  8668    return rcCreate;
  8669  }
  8670  
  8671  /*
  8672  ** Erase the given database page and all its children.  If freePageFlag
  8673  ** is true the page is returned to the freelist; otherwise it is made
        ** writable and re-initialized in place as an empty leaf page.  Child
        ** pages are always cleared with freePageFlag set.
        **
        ** If pnChange is not NULL, the cell count of every leaf page visited
        ** is added to *pnChange.
        **
        ** pPage->bBusy is set while a page is being processed.  Finding it
        ** already set on entry, or being given a page number beyond the end of
        ** the database, is reported as SQLITE_CORRUPT.
        **
        ** Returns SQLITE_OK on success or an error code otherwise.
  8674  */
  8675  static int clearDatabasePage(
  8676    BtShared *pBt,           /* The BTree that contains the table */
  8677    Pgno pgno,               /* Page number to clear */
  8678    int freePageFlag,        /* Deallocate page if true */
  8679    int *pnChange            /* Add number of Cells freed to this counter */
  8680  ){
  8681    MemPage *pPage;
  8682    int rc;
  8683    unsigned char *pCell;
  8684    int i;
  8685    int hdr;
  8686    CellInfo info;
  8687  
  8688    assert( sqlite3_mutex_held(pBt->mutex) );
  8689    if( pgno>btreePagecount(pBt) ){
  8690      return SQLITE_CORRUPT_BKPT;
  8691    }
  8692    rc = getAndInitPage(pBt, pgno, &pPage, 0, 0);
  8693    if( rc ) return rc;
  8694    if( pPage->bBusy ){
  8695      rc = SQLITE_CORRUPT_BKPT;
  8696      goto cleardatabasepage_out;
  8697    }
  8698    pPage->bBusy = 1;
  8699    hdr = pPage->hdrOffset;
          /* For each cell: on a non-leaf page first clear the child page whose
          ** number is stored in the first 4 bytes of the cell, then release
          ** whatever clearCell() frees for the cell itself. */
  8700    for(i=0; i<pPage->nCell; i++){
  8701      pCell = findCell(pPage, i);
  8702      if( !pPage->leaf ){
  8703        rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange);
  8704        if( rc ) goto cleardatabasepage_out;
  8705      }
  8706      rc = clearCell(pPage, pCell, &info);
  8707      if( rc ) goto cleardatabasepage_out;
  8708    }
  8709    if( !pPage->leaf ){
            /* A non-leaf page has one more child, whose page number is stored
            ** at offset hdr+8 of the page. */
  8710      rc = clearDatabasePage(pBt, get4byte(&pPage->aData[hdr+8]), 1, pnChange);
  8711      if( rc ) goto cleardatabasepage_out;
  8712    }else if( pnChange ){
  8713      assert( pPage->intKey || CORRUPT_DB );
  8714      testcase( !pPage->intKey );
  8715      *pnChange += pPage->nCell;
  8716    }
  8717    if( freePageFlag ){
  8718      freePage(pPage, &rc);
  8719    }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
            /* Keep the page: reuse its existing flag byte with PTF_LEAF added. */
  8720      zeroPage(pPage, pPage->aData[hdr] | PTF_LEAF);
  8721    }
  8722  
  8723  cleardatabasepage_out:
          /* NOTE(review): this label is also reached when bBusy was already set
          ** on entry, so the flag set by the earlier invocation is cleared
          ** here as well. */
  8724    pPage->bBusy = 0;
  8725    releasePage(pPage);
  8726    return rc;
  8727  }
  8728  
  8729  /*
  8730  ** Remove every entry from the table whose root page is iTable.  The
  8731  ** root page itself is kept (it is cleared, not freed), so the table
  8732  ** still exists afterwards but is empty.
  8733  **
  8734  ** Before any page is touched, the positions of all cursors open on
  8735  ** the table are saved and any incrblob cursors on it are invalidated.
  8736  ** If saving the cursors fails, nothing is cleared and that error code
  8737  ** is returned.
  8738  **
  8739  ** If pnChange is not NULL, the integer it points to is incremented by
  8740  ** the number of entries removed; the table must then be an intkey table.
  8741  */
  8742  int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){
  8743    BtShared *pShared = p->pBt;
  8744    Pgno pgnoRoot = (Pgno)iTable;
  8745    int rcClear;
  8746  
  8747    sqlite3BtreeEnter(p);
  8748    assert( p->inTrans==TRANS_WRITE );
  8749  
  8750    rcClear = saveAllCursors(pShared, pgnoRoot, 0);
  8751    if( rcClear==SQLITE_OK ){
  8752      /* A no-op unless iTable is the root of a table b-tree. */
  8753      invalidateIncrblobCursors(p, pgnoRoot, 0, 1);
  8754      /* freePageFlag==0: the root page is emptied but not freed. */
  8755      rcClear = clearDatabasePage(pShared, pgnoRoot, 0, pnChange);
  8756    }
  8757    sqlite3BtreeLeave(p);
  8758    return rcClear;
  8759  }
  8760  
  8761  /*
  8762  ** Empty the table that cursor pCur is open on by clearing the table
  8763  ** rooted at pCur->pgnoRoot.  No change counter is requested.
  8764  ** This routine only works for pCur on an ephemeral table.
  8765  */
  8766  int sqlite3BtreeClearTableOfCursor(BtCursor *pCur){
          Btree *pOwner = pCur->pBtree;
          int iRoot = (int)pCur->pgnoRoot;
  8767    return sqlite3BtreeClearTable(pOwner, iRoot, 0);
  8768  }
  8769  
  8770  /*
  8771  ** Erase all information in a table and add the root of the table to
  8772  ** the freelist.  Except, the root of the principal table (the one on
  8773  ** page 1) is never added to the freelist.
  8774  **
  8775  ** This routine will fail with SQLITE_LOCKED if there are any open
  8776  ** cursors on the table.
  8777  **
  8778  ** If AUTOVACUUM is enabled and the page at iTable is not the last
  8779  ** root page in the database file, then the last root page 
  8780  ** in the database file is moved into the slot formerly occupied by
  8781  ** iTable and that last slot formerly occupied by the last root page
  8782  ** is added to the freelist instead of iTable.  In this way, all
  8783  ** root pages are kept at the beginning of the database file, which
  8784  ** is necessary for AUTOVACUUM to work right.  *piMoved is set to the 
  8785  ** page number that used to be the last root page in the file before
  8786  ** the move.  If no page gets moved, *piMoved is set to 0.
  8787  ** The last root page is recorded in meta[3] and the value of
  8788  ** meta[3] is updated by this procedure.
        **
        ** Returns SQLITE_OK on success or an error code otherwise.  *piMoved
        ** is only written once the table contents have been cleared
        ** successfully.
  8789  */
  8790  static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){
  8791    int rc;
  8792    MemPage *pPage = 0;
  8793    BtShared *pBt = p->pBt;
  8794  
  8795    assert( sqlite3BtreeHoldsMutex(p) );
  8796    assert( p->inTrans==TRANS_WRITE );
  8797    assert( iTable>=2 );
  8798  
          /* Take a reference to the root page, then empty the table.  The root
          ** page itself survives the clear and is disposed of below. */
  8799    rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
  8800    if( rc ) return rc;
  8801    rc = sqlite3BtreeClearTable(p, iTable, 0);
  8802    if( rc ){
  8803      releasePage(pPage);
  8804      return rc;
  8805    }
  8806  
  8807    *piMoved = 0;
  8808  
  8809  #ifdef SQLITE_OMIT_AUTOVACUUM
  8810    freePage(pPage, &rc);
  8811    releasePage(pPage);
  8812  #else
  8813    if( pBt->autoVacuum ){
  8814      Pgno maxRootPgno;
  8815      sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno);
  8816  
  8817      if( iTable==maxRootPgno ){
  8818        /* If the table being dropped is the table with the largest root-page
  8819        ** number in the database, put the root page on the free list. 
  8820        */
  8821        freePage(pPage, &rc);
  8822        releasePage(pPage);
  8823        if( rc!=SQLITE_OK ){
  8824          return rc;
  8825        }
  8826      }else{
  8827        /* The table being dropped does not have the largest root-page
  8828        ** number in the database. So move the page that does into the 
  8829        ** gap left by the deleted root-page.
  8830        */
  8831        MemPage *pMove;
  8832        releasePage(pPage);
  8833        rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
  8834        if( rc!=SQLITE_OK ){
  8835          return rc;
  8836        }
  8837        rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
  8838        releasePage(pMove);
  8839        if( rc!=SQLITE_OK ){
  8840          return rc;
  8841        }
              /* Fetch page maxRootPgno again and put it on the freelist.
              ** NOTE(review): pMove is reset to 0 first, so if btreeGetPage()
              ** fails this relies on freePage() doing nothing when rc is
              ** already non-zero and on releasePage() accepting a NULL page -
              ** confirm. */
  8842        pMove = 0;
  8843        rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
  8844        freePage(pMove, &rc);
  8845        releasePage(pMove);
  8846        if( rc!=SQLITE_OK ){
  8847          return rc;
  8848        }
  8849        *piMoved = maxRootPgno;
  8850      }
  8851  
  8852      /* Set the new 'max-root-page' value in the database header. This
  8853      ** is the old value less one, less one more if that happens to
  8854      ** be a pointer-map page, less one again if that is the
  8855      ** PENDING_BYTE_PAGE.
  8856      */
  8857      maxRootPgno--;
  8858      while( maxRootPgno==PENDING_BYTE_PAGE(pBt)
  8859             || PTRMAP_ISPAGE(pBt, maxRootPgno) ){
  8860        maxRootPgno--;
  8861      }
  8862      assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
  8863  
            /* NOTE(review): the literal 4 is presumably the same meta slot that
            ** was read above as BTREE_LARGEST_ROOT_PAGE - confirm. */
  8864      rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
  8865    }else{
  8866      freePage(pPage, &rc);
  8867      releasePage(pPage);
  8868    }
  8869  #endif
  8870    return rc;  
  8871  }
  8872  int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
  8873    int rc;
  8874    sqlite3BtreeEnter(p);
  8875    rc = btreeDropTable(p, iTable, piMoved);
  8876    sqlite3BtreeLeave(p);
  8877    return rc;
  8878  }
  8879  
  8880  
  8881  /*
  8882  ** This function may only be called if the b-tree connection already
  8883  ** has a read or write transaction open on the database.
  8884  **
  8885  ** Read the meta-information out of a database file.  Meta[0]
  8886  ** is the number of free pages currently in the database.  Meta[1]
  8887  ** through meta[15] are available for use by higher layers.  Meta[0]
  8888  ** is read-only, the others are read/write.
  8889  ** 
  8890  ** The schema layer numbers meta values differently.  At the schema
  8891  ** layer (and the SetCookie and ReadCookie opcodes) the number of
  8892  ** free pages is not visible.  So Cookie[0] is the same as Meta[1].
  8893  **
  8894  ** This routine treats Meta[BTREE_DATA_VERSION] as a special case.  Instead
  8895  ** of reading the value out of the header, it instead loads the "DataVersion"
  8896  ** from the pager.  The BTREE_DATA_VERSION value is not actually stored in the
  8897  ** database file.  It is a number computed by the pager.  But its access
  8898  ** pattern is the same as header meta values, and so it is convenient to
  8899  ** read it from this routine.
  8900  */
  8901  void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
  8902    BtShared *pBt = p->pBt;
  8903  
  8904    sqlite3BtreeEnter(p);
  8905    assert( p->inTrans>TRANS_NONE );
  8906    assert( SQLITE_OK==querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK) );
  8907    assert( pBt->pPage1 );
  8908    assert( idx>=0 && idx<=15 );
  8909  
  8910    if( idx==BTREE_DATA_VERSION ){
  8911      *pMeta = sqlite3PagerDataVersion(pBt->pPager) + p->iDataVersion;
  8912    }else{
  8913      *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]);
  8914    }
  8915  
  8916    /* If auto-vacuum is disabled in this build and this is an auto-vacuum
  8917    ** database, mark the database as read-only.  */
  8918  #ifdef SQLITE_OMIT_AUTOVACUUM
  8919    if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ){
  8920      pBt->btsFlags |= BTS_READ_ONLY;
  8921    }
  8922  #endif
  8923  
  8924    sqlite3BtreeLeave(p);
  8925  }
  8926  
  8927  /*
  8928  ** Write meta-information back into the database.  Meta[0] is
  8929  ** read-only and may not be written.
  8930  */
  8931  int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
  8932    BtShared *pBt = p->pBt;
  8933    unsigned char *pP1;
  8934    int rc;
  8935    assert( idx>=1 && idx<=15 );
  8936    sqlite3BtreeEnter(p);
  8937    assert( p->inTrans==TRANS_WRITE );
  8938    assert( pBt->pPage1!=0 );
  8939    pP1 = pBt->pPage1->aData;
  8940    rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
  8941    if( rc==SQLITE_OK ){
  8942      put4byte(&pP1[36 + idx*4], iMeta);
  8943  #ifndef SQLITE_OMIT_AUTOVACUUM
  8944      if( idx==BTREE_INCR_VACUUM ){
  8945        assert( pBt->autoVacuum || iMeta==0 );
  8946        assert( iMeta==0 || iMeta==1 );
  8947        pBt->incrVacuum = (u8)iMeta;
  8948      }
  8949  #endif
  8950    }
  8951    sqlite3BtreeLeave(p);
  8952    return rc;
  8953  }
  8954  
  8955  #ifndef SQLITE_OMIT_BTREECOUNT
  8956  /*
  8957  ** The first argument, pCur, is a cursor opened on some b-tree. Count the
  8958  ** number of entries in the b-tree and write the result to *pnEntry.
  8959  **
  8960  ** SQLITE_OK is returned if the operation is successfully executed. 
  8961  ** Otherwise, if an error is encountered (i.e. an IO error or database
  8962  ** corruption) an SQLite error code is returned.
  8963  */
  8964  int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry){
  8965    i64 nEntry = 0;                      /* Value to return in *pnEntry */
  8966    int rc;                              /* Return code */
  8967  
  8968    rc = moveToRoot(pCur);
  8969    if( rc==SQLITE_EMPTY ){
  8970      *pnEntry = 0;
  8971      return SQLITE_OK;
  8972    }
  8973  
  8974    /* Unless an error occurs, the following loop runs one iteration for each
  8975    ** page in the B-Tree structure (not including overflow pages). 
  8976    */
  8977    while( rc==SQLITE_OK ){
  8978      int iIdx;                          /* Index of child node in parent */
  8979      MemPage *pPage;                    /* Current page of the b-tree */
  8980  
  8981      /* If this is a leaf page or the tree is not an int-key tree, then 
  8982      ** this page contains countable entries. Increment the entry counter
  8983      ** accordingly.
  8984      */
  8985      pPage = pCur->pPage;
  8986      if( pPage->leaf || !pPage->intKey ){
  8987        nEntry += pPage->nCell;
  8988      }
  8989  
  8990      /* pPage is a leaf node. This loop navigates the cursor so that it 
  8991      ** points to the first interior cell that it points to the parent of
  8992      ** the next page in the tree that has not yet been visited. The
  8993      ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell
  8994      ** of the page, or to the number of cells in the page if the next page
  8995      ** to visit is the right-child of its parent.
  8996      **
  8997      ** If all pages in the tree have been visited, return SQLITE_OK to the
  8998      ** caller.
  8999      */
  9000      if( pPage->leaf ){
  9001        do {
  9002          if( pCur->iPage==0 ){
  9003            /* All pages of the b-tree have been visited. Return successfully. */
  9004            *pnEntry = nEntry;
  9005            return moveToRoot(pCur);
  9006          }
  9007          moveToParent(pCur);
  9008        }while ( pCur->ix>=pCur->pPage->nCell );
  9009  
  9010        pCur->ix++;
  9011        pPage = pCur->pPage;
  9012      }
  9013  
  9014      /* Descend to the child node of the cell that the cursor currently 
  9015      ** points at. This is the right-child if (iIdx==pPage->nCell).
  9016      */
  9017      iIdx = pCur->ix;
  9018      if( iIdx==pPage->nCell ){
  9019        rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
  9020      }else{
  9021        rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx)));
  9022      }
  9023    }
  9024  
  9025    /* An error has occurred. Return an error code. */
  9026    return rc;
  9027  }
  9028  #endif
  9029  
  9030  /*
  9031  ** Return the pager associated with a BTree.  This routine is used for
  9032  ** testing and debugging only.
  9033  */
  9034  Pager *sqlite3BtreePager(Btree *p){
  9035    return p->pBt->pPager;
  9036  }
  9037  
  9038  #ifndef SQLITE_OMIT_INTEGRITY_CHECK
  9039  /*
  9040  ** Append a message to the error message string.
  9041  */
  9042  static void checkAppendMsg(
  9043    IntegrityCk *pCheck,
  9044    const char *zFormat,
  9045    ...
  9046  ){
  9047    va_list ap;
  9048    if( !pCheck->mxErr ) return;
  9049    pCheck->mxErr--;
  9050    pCheck->nErr++;
  9051    va_start(ap, zFormat);
  9052    if( pCheck->errMsg.nChar ){
  9053      sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);
  9054    }
  9055    if( pCheck->zPfx ){
  9056      sqlite3XPrintf(&pCheck->errMsg, pCheck->zPfx, pCheck->v1, pCheck->v2);
  9057    }
  9058    sqlite3VXPrintf(&pCheck->errMsg, zFormat, ap);
  9059    va_end(ap);
  9060    if( pCheck->errMsg.accError==STRACCUM_NOMEM ){
  9061      pCheck->mallocFailed = 1;
  9062    }
  9063  }
  9064  #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
  9065  
  9066  #ifndef SQLITE_OMIT_INTEGRITY_CHECK
  9067  
  9068  /*
  9069  ** Return non-zero if the bit in the IntegrityCk.aPgRef[] array that
  9070  ** corresponds to page iPg is already set.
  9071  */
  9072  static int getPageReferenced(IntegrityCk *pCheck, Pgno iPg){
  9073    assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
  9074    return (pCheck->aPgRef[iPg/8] & (1 << (iPg & 0x07)));
  9075  }
  9076  
  9077  /*
  9078  ** Set the bit in the IntegrityCk.aPgRef[] array that corresponds to page iPg.
  9079  */
  9080  static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){
  9081    assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
  9082    pCheck->aPgRef[iPg/8] |= (1 << (iPg & 0x07));
  9083  }
  9084  
  9085  
  9086  /*
  9087  ** Add 1 to the reference count for page iPage.  If this is the second
  9088  ** reference to the page, add an error message to pCheck->zErrMsg.
  9089  ** Return 1 if there are 2 or more references to the page and 0 if
  9090  ** if this is the first reference to the page.
  9091  **
  9092  ** Also check that the page number is in bounds.
  9093  */
  9094  static int checkRef(IntegrityCk *pCheck, Pgno iPage){
  9095    if( iPage==0 ) return 1;
  9096    if( iPage>pCheck->nPage ){
  9097      checkAppendMsg(pCheck, "invalid page number %d", iPage);
  9098      return 1;
  9099    }
  9100    if( getPageReferenced(pCheck, iPage) ){
  9101      checkAppendMsg(pCheck, "2nd reference to page %d", iPage);
  9102      return 1;
  9103    }
  9104    setPageReferenced(pCheck, iPage);
  9105    return 0;
  9106  }
  9107  
  9108  #ifndef SQLITE_OMIT_AUTOVACUUM
  9109  /*
  9110  ** Check that the entry in the pointer-map for page iChild maps to 
  9111  ** page iParent, pointer type ptrType. If not, append an error message
  9112  ** to pCheck.
  9113  */
  9114  static void checkPtrmap(
  9115    IntegrityCk *pCheck,   /* Integrity check context */
  9116    Pgno iChild,           /* Child page number */
  9117    u8 eType,              /* Expected pointer map type */
  9118    Pgno iParent           /* Expected pointer map parent page number */
  9119  ){
  9120    int rc;
  9121    u8 ePtrmapType;
  9122    Pgno iPtrmapParent;
  9123  
  9124    rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
  9125    if( rc!=SQLITE_OK ){
  9126      if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1;
  9127      checkAppendMsg(pCheck, "Failed to read ptrmap key=%d", iChild);
  9128      return;
  9129    }
  9130  
  9131    if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
  9132      checkAppendMsg(pCheck,
  9133        "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)", 
  9134        iChild, eType, iParent, ePtrmapType, iPtrmapParent);
  9135    }
  9136  }
  9137  #endif
  9138  
  9139  /*
  9140  ** Check the integrity of the freelist or of an overflow page list.
  9141  ** Verify that the number of pages on the list is N.
  9142  */
  9143  static void checkList(
  9144    IntegrityCk *pCheck,  /* Integrity checking context */
  9145    int isFreeList,       /* True for a freelist.  False for overflow page list */
  9146    int iPage,            /* Page number for first page in the list */
  9147    int N                 /* Expected number of pages in the list */
  9148  ){
  9149    int i;
  9150    int expected = N;
  9151    int iFirst = iPage;
  9152    while( N-- > 0 && pCheck->mxErr ){
  9153      DbPage *pOvflPage;
  9154      unsigned char *pOvflData;
  9155      if( iPage<1 ){
  9156        checkAppendMsg(pCheck,
  9157           "%d of %d pages missing from overflow list starting at %d",
  9158            N+1, expected, iFirst);
  9159        break;
  9160      }
  9161      if( checkRef(pCheck, iPage) ) break;
  9162      if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage, 0) ){
  9163        checkAppendMsg(pCheck, "failed to get page %d", iPage);
  9164        break;
  9165      }
  9166      pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
  9167      if( isFreeList ){
  9168        int n = get4byte(&pOvflData[4]);
  9169  #ifndef SQLITE_OMIT_AUTOVACUUM
  9170        if( pCheck->pBt->autoVacuum ){
  9171          checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0);
  9172        }
  9173  #endif
  9174        if( n>(int)pCheck->pBt->usableSize/4-2 ){
  9175          checkAppendMsg(pCheck,
  9176             "freelist leaf count too big on page %d", iPage);
  9177          N--;
  9178        }else{
  9179          for(i=0; i<n; i++){
  9180            Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
  9181  #ifndef SQLITE_OMIT_AUTOVACUUM
  9182            if( pCheck->pBt->autoVacuum ){
  9183              checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0);
  9184            }
  9185  #endif
  9186            checkRef(pCheck, iFreePage);
  9187          }
  9188          N -= n;
  9189        }
  9190      }
  9191  #ifndef SQLITE_OMIT_AUTOVACUUM
  9192      else{
  9193        /* If this database supports auto-vacuum and iPage is not the last
  9194        ** page in this overflow list, check that the pointer-map entry for
  9195        ** the following page matches iPage.
  9196        */
  9197        if( pCheck->pBt->autoVacuum && N>0 ){
  9198          i = get4byte(pOvflData);
  9199          checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage);
  9200        }
  9201      }
  9202  #endif
  9203      iPage = get4byte(pOvflData);
  9204      sqlite3PagerUnref(pOvflPage);
  9205  
  9206      if( isFreeList && N<(iPage!=0) ){
  9207        checkAppendMsg(pCheck, "free-page count in header is too small");
  9208      }
  9209    }
  9210  }
  9211  #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
  9212  
  9213  /*
  9214  ** An implementation of a min-heap.
  9215  **
  9216  ** aHeap[0] is the number of elements on the heap.  aHeap[1] is the
  9217  ** root element.  The daughter nodes of aHeap[N] are aHeap[N*2]
  9218  ** and aHeap[N*2+1].
  9219  **
  9220  ** The heap property is this:  Every node is less than or equal to both
  9221  ** of its daughter nodes.  A consequence of the heap property is that the
  9222  ** root node aHeap[1] is always the minimum value currently in the heap.
  9223  **
  9224  ** The btreeHeapInsert() routine inserts an unsigned 32-bit number onto
  9225  ** the heap, preserving the heap property.  The btreeHeapPull() routine
  9226  ** removes the root element from the heap (the minimum value in the heap)
  9227  ** and then moves other nodes around as necessary to preserve the heap
  9228  ** property.
  9229  **
  9230  ** This heap is used for cell overlap and coverage testing.  Each u32
  9231  ** entry represents the span of a cell or freeblock on a btree page.  
  9232  ** The upper 16 bits are the index of the first byte of a range and the
  9233  ** lower 16 bits are the index of the last byte of that range.
  9234  */
  9235  static void btreeHeapInsert(u32 *aHeap, u32 x){
  9236    u32 j, i = ++aHeap[0];
  9237    aHeap[i] = x;
  9238    while( (j = i/2)>0 && aHeap[j]>aHeap[i] ){
  9239      x = aHeap[j];
  9240      aHeap[j] = aHeap[i];
  9241      aHeap[i] = x;
  9242      i = j;
  9243    }
  9244  }
  9245  static int btreeHeapPull(u32 *aHeap, u32 *pOut){
  9246    u32 j, i, x;
  9247    if( (x = aHeap[0])==0 ) return 0;
  9248    *pOut = aHeap[1];
  9249    aHeap[1] = aHeap[x];
  9250    aHeap[x] = 0xffffffff;
  9251    aHeap[0]--;
  9252    i = 1;
  9253    while( (j = i*2)<=aHeap[0] ){
  9254      if( aHeap[j]>aHeap[j+1] ) j++;
  9255      if( aHeap[i]<aHeap[j] ) break;
  9256      x = aHeap[i];
  9257      aHeap[i] = aHeap[j];
  9258      aHeap[j] = x;
  9259      i = j;
  9260    }
  9261    return 1;  
  9262  }
  9263  
  9264  #ifndef SQLITE_OMIT_INTEGRITY_CHECK
/*
** Do various sanity checks on a single page of a tree and, recursively,
** on every page reachable from it.
**
** Return value: the height of the checked subtree counted from the
** leaves.  A leaf page returns 0, the parent of leaf pages returns 1,
** and so forth.  0 is also returned when iPage is zero, when the page
** has already been referenced or is out of range (checkRef() fails), or
** when the page cannot be loaded or initialized.
**
** *piMinKey is set to the smallest integer key seen on the subtree, or
** left at the incoming maxKey if the page holds no integer keys.  It is
** not written at all on the early-exit paths described above.
**
** These checks are done:
**
**      1.  Make sure that cells and freeblocks do not overlap
**          but combine to completely cover the page.
**      2.  Make sure integer cell keys are in order.
**      3.  Check the integrity of overflow pages.
**      4.  Recursively call checkTreePage on all children.
**      5.  Verify that the depth of all children is the same.
**
** Problems are reported through checkAppendMsg().  The message prefix
** fields of pCheck (zPfx, v1, v2) are modified while the page is being
** checked and restored to their incoming values before returning.
*/
static int checkTreePage(
  IntegrityCk *pCheck,  /* Context for the sanity check */
  int iPage,            /* Page number of the page to check */
  i64 *piMinKey,        /* Write minimum integer primary key here */
  i64 maxKey            /* Error if integer primary key greater than this */
){
  MemPage *pPage = 0;      /* The page being analyzed */
  int i;                   /* Loop counter */
  int rc;                  /* Result code from subroutine call */
  int depth = -1, d2;      /* Depth of a subtree */
  int pgno;                /* Page number */
  int nFrag;               /* Number of fragmented bytes on the page */
  int hdr;                 /* Offset to the page header */
  int cellStart;           /* Offset to the start of the cell pointer array */
  int nCell;               /* Number of cells */
  int doCoverageCheck = 1; /* True if cell coverage checking should be done */
  int keyCanBeEqual = 1;   /* True if IPK can be equal to maxKey
                           ** False if IPK must be strictly less than maxKey */
  u8 *data;                /* Page content */
  u8 *pCell;               /* Cell content */
  u8 *pCellIdx;            /* Next element of the cell pointer array */
  BtShared *pBt;           /* The BtShared object that owns pPage */
  u32 pc;                  /* Address of a cell */
  u32 usableSize;          /* Usable size of the page */
  u32 contentOffset;       /* Offset to the start of the cell content area */
  u32 *heap = 0;           /* Min-heap used for checking cell coverage */
  u32 x, prev = 0;         /* Next and previous entry on the min-heap */
  const char *saved_zPfx = pCheck->zPfx;  /* Prefix state to restore on exit */
  int saved_v1 = pCheck->v1;
  int saved_v2 = pCheck->v2;
  u8 savedIsInit = 0;      /* pPage->isInit as found, before it is cleared */

  /* Check that the page exists
  */
  pBt = pCheck->pBt;
  usableSize = pBt->usableSize;
  if( iPage==0 ) return 0;
  if( checkRef(pCheck, iPage) ) return 0;
  pCheck->zPfx = "Page %d: ";
  pCheck->v1 = iPage;
  if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
    checkAppendMsg(pCheck,
       "unable to get the page. error code=%d", rc);
    goto end_of_check;
  }

  /* Clear MemPage.isInit to make sure the corruption detection code in
  ** btreeInitPage() is executed.  */
  savedIsInit = pPage->isInit;
  pPage->isInit = 0;
  if( (rc = btreeInitPage(pPage))!=0 ){
    assert( rc==SQLITE_CORRUPT );  /* The only possible error from InitPage */
    checkAppendMsg(pCheck,
                   "btreeInitPage() returns error code %d", rc);
    goto end_of_check;
  }
  data = pPage->aData;
  hdr = pPage->hdrOffset;

  /* Set up for cell analysis */
  pCheck->zPfx = "On tree page %d cell %d: ";
  contentOffset = get2byteNotZero(&data[hdr+5]);
  assert( contentOffset<=usableSize );  /* Enforced by btreeInitPage() */

  /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
  ** number of cells on the page. */
  nCell = get2byte(&data[hdr+3]);
  assert( pPage->nCell==nCell );

  /* EVIDENCE-OF: R-23882-45353 The cell pointer array of a b-tree page
  ** immediately follows the b-tree page header.  The header occupies 12
  ** bytes on an interior page and 8 bytes on a leaf page. */
  cellStart = hdr + 12 - 4*pPage->leaf;
  assert( pPage->aCellIdx==&data[cellStart] );
  /* Start at the last cell pointer; the cell loop below walks backwards */
  pCellIdx = &data[cellStart + 2*(nCell-1)];

  if( !pPage->leaf ){
    /* Analyze the right-child page of internal pages */
    pgno = get4byte(&data[hdr+8]);
#ifndef SQLITE_OMIT_AUTOVACUUM
    if( pBt->autoVacuum ){
      pCheck->zPfx = "On page %d at right child: ";
      checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
    }
#endif
    /* The recursive call lowers maxKey to the smallest key found in the
    ** right subtree; every key on this page must be strictly below that. */
    depth = checkTreePage(pCheck, pgno, &maxKey, maxKey);
    keyCanBeEqual = 0;
  }else{
    /* For leaf pages, the coverage check will occur in the same loop
    ** as the other cell checks, so initialize the heap.  */
    heap = pCheck->heap;
    heap[0] = 0;
  }

  /* EVIDENCE-OF: R-02776-14802 The cell pointer array consists of K 2-byte
  ** integer offsets to the cell contents.  The cells are visited from the
  ** last to the first, so integer keys are expected in descending order. */
  for(i=nCell-1; i>=0 && pCheck->mxErr; i--){
    CellInfo info;

    /* Check cell size */
    pCheck->v2 = i;
    assert( pCellIdx==&data[cellStart + i*2] );
    pc = get2byteAligned(pCellIdx);
    pCellIdx -= 2;
    if( pc<contentOffset || pc>usableSize-4 ){
      checkAppendMsg(pCheck, "Offset %d out of range %d..%d",
                             pc, contentOffset, usableSize-4);
      doCoverageCheck = 0;
      continue;
    }
    pCell = &data[pc];
    pPage->xParseCell(pPage, pCell, &info);
    if( pc+info.nSize>usableSize ){
      checkAppendMsg(pCheck, "Extends off end of page");
      doCoverageCheck = 0;
      continue;
    }

    /* Check for integer primary key out of range */
    if( pPage->intKey ){
      if( keyCanBeEqual ? (info.nKey > maxKey) : (info.nKey >= maxKey) ){
        checkAppendMsg(pCheck, "Rowid %lld out of order", info.nKey);
      }
      maxKey = info.nKey;
      keyCanBeEqual = 0;     /* Only the first key on the page may ==maxKey */
    }

    /* Check the content overflow list */
    if( info.nPayload>info.nLocal ){
      int nPage;       /* Number of pages on the overflow chain */
      Pgno pgnoOvfl;   /* First page of the overflow chain */
      assert( pc + info.nSize - 4 <= usableSize );
      /* Each overflow page carries usableSize-4 bytes of payload; round up */
      nPage = (info.nPayload - info.nLocal + usableSize - 5)/(usableSize - 4);
      /* The final 4 bytes of the cell hold the first overflow page number */
      pgnoOvfl = get4byte(&pCell[info.nSize - 4]);
#ifndef SQLITE_OMIT_AUTOVACUUM
      if( pBt->autoVacuum ){
        checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage);
      }
#endif
      checkList(pCheck, 0, pgnoOvfl, nPage);
    }

    if( !pPage->leaf ){
      /* Check sanity of left child page for internal pages */
      pgno = get4byte(pCell);
#ifndef SQLITE_OMIT_AUTOVACUUM
      if( pBt->autoVacuum ){
        checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
      }
#endif
      d2 = checkTreePage(pCheck, pgno, &maxKey, maxKey);
      keyCanBeEqual = 0;
      if( d2!=depth ){
        checkAppendMsg(pCheck, "Child page depth differs");
        depth = d2;
      }
    }else{
      /* Populate the coverage-checking heap for leaf pages */
      btreeHeapInsert(heap, (pc<<16)|(pc+info.nSize-1));
    }
  }
  *piMinKey = maxKey;

  /* Check for complete coverage of the page
  */
  pCheck->zPfx = 0;
  if( doCoverageCheck && pCheck->mxErr>0 ){
    /* For leaf pages, the min-heap has already been initialized and the
    ** cells have already been inserted.  But for internal pages, that has
    ** not yet been done, so do it now */
    if( !pPage->leaf ){
      heap = pCheck->heap;
      heap[0] = 0;
      for(i=nCell-1; i>=0; i--){
        u32 size;
        pc = get2byteAligned(&data[cellStart+i*2]);
        size = pPage->xCellSize(pPage, &data[pc]);
        btreeHeapInsert(heap, (pc<<16)|(pc+size-1));
      }
    }
    /* Add the freeblocks to the min-heap
    **
    ** EVIDENCE-OF: R-20690-50594 The second field of the b-tree page header
    ** is the offset of the first freeblock, or zero if there are no
    ** freeblocks on the page.
    */
    i = get2byte(&data[hdr+1]);
    while( i>0 ){
      int size, j;
      assert( (u32)i<=usableSize-4 );     /* Enforced by btreeInitPage() */
      /* Bytes 2 and 3 of a freeblock give its size */
      size = get2byte(&data[i+2]);
      assert( (u32)(i+size)<=usableSize );  /* Enforced by btreeInitPage() */
      btreeHeapInsert(heap, (((u32)i)<<16)|(i+size-1));
      /* EVIDENCE-OF: R-58208-19414 The first 2 bytes of a freeblock are a
      ** big-endian integer which is the offset in the b-tree page of the next
      ** freeblock in the chain, or zero if the freeblock is the last on the
      ** chain. */
      j = get2byte(&data[i]);
      /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of
      ** increasing offset. */
      assert( j==0 || j>i+size );  /* Enforced by btreeInitPage() */
      assert( (u32)j<=usableSize-4 );   /* Enforced by btreeInitPage() */
      i = j;
    }
    /* Analyze the min-heap looking for overlap between cells and/or
    ** freeblocks, and counting the number of untracked bytes in nFrag.
    **
    ** Each min-heap entry is of the form:    (start_address<<16)|end_address.
    ** There is an implied first entry that covers the page header, the cell
    ** pointer index, and the gap between the cell pointer index and the start
    ** of cell content.
    **
    ** The loop below pulls entries from the min-heap in order and compares
    ** the start_address against the previous end_address.  If there is an
    ** overlap, that means bytes are used multiple times.  If there is a gap,
    ** that gap is added to the fragmentation count.
    */
    nFrag = 0;
    prev = contentOffset - 1;   /* Implied first min-heap entry */
    while( btreeHeapPull(heap,&x) ){
      if( (prev&0xffff)>=(x>>16) ){
        checkAppendMsg(pCheck,
          "Multiple uses for byte %u of page %d", x>>16, iPage);
        break;
      }else{
        nFrag += (x>>16) - (prev&0xffff) - 1;
        prev = x;
      }
    }
    /* Count the bytes between the last span and the end of the usable area */
    nFrag += usableSize - (prev&0xffff) - 1;
    /* EVIDENCE-OF: R-43263-13491 The total number of bytes in all fragments
    ** is stored in the fifth field of the b-tree page header.
    ** EVIDENCE-OF: R-07161-27322 The one-byte integer at offset 7 gives the
    ** number of fragmented free bytes within the cell content area.
    **
    ** The comparison is skipped when the heap is not empty, which happens
    ** only when the loop above stopped early on an overlap.
    */
    if( heap[0]==0 && nFrag!=data[hdr+7] ){
      checkAppendMsg(pCheck,
          "Fragmentation of %d bytes reported as %d on page %d",
          nFrag, data[hdr+7], iPage);
    }
  }

end_of_check:
  /* doCoverageCheck is only ever cleared after the page has been loaded,
  ** so pPage is non-NULL whenever the isInit flag is restored here. */
  if( !doCoverageCheck ) pPage->isInit = savedIsInit;
  releasePage(pPage);
  pCheck->zPfx = saved_zPfx;
  pCheck->v1 = saved_v1;
  pCheck->v2 = saved_v2;
  return depth+1;
}
  9528  #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
  9529  
  9530  #ifndef SQLITE_OMIT_INTEGRITY_CHECK
  9531  /*
  9532  ** This routine does a complete check of the given BTree file.  aRoot[] is
  9533  ** an array of pages numbers were each page number is the root page of
  9534  ** a table.  nRoot is the number of entries in aRoot.
  9535  **
  9536  ** A read-only or read-write transaction must be opened before calling
  9537  ** this function.
  9538  **
  9539  ** Write the number of error seen in *pnErr.  Except for some memory
  9540  ** allocation errors,  an error message held in memory obtained from
  9541  ** malloc is returned if *pnErr is non-zero.  If *pnErr==0 then NULL is
  9542  ** returned.  If a memory allocation error occurs, NULL is returned.
  9543  */
  9544  char *sqlite3BtreeIntegrityCheck(
  9545    Btree *p,     /* The btree to be checked */
  9546    int *aRoot,   /* An array of root pages numbers for individual trees */
  9547    int nRoot,    /* Number of entries in aRoot[] */
  9548    int mxErr,    /* Stop reporting errors after this many */
  9549    int *pnErr    /* Write number of errors seen to this variable */
  9550  ){
  9551    Pgno i;
  9552    IntegrityCk sCheck;
  9553    BtShared *pBt = p->pBt;
  9554    int savedDbFlags = pBt->db->flags;
  9555    char zErr[100];
  9556    VVA_ONLY( int nRef );
  9557  
  9558    sqlite3BtreeEnter(p);
  9559    assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE );
  9560    VVA_ONLY( nRef = sqlite3PagerRefcount(pBt->pPager) );
  9561    assert( nRef>=0 );
  9562    sCheck.pBt = pBt;
  9563    sCheck.pPager = pBt->pPager;
  9564    sCheck.nPage = btreePagecount(sCheck.pBt);
  9565    sCheck.mxErr = mxErr;
  9566    sCheck.nErr = 0;
  9567    sCheck.mallocFailed = 0;
  9568    sCheck.zPfx = 0;
  9569    sCheck.v1 = 0;
  9570    sCheck.v2 = 0;
  9571    sCheck.aPgRef = 0;
  9572    sCheck.heap = 0;
  9573    sqlite3StrAccumInit(&sCheck.errMsg, 0, zErr, sizeof(zErr), SQLITE_MAX_LENGTH);
  9574    sCheck.errMsg.printfFlags = SQLITE_PRINTF_INTERNAL;
  9575    if( sCheck.nPage==0 ){
  9576      goto integrity_ck_cleanup;
  9577    }
  9578  
  9579    sCheck.aPgRef = sqlite3MallocZero((sCheck.nPage / 8)+ 1);
  9580    if( !sCheck.aPgRef ){
  9581      sCheck.mallocFailed = 1;
  9582      goto integrity_ck_cleanup;
  9583    }
  9584    sCheck.heap = (u32*)sqlite3PageMalloc( pBt->pageSize );
  9585    if( sCheck.heap==0 ){
  9586      sCheck.mallocFailed = 1;
  9587      goto integrity_ck_cleanup;
  9588    }
  9589  
  9590    i = PENDING_BYTE_PAGE(pBt);
  9591    if( i<=sCheck.nPage ) setPageReferenced(&sCheck, i);
  9592  
  9593    /* Check the integrity of the freelist
  9594    */
  9595    sCheck.zPfx = "Main freelist: ";
  9596    checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
  9597              get4byte(&pBt->pPage1->aData[36]));
  9598    sCheck.zPfx = 0;
  9599  
  9600    /* Check all the tables.
  9601    */
  9602    testcase( pBt->db->flags & SQLITE_CellSizeCk );
  9603    pBt->db->flags &= ~SQLITE_CellSizeCk;
  9604    for(i=0; (int)i<nRoot && sCheck.mxErr; i++){
  9605      i64 notUsed;
  9606      if( aRoot[i]==0 ) continue;
  9607  #ifndef SQLITE_OMIT_AUTOVACUUM
  9608      if( pBt->autoVacuum && aRoot[i]>1 ){
  9609        checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0);
  9610      }
  9611  #endif
  9612      checkTreePage(&sCheck, aRoot[i], &notUsed, LARGEST_INT64);
  9613    }
  9614    pBt->db->flags = savedDbFlags;
  9615  
  9616    /* Make sure every page in the file is referenced
  9617    */
  9618    for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
  9619  #ifdef SQLITE_OMIT_AUTOVACUUM
  9620      if( getPageReferenced(&sCheck, i)==0 ){
  9621        checkAppendMsg(&sCheck, "Page %d is never used", i);
  9622      }
  9623  #else
  9624      /* If the database supports auto-vacuum, make sure no tables contain
  9625      ** references to pointer-map pages.
  9626      */
  9627      if( getPageReferenced(&sCheck, i)==0 && 
  9628         (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
  9629        checkAppendMsg(&sCheck, "Page %d is never used", i);
  9630      }
  9631      if( getPageReferenced(&sCheck, i)!=0 && 
  9632         (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
  9633        checkAppendMsg(&sCheck, "Pointer map page %d is referenced", i);
  9634      }
  9635  #endif
  9636    }
  9637  
  9638    /* Clean  up and report errors.
  9639    */
  9640  integrity_ck_cleanup:
  9641    sqlite3PageFree(sCheck.heap);
  9642    sqlite3_free(sCheck.aPgRef);
  9643    if( sCheck.mallocFailed ){
  9644      sqlite3StrAccumReset(&sCheck.errMsg);
  9645      sCheck.nErr++;
  9646    }
  9647    *pnErr = sCheck.nErr;
  9648    if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);
  9649    /* Make sure this analysis did not leave any unref() pages. */
  9650    assert( nRef==sqlite3PagerRefcount(pBt->pPager) );
  9651    sqlite3BtreeLeave(p);
  9652    return sqlite3StrAccumFinish(&sCheck.errMsg);
  9653  }
  9654  #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
  9655  
  9656  /*
  9657  ** Return the full pathname of the database file underlying Btree p, or
  9658  ** an empty string if the database is in-memory or a TEMP database.
  9659  ** The BtShared mutex is not needed for this: the pager filename does
  9660  ** not change for as long as the pager is open.
  9661  */
  9662  const char *sqlite3BtreeGetFilename(Btree *p){
  9663    Pager *pPager = p->pBt->pPager;
  9664    assert( pPager!=0 );
  9665    return sqlite3PagerFilename(pPager, 1);
  9666  }
  9667  
  9668  /*
  9669  ** Return the pathname of the journal file belonging to this database.
  9670  ** The answer is the same whether or not the journal file exists yet.
  9671  **
  9672  ** The journal filename held by the pager is fixed while the pager is
  9673  ** open, so it can be read without holding the BtShared mutex.
  9674  */
  9675  const char *sqlite3BtreeGetJournalname(Btree *p){
  9676    Pager *pPager = p->pBt->pPager;
  9677    assert( pPager!=0 );
  9678    return sqlite3PagerJournalname(pPager);
  9679  }
  9680  
  9681  /*
  9682  ** Return 1 if p is not NULL and has a write transaction open, else 0.
  9683  */
  9684  int sqlite3BtreeIsInTrans(Btree *p){
  9685    assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
  9686    return p==0 ? 0 : p->inTrans==TRANS_WRITE;
  9687  }
  9688  
  9689  #ifndef SQLITE_OMIT_WAL
  9690  /*
  9691  ** Run a checkpoint on the Btree passed as the first argument.  A NULL
  9692  ** p is a no-op that returns SQLITE_OK.
  9693  **
  9694  ** SQLITE_LOCKED is returned, and nothing is checkpointed, if this or
  9695  ** any other connection has a transaction open on the shared-cache p is
  9696  ** connected to.  eMode is SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART.
  9697  */
  9698  int sqlite3BtreeCheckpoint(Btree *p, int eMode, int *pnLog, int *pnCkpt){
  9699    BtShared *pShared;
  9700    int rc;
  9701    if( p==0 ) return SQLITE_OK;
  9702    pShared = p->pBt;
  9703    sqlite3BtreeEnter(p);
  9704    if( pShared->inTransaction==TRANS_NONE ){
  9705      rc = sqlite3PagerCheckpoint(pShared->pPager, p->db, eMode, pnLog, pnCkpt);
  9706    }else{
  9707      rc = SQLITE_LOCKED;
  9708    }
  9709    sqlite3BtreeLeave(p);
  9710    return rc;
  9711  }
  9712  #endif
  9713  
  9714  /*
  9715  ** Return 1 if Btree p has any transaction open, read or write; else 0.
  9716  */
  9717  int sqlite3BtreeIsInReadTrans(Btree *p){
  9718    assert( p );
  9719    assert( sqlite3_mutex_held(p->db->mutex) );
  9720    return (p->inTrans==TRANS_NONE) ? 0 : 1;
  9721  }
  9722  
  9723  int sqlite3BtreeIsInBackup(Btree *p){   /* 1 if p->nBackup!=0, else 0 */
  9724    assert( p );
  9725    assert( sqlite3_mutex_held(p->db->mutex) );
  9726    return p->nBackup ? 1 : 0;
  9727  }
  9728  
  9729  /*
  9730  ** Return the blob of memory associated with the shared-btree behind
  9731  ** handle p.  Client code uses the memory for its own purposes (for
  9732  ** example, to store a high-level schema associated with the
  9733  ** shared-btree); the btree layer manages reference counting issues.
  9734  **
  9735  ** Parameters:
  9736  **   p       Handle on the shared-btree.
  9737  **   nBytes  Size of the blob to allocate if none exists yet.
  9738  **   xFree   Destructor stored alongside a newly allocated blob.
  9739  **
  9740  ** The first time this is called on a shared-btree with non-zero nBytes,
  9741  ** nBytes bytes are allocated, zeroed, and returned.  Every later call
  9742  ** ignores nBytes and returns the same pointer.  If nBytes is 0 and no
  9743  ** blob has been allocated yet, a null pointer is returned.
  9744  **
  9745  ** Just before the shared-btree is closed, the xFree function recorded
  9746  ** at allocation time is invoked on the blob.  xFree should not call
  9747  ** sqlite3_free() on the memory; the btree layer does that.
  9748  */
  9749  void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
  9750    BtShared *pShared = p->pBt;
  9751    sqlite3BtreeEnter(p);
  9752    if( nBytes!=0 && pShared->pSchema==0 ){
  9753      pShared->xFreeSchema = xFree;
  9754      pShared->pSchema = sqlite3DbMallocZero(0, nBytes);
  9755    }
  9756    sqlite3BtreeLeave(p);
  9757    return pShared->pSchema;
  9758  }
  9759  
  9760  /*
  9761  ** Query for a READ_LOCK on the sqlite_master table (root MASTER_ROOT).
  9762  ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared
  9763  ** btree holds an exclusive lock on that table.  Otherwise SQLITE_OK.
  9764  */
  9765  int sqlite3BtreeSchemaLocked(Btree *p){
  9766    int lockRc;
  9767    assert( sqlite3_mutex_held(p->db->mutex) );
  9768    sqlite3BtreeEnter(p);
  9769    lockRc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
  9770    assert( lockRc==SQLITE_OK || lockRc==SQLITE_LOCKED_SHAREDCACHE );
  9771    sqlite3BtreeLeave(p);
  9772    return lockRc;
  9773  }
  9774  
  9775  
  9776  #ifndef SQLITE_OMIT_SHARED_CACHE
  9777  /*
  9778  ** Obtain a lock on the table whose root page is iTab: a write lock if
  9779  ** isWriteLock is 1, a read lock if it is 0.  When p is not sharable no
  9780  ** lock is taken and SQLITE_OK is returned; otherwise the result of
  9781  ** querying for and, if that succeeds, setting the lock is returned.
  9782  */
  9783  int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
  9784    int rc;
  9785    u8 eLock;
  9786    assert( p->inTrans!=TRANS_NONE );
  9787    if( !p->sharable ) return SQLITE_OK;
  9788  
  9789    assert( READ_LOCK+1==WRITE_LOCK );
  9790    assert( isWriteLock==0 || isWriteLock==1 );
  9791    eLock = READ_LOCK + isWriteLock;
  9792  
  9793    sqlite3BtreeEnter(p);
  9794    rc = querySharedCacheTableLock(p, iTab, eLock);
  9795    if( rc==SQLITE_OK ) rc = setSharedCacheTableLock(p, iTab, eLock);
  9796    sqlite3BtreeLeave(p);
  9797    return rc;
  9798  }
  9799  #endif
  9800  
  9801  #ifndef SQLITE_OMIT_INCRBLOB
  9802  /*
  9803  ** pCsr must be a cursor opened for writing on an INTKEY table that
  9804  ** currently points at a valid table entry.  The data stored in that
  9805  ** entry is modified by handing offset, amt and z to accessPayload().
  9806  **
  9807  ** Only the data content may be modified; it is not possible to
  9808  ** change the length of the data stored. If this function is called with
  9809  ** parameters that attempt to write past the end of the existing data,
  9810  ** no modifications are made and SQLITE_CORRUPT is returned.
  9811  */
  9812  int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
  9813    int rc;
  9814    assert( cursorOwnsBtShared(pCsr) );
  9815    assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
  9816    assert( pCsr->curFlags & BTCF_Incrblob );
  9817  
  9818    rc = restoreCursorPosition(pCsr);
  9819    if( rc!=SQLITE_OK ){
  9820      return rc;                     /* restoreCursorPosition() failed */
  9821    }
  9822    assert( pCsr->eState!=CURSOR_REQUIRESEEK );
  9823    if( pCsr->eState!=CURSOR_VALID ){
  9824      return SQLITE_ABORT;           /* Cursor state is not CURSOR_VALID */
  9825    }
  9826  
  9827    /* Save the positions of all other cursors open on this table. This is
  9828    ** required in case any of them are holding references to an xFetch
  9829    ** version of the b-tree page modified by the accessPayload call below.
  9830    **
  9831    ** Note that pCsr must be open on an INTKEY table.  saveCursorPosition(),
  9832    ** and hence saveAllCursors(), cannot fail on a BTREE_INTKEY table, so
  9833    ** saveAllCursors() can only return SQLITE_OK here.
  9834    */
  9835    VVA_ONLY(rc =) saveAllCursors(pCsr->pBt, pCsr->pgnoRoot, pCsr);
  9836    assert( rc==SQLITE_OK );
  9837  
  9838    /* Check some assumptions; (a) at run time, the rest via assert():
  9839    **   (a) the cursor is open for writing,
  9840    **   (b) there is a read/write transaction open,
  9841    **   (c) the connection holds a write-lock on the table (if required),
  9842    **   (d) there are no conflicting read-locks, and
  9843    **   (e) the cursor points at a valid row of an intKey table.
  9844    */
  9845    if( (pCsr->curFlags & BTCF_WriteFlag)==0 ){
  9846      return SQLITE_READONLY;        /* BTCF_WriteFlag is not set on pCsr */
  9847    }
  9848    assert( (pCsr->pBt->btsFlags & BTS_READ_ONLY)==0
  9849                && pCsr->pBt->inTransaction==TRANS_WRITE );
  9850    assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) );
  9851    assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) );
  9852    assert( pCsr->pPage->intKey );
  9853  
  9854    return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1);
  9855  }
  9856  
  9857  /*
  9858  ** Mark pCur as an incremental blob cursor; also set hasIncrblobCur on its Btree.
  9859  */
  9860  void sqlite3BtreeIncrblobCursor(BtCursor *pCur){
  9861    pCur->pBtree->hasIncrblobCur = 1;
  9862    pCur->curFlags |= BTCF_Incrblob;
  9863  }
  9864  #endif
  9865  
  9866  /*
  9867  ** Store iVersion (which must be 1 or 2) in both the "read version" byte
  9868  ** (offset 18) and the "write version" byte (offset 19) of the database
  9869  ** header.  Page 1 is written only if one of the two bytes differs.
  9870  */
  9871  int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){
  9872    BtShared *pShared = pBtree->pBt;
  9873    const u8 ver = (u8)iVersion;    /* Value for header bytes 18 and 19 */
  9874    int rc;                         /* Return code */
  9875  
  9876    assert( iVersion==1 || iVersion==2 );
  9877  
  9878    /* When the version fields are being set to 1, BTS_NO_WAL keeps the WAL
  9879    ** connection from being opened automatically, even if the fields
  9880    ** currently hold 2.  The flag is cleared again before returning.
  9881    */
  9882    if( iVersion==1 ){
  9883      pShared->btsFlags |= BTS_NO_WAL;
  9884    }else{
  9885      pShared->btsFlags &= ~BTS_NO_WAL;
  9886    }
  9887  
  9888    rc = sqlite3BtreeBeginTrans(pBtree, 0);
  9889    if( rc==SQLITE_OK ){
  9890      u8 *aHdr = pShared->pPage1->aData;
  9891      if( aHdr[18]!=ver || aHdr[19]!=ver ){
  9892        rc = sqlite3BtreeBeginTrans(pBtree, 2);
  9893        if( rc==SQLITE_OK ) rc = sqlite3PagerWrite(pShared->pPage1->pDbPage);
  9894        if( rc==SQLITE_OK ) aHdr[18] = aHdr[19] = ver;
  9895      }
  9896    }
  9897  
  9898    pShared->btsFlags &= ~BTS_NO_WAL;
  9899    return rc;
  9900  }
  9901  
  9902  /*
  9903  ** Return 1 if any hint bit selected by mask is set on cursor pCsr, or 0
  9904  ** if none is.  This routine is only used from within assert() statements.
  9905  */
  9906  int sqlite3BtreeCursorHasHint(BtCursor *pCsr, unsigned int mask){
  9907    return (pCsr->hints & mask) ? 1 : 0;
  9908  }
  9909  
  9910  /*
  9911  ** Return 1 if Btree p is read-only (BTS_READ_ONLY set on its BtShared), else 0.
  9912  */
  9913  int sqlite3BtreeIsReadonly(Btree *p){
  9914    return (p->pBt->btsFlags & BTS_READ_ONLY) ? 1 : 0;
  9915  }
  9916  
  9917  /* Return the size of the header added to each page by this module. */
  9918  int sqlite3HeaderSizeBtree(void){
  9919    return ROUND8(sizeof(MemPage));
  9920  }
  9921  
  9922  #if !defined(SQLITE_OMIT_SHARED_CACHE)
  9923  /* Return the sharable flag of Btree p: non-zero if p is sharable,
  9924  ** zero if it is not. */
  9925  int sqlite3BtreeSharable(Btree *p){
  9926    int isSharable = p->sharable;
  9927    return isSharable;
  9928  }
  9929  
  9930  /*
  9931  ** Return how many connections use the BtShared object behind handle p.
  9932  ** Private caches always give 1; shared caches may give 1 or more.
  9933  */
  9934  int sqlite3BtreeConnectionCount(Btree *p){
  9935    BtShared *pShared = p->pBt;
  9936    testcase( p->sharable );
  9937    return pShared->nRef;
  9938  }
  9939  #endif