modernc.org/cc@v1.0.1/v2/testdata/_sqlite/ext/lsm1/lsm_tree.c (about) 1 /* 2 ** 2011-08-18 3 ** 4 ** The author disclaims copyright to this source code. In place of 5 ** a legal notice, here is a blessing: 6 ** 7 ** May you do good and not evil. 8 ** May you find forgiveness for yourself and forgive others. 9 ** May you share freely, never taking more than you give. 10 ** 11 ************************************************************************* 12 ** 13 ** This file contains the implementation of an in-memory tree structure. 14 ** 15 ** Technically the tree is a B-tree of order 4 (in the Knuth sense - each 16 ** node may have up to 4 children). Keys are stored within B-tree nodes by 17 ** reference. This may be slightly slower than a conventional red-black 18 ** tree, but it is simpler. It is also an easier structure to modify to 19 ** create a version that supports nested transaction rollback. 20 ** 21 ** This tree does not currently support a delete operation. One is not 22 ** required. When LSM deletes a key from a database, it inserts a DELETE 23 ** marker into the data structure. As a result, although the value associated 24 ** with a key stored in the in-memory tree structure may be modified, no 25 ** keys are ever removed. 26 */ 27 28 /* 29 ** MVCC NOTES 30 ** 31 ** The in-memory tree structure supports SQLite-style MVCC. This means 32 ** that while one client is writing to the tree structure, other clients 33 ** may still be querying an older snapshot of the tree. 34 ** 35 ** One way to implement this is to use an append-only b-tree. In this 36 ** case instead of modifying nodes in-place, a copy of the node is made 37 ** and the required modifications made to the copy. The parent of the 38 ** node is then modified (to update the pointer so that it points to 39 ** the new copy), which causes a copy of the parent to be made, and so on. 40 ** This means that each time the tree is written to a new root node is 41 ** created. A snapshot is identified by the root node that it uses. 42 ** 43 ** The problem with the above is that each time the tree is written to, 44 ** a copy of the node structure modified and all of its ancestor nodes 45 ** is made. This may prove excessive with large tree structures. 46 ** 47 ** To reduce this overhead, the data structure used for a tree node is 48 ** designed so that it may be edited in place exactly once without 49 ** affecting existing users. In other words, the node structure is capable 50 ** of storing two separate versions of the node at the same time. 51 ** When a node is to be edited, if the node structure already contains 52 ** two versions, a copy is made as in the append-only approach. Or, if 53 ** it only contains a single version, it is edited in place. 54 ** 55 ** This reduces the overhead so that, roughly, one new node structure 56 ** must be allocated for each write (on top of those allocations that 57 ** would have been required by a non-MVCC tree). Logic: Assume that at 58 ** any time, 50% of nodes in the tree already contain 2 versions. When 59 ** a new entry is written to a node, there is a 50% chance that a copy 60 ** of the node will be required. And a 25% chance that a copy of its 61 ** parent is required. And so on. 62 ** 63 ** ROLLBACK 64 ** 65 ** The in-memory tree also supports transaction and sub-transaction 66 ** rollback. In order to rollback to point in time X, the following is 67 ** necessary: 68 ** 69 ** 1. All memory allocated since X must be freed, and 70 ** 2. All "v2" data adding to nodes that existed at X should be zeroed. 71 ** 3. The root node must be restored to its X value. 72 ** 73 ** The Mempool object used to allocate memory for the tree supports 74 ** operation (1) - see the lsmPoolMark() and lsmPoolRevert() functions. 75 ** 76 ** To support (2), all nodes that have v2 data are part of a singly linked 77 ** list, sorted by the age of the v2 data (nodes that have had data added 78 ** most recently are at the end of the list). So to zero all v2 data added 79 ** since X, the linked list is traversed from the first node added following 80 ** X onwards. 81 ** 82 */ 83 84 #ifndef _LSM_INT_H 85 # include "lsmInt.h" 86 #endif 87 88 #include <string.h> 89 90 #define MAX_DEPTH 32 91 92 typedef struct TreeKey TreeKey; 93 typedef struct TreeNode TreeNode; 94 typedef struct TreeLeaf TreeLeaf; 95 typedef struct NodeVersion NodeVersion; 96 97 struct TreeOld { 98 u32 iShmid; /* Last shared-memory chunk in use by old */ 99 u32 iRoot; /* Offset of root node in shm file */ 100 u32 nHeight; /* Height of tree structure */ 101 }; 102 103 #if 0 104 /* 105 ** assert() that a TreeKey.flags value is sane. Usage: 106 ** 107 ** assert( lsmAssertFlagsOk(pTreeKey->flags) ); 108 */ 109 static int lsmAssertFlagsOk(u8 keyflags){ 110 /* At least one flag must be set. Otherwise, what is this key doing? */ 111 assert( keyflags!=0 ); 112 113 /* The POINT_DELETE and INSERT flags cannot both be set. */ 114 assert( (keyflags & LSM_POINT_DELETE)==0 || (keyflags & LSM_INSERT)==0 ); 115 116 /* If both the START_DELETE and END_DELETE flags are set, then the INSERT 117 ** flag must also be set. In other words - the three DELETE flags cannot 118 ** all be set */ 119 assert( (keyflags & LSM_END_DELETE)==0 120 || (keyflags & LSM_START_DELETE)==0 121 || (keyflags & LSM_POINT_DELETE)==0 122 ); 123 124 return 1; 125 } 126 #endif 127 static int assert_delete_ranges_match(lsm_db *); 128 static int treeCountEntries(lsm_db *db); 129 130 /* 131 ** Container for a key-value pair. Within the *-shm file, each key/value 132 ** pair is stored in a single allocation (which may not actually be 133 ** contiguous in memory). Layout is the TreeKey structure, followed by 134 ** the nKey bytes of key blob, followed by the nValue bytes of value blob 135 ** (if nValue is non-negative). 136 */ 137 struct TreeKey { 138 int nKey; /* Size of pKey in bytes */ 139 int nValue; /* Size of pValue. Or negative. */ 140 u8 flags; /* Various LSM_XXX flags */ 141 }; 142 143 #define TKV_KEY(p) ((void *)&(p)[1]) 144 #define TKV_VAL(p) ((void *)(((u8 *)&(p)[1]) + (p)->nKey)) 145 146 /* 147 ** A single tree node. A node structure may contain up to 3 key/value 148 ** pairs. Internal (non-leaf) nodes have up to 4 children. 149 ** 150 ** TODO: Update the format of this to be more compact. Get it working 151 ** first though... 152 */ 153 struct TreeNode { 154 u32 aiKeyPtr[3]; /* Array of pointers to TreeKey objects */ 155 156 /* The following fields are present for interior nodes only, not leaves. */ 157 u32 aiChildPtr[4]; /* Array of pointers to child nodes */ 158 159 /* The extra child pointer slot. */ 160 u32 iV2; /* Transaction number of v2 */ 161 u8 iV2Child; /* apChild[] entry replaced by pV2Ptr */ 162 u32 iV2Ptr; /* Substitute pointer */ 163 }; 164 165 struct TreeLeaf { 166 u32 aiKeyPtr[3]; /* Array of pointers to TreeKey objects */ 167 }; 168 169 typedef struct TreeBlob TreeBlob; 170 struct TreeBlob { 171 int n; 172 u8 *a; 173 }; 174 175 /* 176 ** Cursor for searching a tree structure. 177 ** 178 ** If a cursor does not point to any element (a.k.a. EOF), then the 179 ** TreeCursor.iNode variable is set to a negative value. Otherwise, the 180 ** cursor currently points to key aiCell[iNode] on node apTreeNode[iNode]. 181 ** 182 ** Entries in the apTreeNode[] and aiCell[] arrays contain the node and 183 ** index of the TreeNode.apChild[] pointer followed to descend to the 184 ** current element. Hence apTreeNode[0] always contains the root node of 185 ** the tree. 186 */ 187 struct TreeCursor { 188 lsm_db *pDb; /* Database handle for this cursor */ 189 TreeRoot *pRoot; /* Root node and height of tree to access */ 190 int iNode; /* Cursor points at apTreeNode[iNode] */ 191 TreeNode *apTreeNode[MAX_DEPTH];/* Current position in tree */ 192 u8 aiCell[MAX_DEPTH]; /* Current position in tree */ 193 TreeKey *pSave; /* Saved key */ 194 TreeBlob blob; /* Dynamic storage for a key */ 195 }; 196 197 /* 198 ** A value guaranteed to be larger than the largest possible transaction 199 ** id (TreeHeader.iTransId). 200 */ 201 #define WORKING_VERSION (1<<30) 202 203 static int tblobGrow(lsm_db *pDb, TreeBlob *p, int n, int *pRc){ 204 if( n>p->n ){ 205 lsmFree(pDb->pEnv, p->a); 206 p->a = lsmMallocRc(pDb->pEnv, n, pRc); 207 p->n = n; 208 } 209 return (p->a==0); 210 } 211 static void tblobFree(lsm_db *pDb, TreeBlob *p){ 212 lsmFree(pDb->pEnv, p->a); 213 } 214 215 216 /*********************************************************************** 217 ** Start of IntArray methods. */ 218 /* 219 ** Append value iVal to the contents of IntArray *p. Return LSM_OK if 220 ** successful, or LSM_NOMEM if an OOM condition is encountered. 221 */ 222 static int intArrayAppend(lsm_env *pEnv, IntArray *p, u32 iVal){ 223 assert( p->nArray<=p->nAlloc ); 224 if( p->nArray>=p->nAlloc ){ 225 u32 *aNew; 226 int nNew = p->nArray ? p->nArray*2 : 128; 227 aNew = lsmRealloc(pEnv, p->aArray, nNew*sizeof(u32)); 228 if( !aNew ) return LSM_NOMEM_BKPT; 229 p->aArray = aNew; 230 p->nAlloc = nNew; 231 } 232 233 p->aArray[p->nArray++] = iVal; 234 return LSM_OK; 235 } 236 237 /* 238 ** Zero the IntArray object. 239 */ 240 static void intArrayFree(lsm_env *pEnv, IntArray *p){ 241 p->nArray = 0; 242 } 243 244 /* 245 ** Return the number of entries currently in the int-array object. 246 */ 247 static int intArraySize(IntArray *p){ 248 return p->nArray; 249 } 250 251 /* 252 ** Return a copy of the iIdx'th entry in the int-array. 253 */ 254 static u32 intArrayEntry(IntArray *p, int iIdx){ 255 return p->aArray[iIdx]; 256 } 257 258 /* 259 ** Truncate the int-array so that all but the first nVal values are 260 ** discarded. 261 */ 262 static void intArrayTruncate(IntArray *p, int nVal){ 263 p->nArray = nVal; 264 } 265 /* End of IntArray methods. 266 ***********************************************************************/ 267 268 static int treeKeycmp(void *p1, int n1, void *p2, int n2){ 269 int res; 270 res = memcmp(p1, p2, LSM_MIN(n1, n2)); 271 if( res==0 ) res = (n1-n2); 272 return res; 273 } 274 275 /* 276 ** The pointer passed as the first argument points to an interior node, 277 ** not a leaf. This function returns the offset of the iCell'th child 278 ** sub-tree of the node. 279 */ 280 static u32 getChildPtr(TreeNode *p, int iVersion, int iCell){ 281 assert( iVersion>=0 ); 282 assert( iCell>=0 && iCell<=array_size(p->aiChildPtr) ); 283 if( p->iV2 && p->iV2<=(u32)iVersion && iCell==p->iV2Child ) return p->iV2Ptr; 284 return p->aiChildPtr[iCell]; 285 } 286 287 /* 288 ** Given an offset within the *-shm file, return the associated chunk number. 289 */ 290 static int treeOffsetToChunk(u32 iOff){ 291 assert( LSM_SHM_CHUNK_SIZE==(1<<15) ); 292 return (int)(iOff>>15); 293 } 294 295 #define treeShmptrUnsafe(pDb, iPtr) \ 296 (&((u8*)((pDb)->apShm[(iPtr)>>15]))[(iPtr) & (LSM_SHM_CHUNK_SIZE-1)]) 297 298 /* 299 ** Return a pointer to the mapped memory location associated with *-shm 300 ** file offset iPtr. 301 */ 302 static void *treeShmptr(lsm_db *pDb, u32 iPtr){ 303 304 assert( (iPtr>>15)<(u32)pDb->nShm ); 305 assert( pDb->apShm[iPtr>>15] ); 306 307 return iPtr ? treeShmptrUnsafe(pDb, iPtr) : 0; 308 } 309 310 static ShmChunk * treeShmChunk(lsm_db *pDb, int iChunk){ 311 return (ShmChunk *)(pDb->apShm[iChunk]); 312 } 313 314 static ShmChunk * treeShmChunkRc(lsm_db *pDb, int iChunk, int *pRc){ 315 assert( *pRc==LSM_OK ); 316 if( iChunk<pDb->nShm || LSM_OK==(*pRc = lsmShmCacheChunks(pDb, iChunk+1)) ){ 317 return (ShmChunk *)(pDb->apShm[iChunk]); 318 } 319 return 0; 320 } 321 322 323 #ifndef NDEBUG 324 static void assertIsWorkingChild( 325 lsm_db *db, 326 TreeNode *pNode, 327 TreeNode *pParent, 328 int iCell 329 ){ 330 TreeNode *p; 331 u32 iPtr = getChildPtr(pParent, WORKING_VERSION, iCell); 332 p = treeShmptr(db, iPtr); 333 assert( p==pNode ); 334 } 335 #else 336 # define assertIsWorkingChild(w,x,y,z) 337 #endif 338 339 /* Values for the third argument to treeShmkey(). */ 340 #define TKV_LOADKEY 1 341 #define TKV_LOADVAL 2 342 343 static TreeKey *treeShmkey( 344 lsm_db *pDb, /* Database handle */ 345 u32 iPtr, /* Shmptr to TreeKey struct */ 346 int eLoad, /* Either zero or a TREEKEY_LOADXXX value */ 347 TreeBlob *pBlob, /* Used if dynamic memory is required */ 348 int *pRc /* IN/OUT: Error code */ 349 ){ 350 TreeKey *pRet; 351 352 assert( eLoad==TKV_LOADKEY || eLoad==TKV_LOADVAL ); 353 pRet = (TreeKey *)treeShmptr(pDb, iPtr); 354 if( pRet ){ 355 int nReq; /* Bytes of space required at pRet */ 356 int nAvail; /* Bytes of space available at pRet */ 357 358 nReq = sizeof(TreeKey) + pRet->nKey; 359 if( eLoad==TKV_LOADVAL && pRet->nValue>0 ){ 360 nReq += pRet->nValue; 361 } 362 assert( LSM_SHM_CHUNK_SIZE==(1<<15) ); 363 nAvail = LSM_SHM_CHUNK_SIZE - (iPtr & (LSM_SHM_CHUNK_SIZE-1)); 364 365 if( nAvail<nReq ){ 366 if( tblobGrow(pDb, pBlob, nReq, pRc)==0 ){ 367 int nLoad = 0; 368 while( *pRc==LSM_OK ){ 369 ShmChunk *pChunk; 370 void *p = treeShmptr(pDb, iPtr); 371 int n = LSM_MIN(nAvail, nReq-nLoad); 372 373 memcpy(&pBlob->a[nLoad], p, n); 374 nLoad += n; 375 if( nLoad==nReq ) break; 376 377 pChunk = treeShmChunk(pDb, treeOffsetToChunk(iPtr)); 378 assert( pChunk ); 379 iPtr = (pChunk->iNext * LSM_SHM_CHUNK_SIZE) + LSM_SHM_CHUNK_HDR; 380 nAvail = LSM_SHM_CHUNK_SIZE - LSM_SHM_CHUNK_HDR; 381 } 382 } 383 pRet = (TreeKey *)(pBlob->a); 384 } 385 } 386 387 return pRet; 388 } 389 390 #if defined(LSM_DEBUG) && defined(LSM_EXPENSIVE_ASSERT) 391 void assert_leaf_looks_ok(TreeNode *pNode){ 392 assert( pNode->apKey[1] ); 393 } 394 395 void assert_node_looks_ok(TreeNode *pNode, int nHeight){ 396 if( pNode ){ 397 assert( pNode->apKey[1] ); 398 if( nHeight>1 ){ 399 int i; 400 assert( getChildPtr(pNode, WORKING_VERSION, 1) ); 401 assert( getChildPtr(pNode, WORKING_VERSION, 2) ); 402 for(i=0; i<4; i++){ 403 assert_node_looks_ok(getChildPtr(pNode, WORKING_VERSION, i), nHeight-1); 404 } 405 } 406 } 407 } 408 409 /* 410 ** Run various assert() statements to check that the working-version of the 411 ** tree is correct in the following respects: 412 ** 413 ** * todo... 414 */ 415 void assert_tree_looks_ok(int rc, Tree *pTree){ 416 } 417 #else 418 # define assert_tree_looks_ok(x,y) 419 #endif 420 421 void lsmFlagsToString(int flags, char *zFlags){ 422 423 zFlags[0] = (flags & LSM_END_DELETE) ? ']' : '.'; 424 425 /* Only one of LSM_POINT_DELETE, LSM_INSERT and LSM_SEPARATOR should ever 426 ** be set. If this is not true, write a '?' to the output. */ 427 switch( flags & (LSM_POINT_DELETE|LSM_INSERT|LSM_SEPARATOR) ){ 428 case 0: zFlags[1] = '.'; break; 429 case LSM_POINT_DELETE: zFlags[1] = '-'; break; 430 case LSM_INSERT: zFlags[1] = '+'; break; 431 case LSM_SEPARATOR: zFlags[1] = '^'; break; 432 default: zFlags[1] = '?'; break; 433 } 434 435 zFlags[2] = (flags & LSM_SYSTEMKEY) ? '*' : '.'; 436 zFlags[3] = (flags & LSM_START_DELETE) ? '[' : '.'; 437 zFlags[4] = '\0'; 438 } 439 440 #ifdef LSM_DEBUG 441 442 /* 443 ** Pointer pBlob points to a buffer containing a blob of binary data 444 ** nBlob bytes long. Append the contents of this blob to *pStr, with 445 ** each octet represented by a 2-digit hexadecimal number. For example, 446 ** if the input blob is three bytes in size and contains {0x01, 0x44, 0xFF}, 447 ** then "0144ff" is appended to *pStr. 448 */ 449 static void lsmAppendStrBlob(LsmString *pStr, void *pBlob, int nBlob){ 450 int i; 451 lsmStringExtend(pStr, nBlob*2); 452 if( pStr->nAlloc==0 ) return; 453 for(i=0; i<nBlob; i++){ 454 u8 c = ((u8*)pBlob)[i]; 455 if( c>='a' && c<='z' ){ 456 pStr->z[pStr->n++] = c; 457 }else if( c!=0 || nBlob==1 || i!=(nBlob-1) ){ 458 pStr->z[pStr->n++] = "0123456789abcdef"[(c>>4)&0xf]; 459 pStr->z[pStr->n++] = "0123456789abcdef"[c&0xf]; 460 } 461 } 462 pStr->z[pStr->n] = 0; 463 } 464 465 #if 0 /* NOT USED */ 466 /* 467 ** Append nIndent space (0x20) characters to string *pStr. 468 */ 469 static void lsmAppendIndent(LsmString *pStr, int nIndent){ 470 int i; 471 lsmStringExtend(pStr, nIndent); 472 for(i=0; i<nIndent; i++) lsmStringAppend(pStr, " ", 1); 473 } 474 #endif 475 476 static void strAppendFlags(LsmString *pStr, u8 flags){ 477 char zFlags[8]; 478 479 lsmFlagsToString(flags, zFlags); 480 zFlags[4] = ':'; 481 482 lsmStringAppend(pStr, zFlags, 5); 483 } 484 485 void dump_node_contents( 486 lsm_db *pDb, 487 u32 iNode, /* Print out the contents of this node */ 488 char *zPath, /* Path from root to this node */ 489 int nPath, /* Number of bytes in zPath */ 490 int nHeight /* Height: (0==leaf) (1==parent-of-leaf) */ 491 ){ 492 const char *zSpace = " "; 493 int i; 494 int rc = LSM_OK; 495 LsmString s; 496 TreeNode *pNode; 497 TreeBlob b = {0, 0}; 498 499 pNode = (TreeNode *)treeShmptr(pDb, iNode); 500 501 if( nHeight==0 ){ 502 /* Append the nIndent bytes of space to string s. */ 503 lsmStringInit(&s, pDb->pEnv); 504 505 /* Append each key to string s. */ 506 for(i=0; i<3; i++){ 507 u32 iPtr = pNode->aiKeyPtr[i]; 508 if( iPtr ){ 509 TreeKey *pKey = treeShmkey(pDb, pNode->aiKeyPtr[i],TKV_LOADKEY, &b,&rc); 510 strAppendFlags(&s, pKey->flags); 511 lsmAppendStrBlob(&s, TKV_KEY(pKey), pKey->nKey); 512 lsmStringAppend(&s, " ", -1); 513 } 514 } 515 516 printf("% 6d %.*sleaf%.*s: %s\n", 517 iNode, nPath, zPath, 20-nPath-4, zSpace, s.z 518 ); 519 lsmStringClear(&s); 520 }else{ 521 for(i=0; i<4 && nHeight>0; i++){ 522 u32 iPtr = getChildPtr(pNode, pDb->treehdr.root.iTransId, i); 523 zPath[nPath] = (char)(i+'0'); 524 zPath[nPath+1] = '/'; 525 526 if( iPtr ){ 527 dump_node_contents(pDb, iPtr, zPath, nPath+2, nHeight-1); 528 } 529 if( i!=3 && pNode->aiKeyPtr[i] ){ 530 TreeKey *pKey = treeShmkey(pDb, pNode->aiKeyPtr[i], TKV_LOADKEY,&b,&rc); 531 lsmStringInit(&s, pDb->pEnv); 532 strAppendFlags(&s, pKey->flags); 533 lsmAppendStrBlob(&s, TKV_KEY(pKey), pKey->nKey); 534 printf("% 6d %.*s%.*s: %s\n", 535 iNode, nPath+1, zPath, 20-nPath-1, zSpace, s.z); 536 lsmStringClear(&s); 537 } 538 } 539 } 540 541 tblobFree(pDb, &b); 542 } 543 544 void dump_tree_contents(lsm_db *pDb, const char *zCaption){ 545 char zPath[64]; 546 TreeRoot *p = &pDb->treehdr.root; 547 printf("\n%s\n", zCaption); 548 zPath[0] = '/'; 549 if( p->iRoot ){ 550 dump_node_contents(pDb, p->iRoot, zPath, 1, p->nHeight-1); 551 } 552 fflush(stdout); 553 } 554 555 #endif 556 557 /* 558 ** Initialize a cursor object, the space for which has already been 559 ** allocated. 560 */ 561 static void treeCursorInit(lsm_db *pDb, int bOld, TreeCursor *pCsr){ 562 memset(pCsr, 0, sizeof(TreeCursor)); 563 pCsr->pDb = pDb; 564 if( bOld ){ 565 pCsr->pRoot = &pDb->treehdr.oldroot; 566 }else{ 567 pCsr->pRoot = &pDb->treehdr.root; 568 } 569 pCsr->iNode = -1; 570 } 571 572 /* 573 ** Return a pointer to the mapping of the TreeKey object that the cursor 574 ** is pointing to. 575 */ 576 static TreeKey *csrGetKey(TreeCursor *pCsr, TreeBlob *pBlob, int *pRc){ 577 TreeKey *pRet; 578 lsm_db *pDb = pCsr->pDb; 579 u32 iPtr = pCsr->apTreeNode[pCsr->iNode]->aiKeyPtr[pCsr->aiCell[pCsr->iNode]]; 580 581 assert( iPtr ); 582 pRet = (TreeKey*)treeShmptrUnsafe(pDb, iPtr); 583 if( !(pRet->flags & LSM_CONTIGUOUS) ){ 584 pRet = treeShmkey(pDb, iPtr, TKV_LOADVAL, pBlob, pRc); 585 } 586 587 return pRet; 588 } 589 590 /* 591 ** Save the current position of tree cursor pCsr. 592 */ 593 int lsmTreeCursorSave(TreeCursor *pCsr){ 594 int rc = LSM_OK; 595 if( pCsr && pCsr->pSave==0 ){ 596 int iNode = pCsr->iNode; 597 if( iNode>=0 ){ 598 pCsr->pSave = csrGetKey(pCsr, &pCsr->blob, &rc); 599 } 600 pCsr->iNode = -1; 601 } 602 return rc; 603 } 604 605 /* 606 ** Restore the position of a saved tree cursor. 607 */ 608 static int treeCursorRestore(TreeCursor *pCsr, int *pRes){ 609 int rc = LSM_OK; 610 if( pCsr->pSave ){ 611 TreeKey *pKey = pCsr->pSave; 612 pCsr->pSave = 0; 613 if( pRes ){ 614 rc = lsmTreeCursorSeek(pCsr, TKV_KEY(pKey), pKey->nKey, pRes); 615 } 616 } 617 return rc; 618 } 619 620 /* 621 ** Allocate nByte bytes of space within the *-shm file. If successful, 622 ** return LSM_OK and set *piPtr to the offset within the file at which 623 ** the allocated space is located. 624 */ 625 static u32 treeShmalloc(lsm_db *pDb, int bAlign, int nByte, int *pRc){ 626 u32 iRet = 0; 627 if( *pRc==LSM_OK ){ 628 const static int CHUNK_SIZE = LSM_SHM_CHUNK_SIZE; 629 const static int CHUNK_HDR = LSM_SHM_CHUNK_HDR; 630 u32 iWrite; /* Current write offset */ 631 u32 iEof; /* End of current chunk */ 632 int iChunk; /* Current chunk */ 633 634 assert( nByte <= (CHUNK_SIZE-CHUNK_HDR) ); 635 636 /* Check if there is enough space on the current chunk to fit the 637 ** new allocation. If not, link in a new chunk and put the new 638 ** allocation at the start of it. */ 639 iWrite = pDb->treehdr.iWrite; 640 if( bAlign ){ 641 iWrite = (iWrite + 3) & ~0x0003; 642 assert( (iWrite % 4)==0 ); 643 } 644 645 assert( iWrite ); 646 iChunk = treeOffsetToChunk(iWrite-1); 647 iEof = (iChunk+1) * CHUNK_SIZE; 648 assert( iEof>=iWrite && (iEof-iWrite)<(u32)CHUNK_SIZE ); 649 if( (iWrite+nByte)>iEof ){ 650 ShmChunk *pHdr; /* Header of chunk just finished (iChunk) */ 651 ShmChunk *pFirst; /* Header of chunk treehdr.iFirst */ 652 ShmChunk *pNext; /* Header of new chunk */ 653 int iNext = 0; /* Next chunk */ 654 int rc = LSM_OK; 655 656 pFirst = treeShmChunk(pDb, pDb->treehdr.iFirst); 657 658 assert( shm_sequence_ge(pDb->treehdr.iUsedShmid, pFirst->iShmid) ); 659 assert( (pDb->treehdr.iNextShmid+1-pDb->treehdr.nChunk)==pFirst->iShmid ); 660 661 /* Check if the chunk at the start of the linked list is still in 662 ** use. If not, reuse it. If so, allocate a new chunk by appending 663 ** to the *-shm file. */ 664 if( pDb->treehdr.iUsedShmid!=pFirst->iShmid ){ 665 int bInUse; 666 rc = lsmTreeInUse(pDb, pFirst->iShmid, &bInUse); 667 if( rc!=LSM_OK ){ 668 *pRc = rc; 669 return 0; 670 } 671 if( bInUse==0 ){ 672 iNext = pDb->treehdr.iFirst; 673 pDb->treehdr.iFirst = pFirst->iNext; 674 assert( pDb->treehdr.iFirst ); 675 } 676 } 677 if( iNext==0 ) iNext = pDb->treehdr.nChunk++; 678 679 /* Set the header values for the new chunk */ 680 pNext = treeShmChunkRc(pDb, iNext, &rc); 681 if( pNext ){ 682 pNext->iNext = 0; 683 pNext->iShmid = (pDb->treehdr.iNextShmid++); 684 }else{ 685 *pRc = rc; 686 return 0; 687 } 688 689 /* Set the header values for the chunk just finished */ 690 pHdr = (ShmChunk *)treeShmptr(pDb, iChunk*CHUNK_SIZE); 691 pHdr->iNext = iNext; 692 693 /* Advance to the next chunk */ 694 iWrite = iNext * CHUNK_SIZE + CHUNK_HDR; 695 } 696 697 /* Allocate space at iWrite. */ 698 iRet = iWrite; 699 pDb->treehdr.iWrite = iWrite + nByte; 700 pDb->treehdr.root.nByte += nByte; 701 } 702 return iRet; 703 } 704 705 /* 706 ** Allocate and zero nByte bytes of space within the *-shm file. 707 */ 708 static void *treeShmallocZero(lsm_db *pDb, int nByte, u32 *piPtr, int *pRc){ 709 u32 iPtr; 710 void *p; 711 iPtr = treeShmalloc(pDb, 1, nByte, pRc); 712 p = treeShmptr(pDb, iPtr); 713 if( p ){ 714 assert( *pRc==LSM_OK ); 715 memset(p, 0, nByte); 716 *piPtr = iPtr; 717 } 718 return p; 719 } 720 721 static TreeNode *newTreeNode(lsm_db *pDb, u32 *piPtr, int *pRc){ 722 return treeShmallocZero(pDb, sizeof(TreeNode), piPtr, pRc); 723 } 724 725 static TreeLeaf *newTreeLeaf(lsm_db *pDb, u32 *piPtr, int *pRc){ 726 return treeShmallocZero(pDb, sizeof(TreeLeaf), piPtr, pRc); 727 } 728 729 static TreeKey *newTreeKey( 730 lsm_db *pDb, 731 u32 *piPtr, 732 void *pKey, int nKey, /* Key data */ 733 void *pVal, int nVal, /* Value data (or nVal<0 for delete) */ 734 int *pRc 735 ){ 736 TreeKey *p; 737 u32 iPtr; 738 u32 iEnd; 739 int nRem; 740 u8 *a; 741 int n; 742 743 /* Allocate space for the TreeKey structure itself */ 744 *piPtr = iPtr = treeShmalloc(pDb, 1, sizeof(TreeKey), pRc); 745 p = treeShmptr(pDb, iPtr); 746 if( *pRc ) return 0; 747 p->nKey = nKey; 748 p->nValue = nVal; 749 750 /* Allocate and populate the space required for the key and value. */ 751 n = nRem = nKey; 752 a = (u8 *)pKey; 753 while( a ){ 754 while( nRem>0 ){ 755 u8 *aAlloc; 756 int nAlloc; 757 u32 iWrite; 758 759 iWrite = (pDb->treehdr.iWrite & (LSM_SHM_CHUNK_SIZE-1)); 760 iWrite = LSM_MAX(iWrite, LSM_SHM_CHUNK_HDR); 761 nAlloc = LSM_MIN((LSM_SHM_CHUNK_SIZE-iWrite), (u32)nRem); 762 763 aAlloc = treeShmptr(pDb, treeShmalloc(pDb, 0, nAlloc, pRc)); 764 if( aAlloc==0 ) break; 765 memcpy(aAlloc, &a[n-nRem], nAlloc); 766 nRem -= nAlloc; 767 } 768 a = pVal; 769 n = nRem = nVal; 770 pVal = 0; 771 } 772 773 iEnd = iPtr + sizeof(TreeKey) + nKey + LSM_MAX(0, nVal); 774 if( (iPtr & ~(LSM_SHM_CHUNK_SIZE-1))!=(iEnd & ~(LSM_SHM_CHUNK_SIZE-1)) ){ 775 p->flags = 0; 776 }else{ 777 p->flags = LSM_CONTIGUOUS; 778 } 779 780 if( *pRc ) return 0; 781 #if 0 782 printf("store: %d %s\n", (int)iPtr, (char *)pKey); 783 #endif 784 return p; 785 } 786 787 static TreeNode *copyTreeNode( 788 lsm_db *pDb, 789 TreeNode *pOld, 790 u32 *piNew, 791 int *pRc 792 ){ 793 TreeNode *pNew; 794 795 pNew = newTreeNode(pDb, piNew, pRc); 796 if( pNew ){ 797 memcpy(pNew->aiKeyPtr, pOld->aiKeyPtr, sizeof(pNew->aiKeyPtr)); 798 memcpy(pNew->aiChildPtr, pOld->aiChildPtr, sizeof(pNew->aiChildPtr)); 799 if( pOld->iV2 ) pNew->aiChildPtr[pOld->iV2Child] = pOld->iV2Ptr; 800 } 801 return pNew; 802 } 803 804 static TreeNode *copyTreeLeaf( 805 lsm_db *pDb, 806 TreeLeaf *pOld, 807 u32 *piNew, 808 int *pRc 809 ){ 810 TreeLeaf *pNew; 811 pNew = newTreeLeaf(pDb, piNew, pRc); 812 if( pNew ){ 813 memcpy(pNew, pOld, sizeof(TreeLeaf)); 814 } 815 return (TreeNode *)pNew; 816 } 817 818 /* 819 ** The tree cursor passed as the second argument currently points to an 820 ** internal node (not a leaf). Specifically, to a sub-tree pointer. This 821 ** function replaces the sub-tree that the cursor currently points to 822 ** with sub-tree pNew. 823 ** 824 ** The sub-tree may be replaced either by writing the "v2 data" on the 825 ** internal node, or by allocating a new TreeNode structure and then 826 ** calling this function on the parent of the internal node. 827 */ 828 static int treeUpdatePtr(lsm_db *pDb, TreeCursor *pCsr, u32 iNew){ 829 int rc = LSM_OK; 830 if( pCsr->iNode<0 ){ 831 /* iNew is the new root node */ 832 pDb->treehdr.root.iRoot = iNew; 833 }else{ 834 /* If this node already has version 2 content, allocate a copy and 835 ** update the copy with the new pointer value. Otherwise, store the 836 ** new pointer as v2 data within the current node structure. */ 837 838 TreeNode *p; /* The node to be modified */ 839 int iChildPtr; /* apChild[] entry to modify */ 840 841 p = pCsr->apTreeNode[pCsr->iNode]; 842 iChildPtr = pCsr->aiCell[pCsr->iNode]; 843 844 if( p->iV2 ){ 845 /* The "allocate new TreeNode" option */ 846 u32 iCopy; 847 TreeNode *pCopy; 848 pCopy = copyTreeNode(pDb, p, &iCopy, &rc); 849 if( pCopy ){ 850 assert( rc==LSM_OK ); 851 pCopy->aiChildPtr[iChildPtr] = iNew; 852 pCsr->iNode--; 853 rc = treeUpdatePtr(pDb, pCsr, iCopy); 854 } 855 }else{ 856 /* The "v2 data" option */ 857 u32 iPtr; 858 assert( pDb->treehdr.root.iTransId>0 ); 859 860 if( pCsr->iNode ){ 861 iPtr = getChildPtr( 862 pCsr->apTreeNode[pCsr->iNode-1], 863 pDb->treehdr.root.iTransId, pCsr->aiCell[pCsr->iNode-1] 864 ); 865 }else{ 866 iPtr = pDb->treehdr.root.iRoot; 867 } 868 rc = intArrayAppend(pDb->pEnv, &pDb->rollback, iPtr); 869 870 if( rc==LSM_OK ){ 871 p->iV2 = pDb->treehdr.root.iTransId; 872 p->iV2Child = (u8)iChildPtr; 873 p->iV2Ptr = iNew; 874 } 875 } 876 } 877 878 return rc; 879 } 880 881 /* 882 ** Cursor pCsr points at a node that is part of pTree. This function 883 ** inserts a new key and optionally child node pointer into that node. 884 ** 885 ** The position into which the new key and pointer are inserted is 886 ** determined by the iSlot parameter. The new key will be inserted to 887 ** the left of the key currently stored in apKey[iSlot]. Or, if iSlot is 888 ** greater than the index of the rightmost key in the node. 889 ** 890 ** Pointer pLeftPtr points to a child tree that contains keys that are 891 ** smaller than pTreeKey. 892 */ 893 static int treeInsert( 894 lsm_db *pDb, /* Database handle */ 895 TreeCursor *pCsr, /* Cursor indicating path to insert at */ 896 u32 iLeftPtr, /* Left child pointer */ 897 u32 iTreeKey, /* Location of key to insert */ 898 u32 iRightPtr, /* Right child pointer */ 899 int iSlot /* Position to insert key into */ 900 ){ 901 int rc = LSM_OK; 902 TreeNode *pNode = pCsr->apTreeNode[pCsr->iNode]; 903 904 /* Check if the node is currently full. If so, split pNode in two and 905 ** call this function recursively to add a key to the parent. Otherwise, 906 ** insert the new key directly into pNode. */ 907 assert( pNode->aiKeyPtr[1] ); 908 if( pNode->aiKeyPtr[0] && pNode->aiKeyPtr[2] ){ 909 u32 iLeft; TreeNode *pLeft; /* New left-hand sibling node */ 910 u32 iRight; TreeNode *pRight; /* New right-hand sibling node */ 911 912 pLeft = newTreeNode(pDb, &iLeft, &rc); 913 pRight = newTreeNode(pDb, &iRight, &rc); 914 if( rc ) return rc; 915 916 pLeft->aiChildPtr[1] = getChildPtr(pNode, WORKING_VERSION, 0); 917 pLeft->aiKeyPtr[1] = pNode->aiKeyPtr[0]; 918 pLeft->aiChildPtr[2] = getChildPtr(pNode, WORKING_VERSION, 1); 919 920 pRight->aiChildPtr[1] = getChildPtr(pNode, WORKING_VERSION, 2); 921 pRight->aiKeyPtr[1] = pNode->aiKeyPtr[2]; 922 pRight->aiChildPtr[2] = getChildPtr(pNode, WORKING_VERSION, 3); 923 924 if( pCsr->iNode==0 ){ 925 /* pNode is the root of the tree. Grow the tree by one level. */ 926 u32 iRoot; TreeNode *pRoot; /* New root node */ 927 928 pRoot = newTreeNode(pDb, &iRoot, &rc); 929 pRoot->aiKeyPtr[1] = pNode->aiKeyPtr[1]; 930 pRoot->aiChildPtr[1] = iLeft; 931 pRoot->aiChildPtr[2] = iRight; 932 933 pDb->treehdr.root.iRoot = iRoot; 934 pDb->treehdr.root.nHeight++; 935 }else{ 936 937 pCsr->iNode--; 938 rc = treeInsert(pDb, pCsr, 939 iLeft, pNode->aiKeyPtr[1], iRight, pCsr->aiCell[pCsr->iNode] 940 ); 941 } 942 943 assert( pLeft->iV2==0 ); 944 assert( pRight->iV2==0 ); 945 switch( iSlot ){ 946 case 0: 947 pLeft->aiKeyPtr[0] = iTreeKey; 948 pLeft->aiChildPtr[0] = iLeftPtr; 949 if( iRightPtr ) pLeft->aiChildPtr[1] = iRightPtr; 950 break; 951 case 1: 952 pLeft->aiChildPtr[3] = (iRightPtr ? iRightPtr : pLeft->aiChildPtr[2]); 953 pLeft->aiKeyPtr[2] = iTreeKey; 954 pLeft->aiChildPtr[2] = iLeftPtr; 955 break; 956 case 2: 957 pRight->aiKeyPtr[0] = iTreeKey; 958 pRight->aiChildPtr[0] = iLeftPtr; 959 if( iRightPtr ) pRight->aiChildPtr[1] = iRightPtr; 960 break; 961 case 3: 962 pRight->aiChildPtr[3] = (iRightPtr ? iRightPtr : pRight->aiChildPtr[2]); 963 pRight->aiKeyPtr[2] = iTreeKey; 964 pRight->aiChildPtr[2] = iLeftPtr; 965 break; 966 } 967 968 }else{ 969 TreeNode *pNew; 970 u32 *piKey; 971 u32 *piChild; 972 u32 iStore = 0; 973 u32 iNew = 0; 974 int i; 975 976 /* Allocate a new version of node pNode. */ 977 pNew = newTreeNode(pDb, &iNew, &rc); 978 if( rc ) return rc; 979 980 piKey = pNew->aiKeyPtr; 981 piChild = pNew->aiChildPtr; 982 983 for(i=0; i<iSlot; i++){ 984 if( pNode->aiKeyPtr[i] ){ 985 *(piKey++) = pNode->aiKeyPtr[i]; 986 *(piChild++) = getChildPtr(pNode, WORKING_VERSION, i); 987 } 988 } 989 990 *piKey++ = iTreeKey; 991 *piChild++ = iLeftPtr; 992 993 iStore = iRightPtr; 994 for(i=iSlot; i<3; i++){ 995 if( pNode->aiKeyPtr[i] ){ 996 *(piKey++) = pNode->aiKeyPtr[i]; 997 *(piChild++) = iStore ? iStore : getChildPtr(pNode, WORKING_VERSION, i); 998 iStore = 0; 999 } 1000 } 1001 1002 if( iStore ){ 1003 *piChild = iStore; 1004 }else{ 1005 *piChild = getChildPtr(pNode, WORKING_VERSION, 1006 (pNode->aiKeyPtr[2] ? 3 : 2) 1007 ); 1008 } 1009 pCsr->iNode--; 1010 rc = treeUpdatePtr(pDb, pCsr, iNew); 1011 } 1012 1013 return rc; 1014 } 1015 1016 static int treeInsertLeaf( 1017 lsm_db *pDb, /* Database handle */ 1018 TreeCursor *pCsr, /* Cursor structure */ 1019 u32 iTreeKey, /* Key pointer to insert */ 1020 int iSlot /* Insert key to the left of this */ 1021 ){ 1022 int rc = LSM_OK; /* Return code */ 1023 TreeNode *pLeaf = pCsr->apTreeNode[pCsr->iNode]; 1024 TreeLeaf *pNew; 1025 u32 iNew; 1026 1027 assert( iSlot>=0 && iSlot<=4 ); 1028 assert( pCsr->iNode>0 ); 1029 assert( pLeaf->aiKeyPtr[1] ); 1030 1031 pCsr->iNode--; 1032 1033 pNew = newTreeLeaf(pDb, &iNew, &rc); 1034 if( pNew ){ 1035 if( pLeaf->aiKeyPtr[0] && pLeaf->aiKeyPtr[2] ){ 1036 /* The leaf is full. Split it in two. */ 1037 TreeLeaf *pRight; 1038 u32 iRight; 1039 pRight = newTreeLeaf(pDb, &iRight, &rc); 1040 if( pRight ){ 1041 assert( rc==LSM_OK ); 1042 pNew->aiKeyPtr[1] = pLeaf->aiKeyPtr[0]; 1043 pRight->aiKeyPtr[1] = pLeaf->aiKeyPtr[2]; 1044 switch( iSlot ){ 1045 case 0: pNew->aiKeyPtr[0] = iTreeKey; break; 1046 case 1: pNew->aiKeyPtr[2] = iTreeKey; break; 1047 case 2: pRight->aiKeyPtr[0] = iTreeKey; break; 1048 case 3: pRight->aiKeyPtr[2] = iTreeKey; break; 1049 } 1050 1051 rc = treeInsert(pDb, pCsr, iNew, pLeaf->aiKeyPtr[1], iRight, 1052 pCsr->aiCell[pCsr->iNode] 1053 ); 1054 } 1055 }else{ 1056 int iOut = 0; 1057 int i; 1058 for(i=0; i<4; i++){ 1059 if( i==iSlot ) pNew->aiKeyPtr[iOut++] = iTreeKey; 1060 if( i<3 && pLeaf->aiKeyPtr[i] ){ 1061 pNew->aiKeyPtr[iOut++] = pLeaf->aiKeyPtr[i]; 1062 } 1063 } 1064 rc = treeUpdatePtr(pDb, pCsr, iNew); 1065 } 1066 } 1067 1068 return rc; 1069 } 1070 1071 void lsmTreeMakeOld(lsm_db *pDb){ 1072 1073 /* A write transaction must be open. Otherwise the code below that 1074 ** assumes (pDb->pClient->iLogOff) is current may malfunction. 1075 ** 1076 ** Update: currently this assert fails due to lsm_flush(), which does 1077 ** not set nTransOpen. 1078 */ 1079 assert( /* pDb->nTransOpen>0 && */ pDb->iReader>=0 ); 1080 1081 if( pDb->treehdr.iOldShmid==0 ){ 1082 pDb->treehdr.iOldLog = (pDb->treehdr.log.aRegion[2].iEnd << 1); 1083 pDb->treehdr.iOldLog |= (~(pDb->pClient->iLogOff) & (i64)0x0001); 1084 1085 pDb->treehdr.oldcksum0 = pDb->treehdr.log.cksum0; 1086 pDb->treehdr.oldcksum1 = pDb->treehdr.log.cksum1; 1087 pDb->treehdr.iOldShmid = pDb->treehdr.iNextShmid-1; 1088 memcpy(&pDb->treehdr.oldroot, &pDb->treehdr.root, sizeof(TreeRoot)); 1089 1090 pDb->treehdr.root.iTransId = 1; 1091 pDb->treehdr.root.iRoot = 0; 1092 pDb->treehdr.root.nHeight = 0; 1093 pDb->treehdr.root.nByte = 0; 1094 } 1095 } 1096 1097 void lsmTreeDiscardOld(lsm_db *pDb){ 1098 assert( lsmShmAssertLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_EXCL) 1099 || lsmShmAssertLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_EXCL) 1100 ); 1101 pDb->treehdr.iUsedShmid = pDb->treehdr.iOldShmid; 1102 pDb->treehdr.iOldShmid = 0; 1103 } 1104 1105 int lsmTreeHasOld(lsm_db *pDb){ 1106 return pDb->treehdr.iOldShmid!=0; 1107 } 1108 1109 /* 1110 ** This function is called during recovery to initialize the 1111 ** tree header. Only the database connections private copy of the tree-header 1112 ** is initialized here - it will be copied into shared memory if log file 1113 ** recovery is successful. 1114 */ 1115 int lsmTreeInit(lsm_db *pDb){ 1116 ShmChunk *pOne; 1117 int rc = LSM_OK; 1118 1119 memset(&pDb->treehdr, 0, sizeof(TreeHeader)); 1120 pDb->treehdr.root.iTransId = 1; 1121 pDb->treehdr.iFirst = 1; 1122 pDb->treehdr.nChunk = 2; 1123 pDb->treehdr.iWrite = LSM_SHM_CHUNK_SIZE + LSM_SHM_CHUNK_HDR; 1124 pDb->treehdr.iNextShmid = 2; 1125 pDb->treehdr.iUsedShmid = 1; 1126 1127 pOne = treeShmChunkRc(pDb, 1, &rc); 1128 if( pOne ){ 1129 pOne->iNext = 0; 1130 pOne->iShmid = 1; 1131 } 1132 return rc; 1133 } 1134 1135 static void treeHeaderChecksum( 1136 TreeHeader *pHdr, 1137 u32 *aCksum 1138 ){ 1139 u32 cksum1 = 0x12345678; 1140 u32 cksum2 = 0x9ABCDEF0; 1141 u32 *a = (u32 *)pHdr; 1142 int i; 1143 1144 assert( (offsetof(TreeHeader, aCksum) + sizeof(u32)*2)==sizeof(TreeHeader) ); 1145 assert( (sizeof(TreeHeader) % (sizeof(u32)*2))==0 ); 1146 1147 for(i=0; i<(offsetof(TreeHeader, aCksum) / sizeof(u32)); i+=2){ 1148 cksum1 += a[i]; 1149 cksum2 += (cksum1 + a[i+1]); 1150 } 1151 aCksum[0] = cksum1; 1152 aCksum[1] = cksum2; 1153 } 1154 1155 /* 1156 ** Return true if the checksum stored in TreeHeader object *pHdr is 1157 ** consistent with the contents of its other fields. 1158 */ 1159 static int treeHeaderChecksumOk(TreeHeader *pHdr){ 1160 u32 aCksum[2]; 1161 treeHeaderChecksum(pHdr, aCksum); 1162 return (0==memcmp(aCksum, pHdr->aCksum, sizeof(aCksum))); 1163 } 1164 1165 /* 1166 ** This type is used by functions lsmTreeRepair() and treeSortByShmid() to 1167 ** make relinking the linked list of shared-memory chunks easier. 1168 */ 1169 typedef struct ShmChunkLoc ShmChunkLoc; 1170 struct ShmChunkLoc { 1171 ShmChunk *pShm; 1172 u32 iLoc; 1173 }; 1174 1175 /* 1176 ** This function checks that the linked list of shared memory chunks 1177 ** that starts at chunk db->treehdr.iFirst: 1178 ** 1179 ** 1) Includes all chunks in the shared-memory region, and 1180 ** 2) Links them together in order of ascending shm-id. 1181 ** 1182 ** If no error occurs and the conditions above are met, LSM_OK is returned. 1183 ** 1184 ** If either of the conditions are untrue, LSM_CORRUPT is returned. Or, if 1185 ** an error is encountered before the checks are completed, another LSM error 1186 ** code (i.e. LSM_IOERR or LSM_NOMEM) may be returned. 1187 */ 1188 static int treeCheckLinkedList(lsm_db *db){ 1189 int rc = LSM_OK; 1190 int nVisit = 0; 1191 ShmChunk *p; 1192 1193 p = treeShmChunkRc(db, db->treehdr.iFirst, &rc); 1194 while( rc==LSM_OK && p ){ 1195 if( p->iNext ){ 1196 if( p->iNext>=db->treehdr.nChunk ){ 1197 rc = LSM_CORRUPT_BKPT; 1198 }else{ 1199 ShmChunk *pNext = treeShmChunkRc(db, p->iNext, &rc); 1200 if( rc==LSM_OK ){ 1201 if( pNext->iShmid!=p->iShmid+1 ){ 1202 rc = LSM_CORRUPT_BKPT; 1203 } 1204 p = pNext; 1205 } 1206 } 1207 }else{ 1208 p = 0; 1209 } 1210 nVisit++; 1211 } 1212 1213 if( rc==LSM_OK && (u32)nVisit!=db->treehdr.nChunk-1 ){ 1214 rc = LSM_CORRUPT_BKPT; 1215 } 1216 return rc; 1217 } 1218 1219 /* 1220 ** Iterate through the current in-memory tree. If there are any v2-pointers 1221 ** with transaction ids larger than db->treehdr.iTransId, zero them. 1222 */ 1223 static int treeRepairPtrs(lsm_db *db){ 1224 int rc = LSM_OK; 1225 1226 if( db->treehdr.root.nHeight>1 ){ 1227 TreeCursor csr; /* Cursor used to iterate through tree */ 1228 u32 iTransId = db->treehdr.root.iTransId; 1229 1230 /* Initialize the cursor structure. Also decrement the nHeight variable 1231 ** in the tree-header. This will prevent the cursor from visiting any 1232 ** leaf nodes. */ 1233 db->treehdr.root.nHeight--; 1234 treeCursorInit(db, 0, &csr); 1235 1236 rc = lsmTreeCursorEnd(&csr, 0); 1237 while( rc==LSM_OK && lsmTreeCursorValid(&csr) ){ 1238 TreeNode *pNode = csr.apTreeNode[csr.iNode]; 1239 if( pNode->iV2>iTransId ){ 1240 pNode->iV2Child = 0; 1241 pNode->iV2Ptr = 0; 1242 pNode->iV2 = 0; 1243 } 1244 rc = lsmTreeCursorNext(&csr); 1245 } 1246 tblobFree(csr.pDb, &csr.blob); 1247 1248 db->treehdr.root.nHeight++; 1249 } 1250 1251 return rc; 1252 } 1253 1254 static int treeRepairList(lsm_db *db){ 1255 int rc = LSM_OK; 1256 int i; 1257 ShmChunk *p; 1258 ShmChunk *pMin = 0; 1259 u32 iMin = 0; 1260 1261 /* Iterate through all shm chunks. Find the smallest shm-id present in 1262 ** the shared-memory region. */ 1263 for(i=1; rc==LSM_OK && (u32)i<db->treehdr.nChunk; i++){ 1264 p = treeShmChunkRc(db, i, &rc); 1265 if( p && (pMin==0 || shm_sequence_ge(pMin->iShmid, p->iShmid)) ){ 1266 pMin = p; 1267 iMin = i; 1268 } 1269 } 1270 1271 /* Fix the shm-id values on any chunks with a shm-id greater than or 1272 ** equal to treehdr.iNextShmid. Then do a merge-sort of all chunks to 1273 ** fix the ShmChunk.iNext pointers. 1274 */ 1275 if( rc==LSM_OK ){ 1276 int nSort; 1277 int nByte; 1278 u32 iPrevShmid; 1279 ShmChunkLoc *aSort; 1280 1281 /* Allocate space for a merge sort. */ 1282 nSort = 1; 1283 while( (u32)nSort < (db->treehdr.nChunk-1) ) nSort = nSort * 2; 1284 nByte = sizeof(ShmChunkLoc) * nSort * 2; 1285 aSort = lsmMallocZeroRc(db->pEnv, nByte, &rc); 1286 iPrevShmid = pMin->iShmid; 1287 1288 /* Fix all shm-ids, if required. */ 1289 if( rc==LSM_OK ){ 1290 iPrevShmid = pMin->iShmid-1; 1291 for(i=1; (u32)i<db->treehdr.nChunk; i++){ 1292 p = treeShmChunk(db, i); 1293 aSort[i-1].pShm = p; 1294 aSort[i-1].iLoc = i; 1295 if( (u32)i!=db->treehdr.iFirst ){ 1296 if( shm_sequence_ge(p->iShmid, db->treehdr.iNextShmid) ){ 1297 p->iShmid = iPrevShmid--; 1298 } 1299 } 1300 } 1301 if( iMin!=db->treehdr.iFirst ){ 1302 p = treeShmChunk(db, db->treehdr.iFirst); 1303 p->iShmid = iPrevShmid; 1304 } 1305 } 1306 1307 if( rc==LSM_OK ){ 1308 ShmChunkLoc *aSpace = &aSort[nSort]; 1309 for(i=0; i<nSort; i++){ 1310 if( aSort[i].pShm ){ 1311 assert( shm_sequence_ge(aSort[i].pShm->iShmid, iPrevShmid) ); 1312 assert( aSpace[aSort[i].pShm->iShmid - iPrevShmid].pShm==0 ); 1313 aSpace[aSort[i].pShm->iShmid - iPrevShmid] = aSort[i]; 1314 } 1315 } 1316 1317 if( aSpace[nSort-1].pShm ) aSpace[nSort-1].pShm->iNext = 0; 1318 for(i=0; i<nSort-1; i++){ 1319 if( aSpace[i].pShm ){ 1320 aSpace[i].pShm->iNext = aSpace[i+1].iLoc; 1321 } 1322 } 1323 1324 rc = treeCheckLinkedList(db); 1325 lsmFree(db->pEnv, aSort); 1326 } 1327 } 1328 1329 return rc; 1330 } 1331 1332 /* 1333 ** This function is called as part of opening a write-transaction if the 1334 ** writer-flag is already set - indicating that the previous writer 1335 ** failed before ending its transaction. 1336 */ 1337 int lsmTreeRepair(lsm_db *db){ 1338 int rc = LSM_OK; 1339 TreeHeader hdr; 1340 ShmHeader *pHdr = db->pShmhdr; 1341 1342 /* Ensure that the two tree-headers are consistent. Copy one over the other 1343 ** if necessary. Prefer the data from a tree-header for which the checksum 1344 ** computes. Or, if they both compute, prefer tree-header-1. */ 1345 if( memcmp(&pHdr->hdr1, &pHdr->hdr2, sizeof(TreeHeader)) ){ 1346 if( treeHeaderChecksumOk(&pHdr->hdr1) ){ 1347 memcpy(&pHdr->hdr2, &pHdr->hdr1, sizeof(TreeHeader)); 1348 }else{ 1349 memcpy(&pHdr->hdr1, &pHdr->hdr2, sizeof(TreeHeader)); 1350 } 1351 } 1352 1353 /* Save the connections current copy of the tree-header. It will be 1354 ** restored before returning. */ 1355 memcpy(&hdr, &db->treehdr, sizeof(TreeHeader)); 1356 1357 /* Walk the tree. Zero any v2 pointers with a transaction-id greater than 1358 ** the transaction-id currently in the tree-headers. */ 1359 rc = treeRepairPtrs(db); 1360 1361 /* Repair the linked list of shared-memory chunks. */ 1362 if( rc==LSM_OK ){ 1363 rc = treeRepairList(db); 1364 } 1365 1366 memcpy(&db->treehdr, &hdr, sizeof(TreeHeader)); 1367 return rc; 1368 } 1369 1370 static void treeOverwriteKey(lsm_db *db, TreeCursor *pCsr, u32 iKey, int *pRc){ 1371 if( *pRc==LSM_OK ){ 1372 TreeRoot *p = &db->treehdr.root; 1373 TreeNode *pNew; 1374 u32 iNew; 1375 TreeNode *pNode = pCsr->apTreeNode[pCsr->iNode]; 1376 int iCell = pCsr->aiCell[pCsr->iNode]; 1377 1378 /* Create a copy of this node */ 1379 if( (pCsr->iNode>0 && (u32)pCsr->iNode==(p->nHeight-1)) ){ 1380 pNew = copyTreeLeaf(db, (TreeLeaf *)pNode, &iNew, pRc); 1381 }else{ 1382 pNew = copyTreeNode(db, pNode, &iNew, pRc); 1383 } 1384 1385 if( pNew ){ 1386 /* Modify the value in the new version */ 1387 pNew->aiKeyPtr[iCell] = iKey; 1388 1389 /* Change the pointer in the parent (if any) to point at the new 1390 ** TreeNode */ 1391 pCsr->iNode--; 1392 treeUpdatePtr(db, pCsr, iNew); 1393 } 1394 } 1395 } 1396 1397 static int treeNextIsEndDelete(lsm_db *db, TreeCursor *pCsr){ 1398 int iNode = pCsr->iNode; 1399 int iCell = pCsr->aiCell[iNode]+1; 1400 1401 /* Cursor currently points to a leaf node. */ 1402 assert( (u32)pCsr->iNode==(db->treehdr.root.nHeight-1) ); 1403 1404 while( iNode>=0 ){ 1405 TreeNode *pNode = pCsr->apTreeNode[iNode]; 1406 if( iCell<3 && pNode->aiKeyPtr[iCell] ){ 1407 int rc = LSM_OK; 1408 TreeKey *pKey = treeShmptr(db, pNode->aiKeyPtr[iCell]); 1409 assert( rc==LSM_OK ); 1410 return ((pKey->flags & LSM_END_DELETE) ? 1 : 0); 1411 } 1412 iNode--; 1413 iCell = pCsr->aiCell[iNode]; 1414 } 1415 1416 return 0; 1417 } 1418 1419 static int treePrevIsStartDelete(lsm_db *db, TreeCursor *pCsr){ 1420 int iNode = pCsr->iNode; 1421 1422 /* Cursor currently points to a leaf node. */ 1423 assert( (u32)pCsr->iNode==(db->treehdr.root.nHeight-1) ); 1424 1425 while( iNode>=0 ){ 1426 TreeNode *pNode = pCsr->apTreeNode[iNode]; 1427 int iCell = pCsr->aiCell[iNode]-1; 1428 if( iCell>=0 && pNode->aiKeyPtr[iCell] ){ 1429 int rc = LSM_OK; 1430 TreeKey *pKey = treeShmptr(db, pNode->aiKeyPtr[iCell]); 1431 assert( rc==LSM_OK ); 1432 return ((pKey->flags & LSM_START_DELETE) ? 1 : 0); 1433 } 1434 iNode--; 1435 } 1436 1437 return 0; 1438 } 1439 1440 1441 static int treeInsertEntry( 1442 lsm_db *pDb, /* Database handle */ 1443 int flags, /* Flags associated with entry */ 1444 void *pKey, /* Pointer to key data */ 1445 int nKey, /* Size of key data in bytes */ 1446 void *pVal, /* Pointer to value data (or NULL) */ 1447 int nVal /* Bytes in value data (or -ve for delete) */ 1448 ){ 1449 int rc = LSM_OK; /* Return Code */ 1450 TreeKey *pTreeKey; /* New key-value being inserted */ 1451 u32 iTreeKey; 1452 TreeRoot *p = &pDb->treehdr.root; 1453 TreeCursor csr; /* Cursor to seek to pKey/nKey */ 1454 int res = 0; /* Result of seek operation on csr */ 1455 1456 assert( nVal>=0 || pVal==0 ); 1457 assert_tree_looks_ok(LSM_OK, pTree); 1458 assert( flags==LSM_INSERT || flags==LSM_POINT_DELETE 1459 || flags==LSM_START_DELETE || flags==LSM_END_DELETE 1460 ); 1461 assert( (flags & LSM_CONTIGUOUS)==0 ); 1462 #if 0 1463 dump_tree_contents(pDb, "before"); 1464 #endif 1465 1466 if( p->iRoot ){ 1467 TreeKey *pRes; /* Key at end of seek operation */ 1468 treeCursorInit(pDb, 0, &csr); 1469 1470 /* Seek to the leaf (or internal node) that the new key belongs on */ 1471 rc = lsmTreeCursorSeek(&csr, pKey, nKey, &res); 1472 pRes = csrGetKey(&csr, &csr.blob, &rc); 1473 if( rc!=LSM_OK ) return rc; 1474 assert( pRes ); 1475 1476 if( flags==LSM_START_DELETE ){ 1477 /* When inserting a start-delete-range entry, if the key that 1478 ** occurs immediately before the new entry is already a START_DELETE, 1479 ** then the new entry is not required. */ 1480 if( (res<=0 && (pRes->flags & LSM_START_DELETE)) 1481 || (res>0 && treePrevIsStartDelete(pDb, &csr)) 1482 ){ 1483 goto insert_entry_out; 1484 } 1485 }else if( flags==LSM_END_DELETE ){ 1486 /* When inserting an start-delete-range entry, if the key that 1487 ** occurs immediately after the new entry is already an END_DELETE, 1488 ** then the new entry is not required. */ 1489 if( (res<0 && treeNextIsEndDelete(pDb, &csr)) 1490 || (res>=0 && (pRes->flags & LSM_END_DELETE)) 1491 ){ 1492 goto insert_entry_out; 1493 } 1494 } 1495 1496 if( res==0 && (flags & (LSM_END_DELETE|LSM_START_DELETE)) ){ 1497 if( pRes->flags & LSM_INSERT ){ 1498 nVal = pRes->nValue; 1499 pVal = TKV_VAL(pRes); 1500 } 1501 flags = flags | pRes->flags; 1502 } 1503 1504 if( flags & (LSM_INSERT|LSM_POINT_DELETE) ){ 1505 if( (res<0 && (pRes->flags & LSM_START_DELETE)) 1506 || (res>0 && (pRes->flags & LSM_END_DELETE)) 1507 ){ 1508 flags = flags | (LSM_END_DELETE|LSM_START_DELETE); 1509 }else if( res==0 ){ 1510 flags = flags | (pRes->flags & (LSM_END_DELETE|LSM_START_DELETE)); 1511 } 1512 } 1513 }else{ 1514 memset(&csr, 0, sizeof(TreeCursor)); 1515 } 1516 1517 /* Allocate and populate a new key-value pair structure */ 1518 pTreeKey = newTreeKey(pDb, &iTreeKey, pKey, nKey, pVal, nVal, &rc); 1519 if( rc!=LSM_OK ) return rc; 1520 assert( pTreeKey->flags==0 || pTreeKey->flags==LSM_CONTIGUOUS ); 1521 pTreeKey->flags |= flags; 1522 1523 if( p->iRoot==0 ){ 1524 /* The tree is completely empty. Add a new root node and install 1525 ** (pKey/nKey) as the middle entry. Even though it is a leaf at the 1526 ** moment, use newTreeNode() to allocate the node (i.e. allocate enough 1527 ** space for the fields used by interior nodes). This is because the 1528 ** treeInsert() routine may convert this node to an interior node. */ 1529 TreeNode *pRoot = newTreeNode(pDb, &p->iRoot, &rc); 1530 if( rc==LSM_OK ){ 1531 assert( p->nHeight==0 ); 1532 pRoot->aiKeyPtr[1] = iTreeKey; 1533 p->nHeight = 1; 1534 } 1535 }else{ 1536 if( res==0 ){ 1537 /* The search found a match within the tree. */ 1538 treeOverwriteKey(pDb, &csr, iTreeKey, &rc); 1539 }else{ 1540 /* The cursor now points to the leaf node into which the new entry should 1541 ** be inserted. There may or may not be a free slot within the leaf for 1542 ** the new key-value pair. 1543 ** 1544 ** iSlot is set to the index of the key within pLeaf that the new key 1545 ** should be inserted to the left of (or to a value 1 greater than the 1546 ** index of the rightmost key if the new key is larger than all keys 1547 ** currently stored in the node). 1548 */ 1549 int iSlot = csr.aiCell[csr.iNode] + (res<0); 1550 if( csr.iNode==0 ){ 1551 rc = treeInsert(pDb, &csr, 0, iTreeKey, 0, iSlot); 1552 }else{ 1553 rc = treeInsertLeaf(pDb, &csr, iTreeKey, iSlot); 1554 } 1555 } 1556 } 1557 1558 #if 0 1559 dump_tree_contents(pDb, "after"); 1560 #endif 1561 insert_entry_out: 1562 tblobFree(pDb, &csr.blob); 1563 assert_tree_looks_ok(rc, pTree); 1564 return rc; 1565 } 1566 1567 /* 1568 ** Insert a new entry into the in-memory tree. 1569 ** 1570 ** If the value of the 5th parameter, nVal, is negative, then a delete-marker 1571 ** is inserted into the tree. In this case the value pointer, pVal, must be 1572 ** NULL. 1573 */ 1574 int lsmTreeInsert( 1575 lsm_db *pDb, /* Database handle */ 1576 void *pKey, /* Pointer to key data */ 1577 int nKey, /* Size of key data in bytes */ 1578 void *pVal, /* Pointer to value data (or NULL) */ 1579 int nVal /* Bytes in value data (or -ve for delete) */ 1580 ){ 1581 int flags; 1582 if( nVal<0 ){ 1583 flags = LSM_POINT_DELETE; 1584 }else{ 1585 flags = LSM_INSERT; 1586 } 1587 1588 return treeInsertEntry(pDb, flags, pKey, nKey, pVal, nVal); 1589 } 1590 1591 static int treeDeleteEntry(lsm_db *db, TreeCursor *pCsr, u32 iNewptr){ 1592 TreeRoot *p = &db->treehdr.root; 1593 TreeNode *pNode = pCsr->apTreeNode[pCsr->iNode]; 1594 int iSlot = pCsr->aiCell[pCsr->iNode]; 1595 int bLeaf; 1596 int rc = LSM_OK; 1597 1598 assert( pNode->aiKeyPtr[1] ); 1599 assert( pNode->aiKeyPtr[iSlot] ); 1600 assert( iSlot==0 || iSlot==1 || iSlot==2 ); 1601 assert( ((u32)pCsr->iNode==(db->treehdr.root.nHeight-1))==(iNewptr==0) ); 1602 1603 bLeaf = ((u32)pCsr->iNode==(p->nHeight-1) && p->nHeight>1); 1604 1605 if( pNode->aiKeyPtr[0] || pNode->aiKeyPtr[2] ){ 1606 /* There are currently at least 2 keys on this node. So just create 1607 ** a new copy of the node with one of the keys removed. If the node 1608 ** happens to be the root node of the tree, allocate an entire 1609 ** TreeNode structure instead of just a TreeLeaf. */ 1610 TreeNode *pNew; 1611 u32 iNew; 1612 1613 if( bLeaf ){ 1614 pNew = (TreeNode *)newTreeLeaf(db, &iNew, &rc); 1615 }else{ 1616 pNew = newTreeNode(db, &iNew, &rc); 1617 } 1618 if( pNew ){ 1619 int i; 1620 int iOut = 1; 1621 for(i=0; i<4; i++){ 1622 if( i==iSlot ){ 1623 i++; 1624 if( bLeaf==0 ) pNew->aiChildPtr[iOut] = iNewptr; 1625 if( i<3 ) pNew->aiKeyPtr[iOut] = pNode->aiKeyPtr[i]; 1626 iOut++; 1627 }else if( bLeaf || p->nHeight==1 ){ 1628 if( i<3 && pNode->aiKeyPtr[i] ){ 1629 pNew->aiKeyPtr[iOut++] = pNode->aiKeyPtr[i]; 1630 } 1631 }else{ 1632 if( getChildPtr(pNode, WORKING_VERSION, i) ){ 1633 pNew->aiChildPtr[iOut] = getChildPtr(pNode, WORKING_VERSION, i); 1634 if( i<3 ) pNew->aiKeyPtr[iOut] = pNode->aiKeyPtr[i]; 1635 iOut++; 1636 } 1637 } 1638 } 1639 assert( iOut<=4 ); 1640 assert( bLeaf || pNew->aiChildPtr[0]==0 ); 1641 pCsr->iNode--; 1642 rc = treeUpdatePtr(db, pCsr, iNew); 1643 } 1644 1645 }else if( pCsr->iNode==0 ){ 1646 /* Removing the only key in the root node. iNewptr is the new root. */ 1647 assert( iSlot==1 ); 1648 db->treehdr.root.iRoot = iNewptr; 1649 db->treehdr.root.nHeight--; 1650 1651 }else{ 1652 /* There is only one key on this node and the node is not the root 1653 ** node. Find a peer for this node. Then redistribute the contents of 1654 ** the peer and the parent cell between the parent and either one or 1655 ** two new nodes. */ 1656 TreeNode *pParent; /* Parent tree node */ 1657 int iPSlot; 1658 u32 iPeer; /* Pointer to peer leaf node */ 1659 int iDir; 1660 TreeNode *pPeer; /* The peer leaf node */ 1661 TreeNode *pNew1; u32 iNew1; /* First new leaf node */ 1662 1663 assert( iSlot==1 ); 1664 1665 pParent = pCsr->apTreeNode[pCsr->iNode-1]; 1666 iPSlot = pCsr->aiCell[pCsr->iNode-1]; 1667 1668 if( iPSlot>0 && getChildPtr(pParent, WORKING_VERSION, iPSlot-1) ){ 1669 iDir = -1; 1670 }else{ 1671 iDir = +1; 1672 } 1673 iPeer = getChildPtr(pParent, WORKING_VERSION, iPSlot+iDir); 1674 pPeer = (TreeNode *)treeShmptr(db, iPeer); 1675 assertIsWorkingChild(db, pNode, pParent, iPSlot); 1676 1677 /* Allocate the first new leaf node. This is always required. */ 1678 if( bLeaf ){ 1679 pNew1 = (TreeNode *)newTreeLeaf(db, &iNew1, &rc); 1680 }else{ 1681 pNew1 = (TreeNode *)newTreeNode(db, &iNew1, &rc); 1682 } 1683 1684 if( pPeer->aiKeyPtr[0] && pPeer->aiKeyPtr[2] ){ 1685 /* Peer node is completely full. This means that two new leaf nodes 1686 ** and a new parent node are required. */ 1687 1688 TreeNode *pNew2; u32 iNew2; /* Second new leaf node */ 1689 TreeNode *pNewP; u32 iNewP; /* New parent node */ 1690 1691 if( bLeaf ){ 1692 pNew2 = (TreeNode *)newTreeLeaf(db, &iNew2, &rc); 1693 }else{ 1694 pNew2 = (TreeNode *)newTreeNode(db, &iNew2, &rc); 1695 } 1696 pNewP = copyTreeNode(db, pParent, &iNewP, &rc); 1697 1698 if( iDir==-1 ){ 1699 pNew1->aiKeyPtr[1] = pPeer->aiKeyPtr[0]; 1700 if( bLeaf==0 ){ 1701 pNew1->aiChildPtr[1] = getChildPtr(pPeer, WORKING_VERSION, 0); 1702 pNew1->aiChildPtr[2] = getChildPtr(pPeer, WORKING_VERSION, 1); 1703 } 1704 1705 pNewP->aiChildPtr[iPSlot-1] = iNew1; 1706 pNewP->aiKeyPtr[iPSlot-1] = pPeer->aiKeyPtr[1]; 1707 pNewP->aiChildPtr[iPSlot] = iNew2; 1708 1709 pNew2->aiKeyPtr[0] = pPeer->aiKeyPtr[2]; 1710 pNew2->aiKeyPtr[1] = pParent->aiKeyPtr[iPSlot-1]; 1711 if( bLeaf==0 ){ 1712 pNew2->aiChildPtr[0] = getChildPtr(pPeer, WORKING_VERSION, 2); 1713 pNew2->aiChildPtr[1] = getChildPtr(pPeer, WORKING_VERSION, 3); 1714 pNew2->aiChildPtr[2] = iNewptr; 1715 } 1716 }else{ 1717 pNew1->aiKeyPtr[1] = pParent->aiKeyPtr[iPSlot]; 1718 if( bLeaf==0 ){ 1719 pNew1->aiChildPtr[1] = iNewptr; 1720 pNew1->aiChildPtr[2] = getChildPtr(pPeer, WORKING_VERSION, 0); 1721 } 1722 1723 pNewP->aiChildPtr[iPSlot] = iNew1; 1724 pNewP->aiKeyPtr[iPSlot] = pPeer->aiKeyPtr[0]; 1725 pNewP->aiChildPtr[iPSlot+1] = iNew2; 1726 1727 pNew2->aiKeyPtr[0] = pPeer->aiKeyPtr[1]; 1728 pNew2->aiKeyPtr[1] = pPeer->aiKeyPtr[2]; 1729 if( bLeaf==0 ){ 1730 pNew2->aiChildPtr[0] = getChildPtr(pPeer, WORKING_VERSION, 1); 1731 pNew2->aiChildPtr[1] = getChildPtr(pPeer, WORKING_VERSION, 2); 1732 pNew2->aiChildPtr[2] = getChildPtr(pPeer, WORKING_VERSION, 3); 1733 } 1734 } 1735 assert( pCsr->iNode>=1 ); 1736 pCsr->iNode -= 2; 1737 if( rc==LSM_OK ){ 1738 assert( pNew1->aiKeyPtr[1] && pNew2->aiKeyPtr[1] ); 1739 rc = treeUpdatePtr(db, pCsr, iNewP); 1740 } 1741 }else{ 1742 int iKOut = 0; 1743 int iPOut = 0; 1744 int i; 1745 1746 pCsr->iNode--; 1747 1748 if( iDir==1 ){ 1749 pNew1->aiKeyPtr[iKOut++] = pParent->aiKeyPtr[iPSlot]; 1750 if( bLeaf==0 ) pNew1->aiChildPtr[iPOut++] = iNewptr; 1751 } 1752 for(i=0; i<3; i++){ 1753 if( pPeer->aiKeyPtr[i] ){ 1754 pNew1->aiKeyPtr[iKOut++] = pPeer->aiKeyPtr[i]; 1755 } 1756 } 1757 if( bLeaf==0 ){ 1758 for(i=0; i<4; i++){ 1759 if( getChildPtr(pPeer, WORKING_VERSION, i) ){ 1760 pNew1->aiChildPtr[iPOut++] = getChildPtr(pPeer, WORKING_VERSION, i); 1761 } 1762 } 1763 } 1764 if( iDir==-1 ){ 1765 iPSlot--; 1766 pNew1->aiKeyPtr[iKOut++] = pParent->aiKeyPtr[iPSlot]; 1767 if( bLeaf==0 ) pNew1->aiChildPtr[iPOut++] = iNewptr; 1768 pCsr->aiCell[pCsr->iNode] = (u8)iPSlot; 1769 } 1770 1771 rc = treeDeleteEntry(db, pCsr, iNew1); 1772 } 1773 } 1774 1775 return rc; 1776 } 1777 1778 /* 1779 ** Delete a range of keys from the tree structure (i.e. the lsm_delete_range() 1780 ** function, not lsm_delete()). 1781 ** 1782 ** This is a two step process: 1783 ** 1784 ** 1) Remove all entries currently stored in the tree that have keys 1785 ** that fall into the deleted range. 1786 ** 1787 ** TODO: There are surely good ways to optimize this step - removing 1788 ** a range of keys from a b-tree. But for now, this function removes 1789 ** them one at a time using the usual approach. 1790 ** 1791 ** 2) Unless the largest key smaller than or equal to (pKey1/nKey1) is 1792 ** already marked as START_DELETE, insert a START_DELETE key. 1793 ** Similarly, unless the smallest key greater than or equal to 1794 ** (pKey2/nKey2) is already START_END, insert a START_END key. 1795 */ 1796 int lsmTreeDelete( 1797 lsm_db *db, 1798 void *pKey1, int nKey1, /* Start of range */ 1799 void *pKey2, int nKey2 /* End of range */ 1800 ){ 1801 int rc = LSM_OK; 1802 int bDone = 0; 1803 TreeRoot *p = &db->treehdr.root; 1804 TreeBlob blob = {0, 0}; 1805 1806 /* The range must be sensible - that (key1 < key2). */ 1807 assert( treeKeycmp(pKey1, nKey1, pKey2, nKey2)<0 ); 1808 assert( assert_delete_ranges_match(db) ); 1809 1810 #if 0 1811 static int nCall = 0; 1812 printf("\n"); 1813 nCall++; 1814 printf("%d delete %s .. %s\n", nCall, (char *)pKey1, (char *)pKey2); 1815 dump_tree_contents(db, "before delete"); 1816 #endif 1817 1818 /* Step 1. This loop runs until the tree contains no keys within the 1819 ** range being deleted. Or until an error occurs. */ 1820 while( bDone==0 && rc==LSM_OK ){ 1821 int res; 1822 TreeCursor csr; /* Cursor to seek to first key in range */ 1823 void *pDel; int nDel; /* Key to (possibly) delete this iteration */ 1824 #ifndef NDEBUG 1825 int nEntry = treeCountEntries(db); 1826 #endif 1827 1828 /* Seek the cursor to the first entry in the tree greater than pKey1. */ 1829 treeCursorInit(db, 0, &csr); 1830 lsmTreeCursorSeek(&csr, pKey1, nKey1, &res); 1831 if( res<=0 && lsmTreeCursorValid(&csr) ) lsmTreeCursorNext(&csr); 1832 1833 /* If there is no such entry, or if it is greater than pKey2, then the 1834 ** tree now contains no keys in the range being deleted. In this case 1835 ** break out of the loop. */ 1836 bDone = 1; 1837 if( lsmTreeCursorValid(&csr) ){ 1838 lsmTreeCursorKey(&csr, 0, &pDel, &nDel); 1839 if( treeKeycmp(pDel, nDel, pKey2, nKey2)<0 ) bDone = 0; 1840 } 1841 1842 if( bDone==0 ){ 1843 if( (u32)csr.iNode==(p->nHeight-1) ){ 1844 /* The element to delete already lies on a leaf node */ 1845 rc = treeDeleteEntry(db, &csr, 0); 1846 }else{ 1847 /* 1. Overwrite the current key with a copy of the next key in the 1848 ** tree (key N). 1849 ** 1850 ** 2. Seek to key N (cursor will stop at the internal node copy of 1851 ** N). Move to the next key (original copy of N). Delete 1852 ** this entry. 1853 */ 1854 u32 iKey; 1855 TreeKey *pKey; 1856 int iNode = csr.iNode; 1857 lsmTreeCursorNext(&csr); 1858 assert( (u32)csr.iNode==(p->nHeight-1) ); 1859 1860 iKey = csr.apTreeNode[csr.iNode]->aiKeyPtr[csr.aiCell[csr.iNode]]; 1861 lsmTreeCursorPrev(&csr); 1862 1863 treeOverwriteKey(db, &csr, iKey, &rc); 1864 pKey = treeShmkey(db, iKey, TKV_LOADKEY, &blob, &rc); 1865 if( pKey ){ 1866 rc = lsmTreeCursorSeek(&csr, TKV_KEY(pKey), pKey->nKey, &res); 1867 } 1868 if( rc==LSM_OK ){ 1869 assert( res==0 && csr.iNode==iNode ); 1870 rc = lsmTreeCursorNext(&csr); 1871 if( rc==LSM_OK ){ 1872 rc = treeDeleteEntry(db, &csr, 0); 1873 } 1874 } 1875 } 1876 } 1877 1878 /* Clean up any memory allocated by the cursor. */ 1879 tblobFree(db, &csr.blob); 1880 #if 0 1881 dump_tree_contents(db, "ddd delete"); 1882 #endif 1883 assert( bDone || treeCountEntries(db)==(nEntry-1) ); 1884 } 1885 1886 #if 0 1887 dump_tree_contents(db, "during delete"); 1888 #endif 1889 1890 /* Now insert the START_DELETE and END_DELETE keys. */ 1891 if( rc==LSM_OK ){ 1892 rc = treeInsertEntry(db, LSM_START_DELETE, pKey1, nKey1, 0, -1); 1893 } 1894 #if 0 1895 dump_tree_contents(db, "during delete 2"); 1896 #endif 1897 if( rc==LSM_OK ){ 1898 rc = treeInsertEntry(db, LSM_END_DELETE, pKey2, nKey2, 0, -1); 1899 } 1900 1901 #if 0 1902 dump_tree_contents(db, "after delete"); 1903 #endif 1904 1905 tblobFree(db, &blob); 1906 assert( assert_delete_ranges_match(db) ); 1907 return rc; 1908 } 1909 1910 /* 1911 ** Return, in bytes, the amount of memory currently used by the tree 1912 ** structure. 1913 */ 1914 int lsmTreeSize(lsm_db *pDb){ 1915 return pDb->treehdr.root.nByte; 1916 } 1917 1918 /* 1919 ** Open a cursor on the in-memory tree pTree. 1920 */ 1921 int lsmTreeCursorNew(lsm_db *pDb, int bOld, TreeCursor **ppCsr){ 1922 TreeCursor *pCsr; 1923 *ppCsr = pCsr = lsmMalloc(pDb->pEnv, sizeof(TreeCursor)); 1924 if( pCsr ){ 1925 treeCursorInit(pDb, bOld, pCsr); 1926 return LSM_OK; 1927 } 1928 return LSM_NOMEM_BKPT; 1929 } 1930 1931 /* 1932 ** Close an in-memory tree cursor. 1933 */ 1934 void lsmTreeCursorDestroy(TreeCursor *pCsr){ 1935 if( pCsr ){ 1936 tblobFree(pCsr->pDb, &pCsr->blob); 1937 lsmFree(pCsr->pDb->pEnv, pCsr); 1938 } 1939 } 1940 1941 void lsmTreeCursorReset(TreeCursor *pCsr){ 1942 if( pCsr ){ 1943 pCsr->iNode = -1; 1944 pCsr->pSave = 0; 1945 } 1946 } 1947 1948 #ifndef NDEBUG 1949 static int treeCsrCompare(TreeCursor *pCsr, void *pKey, int nKey, int *pRc){ 1950 TreeKey *p; 1951 int cmp = 0; 1952 assert( pCsr->iNode>=0 ); 1953 p = csrGetKey(pCsr, &pCsr->blob, pRc); 1954 if( p ){ 1955 cmp = treeKeycmp(TKV_KEY(p), p->nKey, pKey, nKey); 1956 } 1957 return cmp; 1958 } 1959 #endif 1960 1961 1962 /* 1963 ** Attempt to seek the cursor passed as the first argument to key (pKey/nKey) 1964 ** in the tree structure. If an exact match for the key is found, leave the 1965 ** cursor pointing to it and set *pRes to zero before returning. If an 1966 ** exact match cannot be found, do one of the following: 1967 ** 1968 ** * Leave the cursor pointing to the smallest element in the tree that 1969 ** is larger than the key and set *pRes to +1, or 1970 ** 1971 ** * Leave the cursor pointing to the largest element in the tree that 1972 ** is smaller than the key and set *pRes to -1, or 1973 ** 1974 ** * If the tree is empty, leave the cursor at EOF and set *pRes to -1. 1975 */ 1976 int lsmTreeCursorSeek(TreeCursor *pCsr, void *pKey, int nKey, int *pRes){ 1977 int rc = LSM_OK; /* Return code */ 1978 lsm_db *pDb = pCsr->pDb; 1979 TreeRoot *pRoot = pCsr->pRoot; 1980 u32 iNodePtr; /* Location of current node in search */ 1981 1982 /* Discard any saved position data */ 1983 treeCursorRestore(pCsr, 0); 1984 1985 iNodePtr = pRoot->iRoot; 1986 if( iNodePtr==0 ){ 1987 /* Either an error occurred or the tree is completely empty. */ 1988 assert( rc!=LSM_OK || pRoot->iRoot==0 ); 1989 *pRes = -1; 1990 pCsr->iNode = -1; 1991 }else{ 1992 TreeBlob b = {0, 0}; 1993 int res = 0; /* Result of comparison function */ 1994 int iNode = -1; 1995 while( iNodePtr ){ 1996 TreeNode *pNode; /* Node at location iNodePtr */ 1997 int iTest; /* Index of second key to test (0 or 2) */ 1998 u32 iTreeKey; 1999 TreeKey *pTreeKey; /* Key to compare against */ 2000 2001 pNode = (TreeNode *)treeShmptrUnsafe(pDb, iNodePtr); 2002 iNode++; 2003 pCsr->apTreeNode[iNode] = pNode; 2004 2005 /* Compare (pKey/nKey) with the key in the middle slot of B-tree node 2006 ** pNode. The middle slot is never empty. If the comparison is a match, 2007 ** then the search is finished. Break out of the loop. */ 2008 pTreeKey = (TreeKey*)treeShmptrUnsafe(pDb, pNode->aiKeyPtr[1]); 2009 if( !(pTreeKey->flags & LSM_CONTIGUOUS) ){ 2010 pTreeKey = treeShmkey(pDb, pNode->aiKeyPtr[1], TKV_LOADKEY, &b, &rc); 2011 if( rc!=LSM_OK ) break; 2012 } 2013 res = treeKeycmp((void *)&pTreeKey[1], pTreeKey->nKey, pKey, nKey); 2014 if( res==0 ){ 2015 pCsr->aiCell[iNode] = 1; 2016 break; 2017 } 2018 2019 /* Based on the results of the previous comparison, compare (pKey/nKey) 2020 ** to either the left or right key of the B-tree node, if such a key 2021 ** exists. */ 2022 iTest = (res>0 ? 0 : 2); 2023 iTreeKey = pNode->aiKeyPtr[iTest]; 2024 if( iTreeKey ){ 2025 pTreeKey = (TreeKey*)treeShmptrUnsafe(pDb, iTreeKey); 2026 if( !(pTreeKey->flags & LSM_CONTIGUOUS) ){ 2027 pTreeKey = treeShmkey(pDb, iTreeKey, TKV_LOADKEY, &b, &rc); 2028 if( rc ) break; 2029 } 2030 res = treeKeycmp((void *)&pTreeKey[1], pTreeKey->nKey, pKey, nKey); 2031 if( res==0 ){ 2032 pCsr->aiCell[iNode] = (u8)iTest; 2033 break; 2034 } 2035 }else{ 2036 iTest = 1; 2037 } 2038 2039 if( (u32)iNode<(pRoot->nHeight-1) ){ 2040 iNodePtr = getChildPtr(pNode, pRoot->iTransId, iTest + (res<0)); 2041 }else{ 2042 iNodePtr = 0; 2043 } 2044 pCsr->aiCell[iNode] = (u8)(iTest + (iNodePtr && (res<0))); 2045 } 2046 2047 *pRes = res; 2048 pCsr->iNode = iNode; 2049 tblobFree(pDb, &b); 2050 } 2051 2052 /* assert() that *pRes has been set properly */ 2053 #ifndef NDEBUG 2054 if( rc==LSM_OK && lsmTreeCursorValid(pCsr) ){ 2055 int cmp = treeCsrCompare(pCsr, pKey, nKey, &rc); 2056 assert( rc!=LSM_OK || *pRes==cmp || (*pRes ^ cmp)>0 ); 2057 } 2058 #endif 2059 2060 return rc; 2061 } 2062 2063 int lsmTreeCursorNext(TreeCursor *pCsr){ 2064 #ifndef NDEBUG 2065 TreeKey *pK1; 2066 TreeBlob key1 = {0, 0}; 2067 #endif 2068 lsm_db *pDb = pCsr->pDb; 2069 TreeRoot *pRoot = pCsr->pRoot; 2070 const int iLeaf = pRoot->nHeight-1; 2071 int iCell; 2072 int rc = LSM_OK; 2073 TreeNode *pNode; 2074 2075 /* Restore the cursor position, if required */ 2076 int iRestore = 0; 2077 treeCursorRestore(pCsr, &iRestore); 2078 if( iRestore>0 ) return LSM_OK; 2079 2080 /* Save a pointer to the current key. This is used in an assert() at the 2081 ** end of this function - to check that the 'next' key really is larger 2082 ** than the current key. */ 2083 #ifndef NDEBUG 2084 pK1 = csrGetKey(pCsr, &key1, &rc); 2085 if( rc!=LSM_OK ) return rc; 2086 #endif 2087 2088 assert( lsmTreeCursorValid(pCsr) ); 2089 assert( pCsr->aiCell[pCsr->iNode]<3 ); 2090 2091 pNode = pCsr->apTreeNode[pCsr->iNode]; 2092 iCell = ++pCsr->aiCell[pCsr->iNode]; 2093 2094 /* If the current node is not a leaf, and the current cell has sub-tree 2095 ** associated with it, descend to the left-most key on the left-most 2096 ** leaf of the sub-tree. */ 2097 if( pCsr->iNode<iLeaf && getChildPtr(pNode, pRoot->iTransId, iCell) ){ 2098 do { 2099 u32 iNodePtr; 2100 pCsr->iNode++; 2101 iNodePtr = getChildPtr(pNode, pRoot->iTransId, iCell); 2102 pNode = (TreeNode *)treeShmptr(pDb, iNodePtr); 2103 pCsr->apTreeNode[pCsr->iNode] = pNode; 2104 iCell = pCsr->aiCell[pCsr->iNode] = (pNode->aiKeyPtr[0]==0); 2105 }while( pCsr->iNode < iLeaf ); 2106 } 2107 2108 /* Otherwise, the next key is found by following pointer up the tree 2109 ** until there is a key immediately to the right of the pointer followed 2110 ** to reach the sub-tree containing the current key. */ 2111 else if( iCell>=3 || pNode->aiKeyPtr[iCell]==0 ){ 2112 while( (--pCsr->iNode)>=0 ){ 2113 iCell = pCsr->aiCell[pCsr->iNode]; 2114 if( iCell<3 && pCsr->apTreeNode[pCsr->iNode]->aiKeyPtr[iCell] ) break; 2115 } 2116 } 2117 2118 #ifndef NDEBUG 2119 if( pCsr->iNode>=0 ){ 2120 TreeKey *pK2 = csrGetKey(pCsr, &pCsr->blob, &rc); 2121 assert( rc||treeKeycmp(TKV_KEY(pK2),pK2->nKey,TKV_KEY(pK1),pK1->nKey)>=0 ); 2122 } 2123 tblobFree(pDb, &key1); 2124 #endif 2125 2126 return rc; 2127 } 2128 2129 int lsmTreeCursorPrev(TreeCursor *pCsr){ 2130 #ifndef NDEBUG 2131 TreeKey *pK1; 2132 TreeBlob key1 = {0, 0}; 2133 #endif 2134 lsm_db *pDb = pCsr->pDb; 2135 TreeRoot *pRoot = pCsr->pRoot; 2136 const int iLeaf = pRoot->nHeight-1; 2137 int iCell; 2138 int rc = LSM_OK; 2139 TreeNode *pNode; 2140 2141 /* Restore the cursor position, if required */ 2142 int iRestore = 0; 2143 treeCursorRestore(pCsr, &iRestore); 2144 if( iRestore<0 ) return LSM_OK; 2145 2146 /* Save a pointer to the current key. This is used in an assert() at the 2147 ** end of this function - to check that the 'next' key really is smaller 2148 ** than the current key. */ 2149 #ifndef NDEBUG 2150 pK1 = csrGetKey(pCsr, &key1, &rc); 2151 if( rc!=LSM_OK ) return rc; 2152 #endif 2153 2154 assert( lsmTreeCursorValid(pCsr) ); 2155 pNode = pCsr->apTreeNode[pCsr->iNode]; 2156 iCell = pCsr->aiCell[pCsr->iNode]; 2157 assert( iCell>=0 && iCell<3 ); 2158 2159 /* If the current node is not a leaf, and the current cell has sub-tree 2160 ** associated with it, descend to the right-most key on the right-most 2161 ** leaf of the sub-tree. */ 2162 if( pCsr->iNode<iLeaf && getChildPtr(pNode, pRoot->iTransId, iCell) ){ 2163 do { 2164 u32 iNodePtr; 2165 pCsr->iNode++; 2166 iNodePtr = getChildPtr(pNode, pRoot->iTransId, iCell); 2167 pNode = (TreeNode *)treeShmptr(pDb, iNodePtr); 2168 if( rc!=LSM_OK ) break; 2169 pCsr->apTreeNode[pCsr->iNode] = pNode; 2170 iCell = 1 + (pNode->aiKeyPtr[2]!=0) + (pCsr->iNode < iLeaf); 2171 pCsr->aiCell[pCsr->iNode] = (u8)iCell; 2172 }while( pCsr->iNode < iLeaf ); 2173 } 2174 2175 /* Otherwise, the next key is found by following pointer up the tree until 2176 ** there is a key immediately to the left of the pointer followed to reach 2177 ** the sub-tree containing the current key. */ 2178 else{ 2179 do { 2180 iCell = pCsr->aiCell[pCsr->iNode]-1; 2181 if( iCell>=0 && pCsr->apTreeNode[pCsr->iNode]->aiKeyPtr[iCell] ) break; 2182 }while( (--pCsr->iNode)>=0 ); 2183 pCsr->aiCell[pCsr->iNode] = (u8)iCell; 2184 } 2185 2186 #ifndef NDEBUG 2187 if( pCsr->iNode>=0 ){ 2188 TreeKey *pK2 = csrGetKey(pCsr, &pCsr->blob, &rc); 2189 assert( rc || treeKeycmp(TKV_KEY(pK2),pK2->nKey,TKV_KEY(pK1),pK1->nKey)<0 ); 2190 } 2191 tblobFree(pDb, &key1); 2192 #endif 2193 2194 return rc; 2195 } 2196 2197 /* 2198 ** Move the cursor to the first (bLast==0) or last (bLast!=0) entry in the 2199 ** in-memory tree. 2200 */ 2201 int lsmTreeCursorEnd(TreeCursor *pCsr, int bLast){ 2202 lsm_db *pDb = pCsr->pDb; 2203 TreeRoot *pRoot = pCsr->pRoot; 2204 int rc = LSM_OK; 2205 2206 u32 iNodePtr; 2207 pCsr->iNode = -1; 2208 2209 /* Discard any saved position data */ 2210 treeCursorRestore(pCsr, 0); 2211 2212 iNodePtr = pRoot->iRoot; 2213 while( iNodePtr ){ 2214 int iCell; 2215 TreeNode *pNode; 2216 2217 pNode = (TreeNode *)treeShmptr(pDb, iNodePtr); 2218 if( rc ) break; 2219 2220 if( bLast ){ 2221 iCell = ((pNode->aiKeyPtr[2]==0) ? 2 : 3); 2222 }else{ 2223 iCell = ((pNode->aiKeyPtr[0]==0) ? 1 : 0); 2224 } 2225 pCsr->iNode++; 2226 pCsr->apTreeNode[pCsr->iNode] = pNode; 2227 2228 if( (u32)pCsr->iNode<pRoot->nHeight-1 ){ 2229 iNodePtr = getChildPtr(pNode, pRoot->iTransId, iCell); 2230 }else{ 2231 iNodePtr = 0; 2232 } 2233 pCsr->aiCell[pCsr->iNode] = (u8)(iCell - (iNodePtr==0 && bLast)); 2234 } 2235 2236 return rc; 2237 } 2238 2239 int lsmTreeCursorFlags(TreeCursor *pCsr){ 2240 int flags = 0; 2241 if( pCsr && pCsr->iNode>=0 ){ 2242 int rc = LSM_OK; 2243 TreeKey *pKey = (TreeKey *)treeShmptrUnsafe(pCsr->pDb, 2244 pCsr->apTreeNode[pCsr->iNode]->aiKeyPtr[pCsr->aiCell[pCsr->iNode]] 2245 ); 2246 assert( rc==LSM_OK ); 2247 flags = (pKey->flags & ~LSM_CONTIGUOUS); 2248 } 2249 return flags; 2250 } 2251 2252 int lsmTreeCursorKey(TreeCursor *pCsr, int *pFlags, void **ppKey, int *pnKey){ 2253 TreeKey *pTreeKey; 2254 int rc = LSM_OK; 2255 2256 assert( lsmTreeCursorValid(pCsr) ); 2257 2258 pTreeKey = pCsr->pSave; 2259 if( !pTreeKey ){ 2260 pTreeKey = csrGetKey(pCsr, &pCsr->blob, &rc); 2261 } 2262 if( rc==LSM_OK ){ 2263 *pnKey = pTreeKey->nKey; 2264 if( pFlags ) *pFlags = pTreeKey->flags; 2265 *ppKey = (void *)&pTreeKey[1]; 2266 } 2267 2268 return rc; 2269 } 2270 2271 int lsmTreeCursorValue(TreeCursor *pCsr, void **ppVal, int *pnVal){ 2272 int res = 0; 2273 int rc; 2274 2275 rc = treeCursorRestore(pCsr, &res); 2276 if( res==0 ){ 2277 TreeKey *pTreeKey = csrGetKey(pCsr, &pCsr->blob, &rc); 2278 if( rc==LSM_OK ){ 2279 if( pTreeKey->flags & LSM_INSERT ){ 2280 *pnVal = pTreeKey->nValue; 2281 *ppVal = TKV_VAL(pTreeKey); 2282 }else{ 2283 *ppVal = 0; 2284 *pnVal = -1; 2285 } 2286 } 2287 }else{ 2288 *ppVal = 0; 2289 *pnVal = 0; 2290 } 2291 2292 return rc; 2293 } 2294 2295 /* 2296 ** Return true if the cursor currently points to a valid entry. 2297 */ 2298 int lsmTreeCursorValid(TreeCursor *pCsr){ 2299 return (pCsr && (pCsr->pSave || pCsr->iNode>=0)); 2300 } 2301 2302 /* 2303 ** Store a mark in *pMark. Later on, a call to lsmTreeRollback() with a 2304 ** pointer to the same TreeMark structure may be used to roll the tree 2305 ** contents back to their current state. 2306 */ 2307 void lsmTreeMark(lsm_db *pDb, TreeMark *pMark){ 2308 pMark->iRoot = pDb->treehdr.root.iRoot; 2309 pMark->nHeight = pDb->treehdr.root.nHeight; 2310 pMark->iWrite = pDb->treehdr.iWrite; 2311 pMark->nChunk = pDb->treehdr.nChunk; 2312 pMark->iNextShmid = pDb->treehdr.iNextShmid; 2313 pMark->iRollback = intArraySize(&pDb->rollback); 2314 } 2315 2316 /* 2317 ** Roll back to mark pMark. Structure *pMark should have been previously 2318 ** populated by a call to lsmTreeMark(). 2319 */ 2320 void lsmTreeRollback(lsm_db *pDb, TreeMark *pMark){ 2321 int iIdx; 2322 int nIdx; 2323 u32 iNext; 2324 ShmChunk *pChunk; 2325 u32 iChunk; 2326 u32 iShmid; 2327 2328 /* Revert all required v2 pointers. */ 2329 nIdx = intArraySize(&pDb->rollback); 2330 for(iIdx = pMark->iRollback; iIdx<nIdx; iIdx++){ 2331 TreeNode *pNode; 2332 pNode = treeShmptr(pDb, intArrayEntry(&pDb->rollback, iIdx)); 2333 assert( pNode ); 2334 pNode->iV2 = 0; 2335 pNode->iV2Child = 0; 2336 pNode->iV2Ptr = 0; 2337 } 2338 intArrayTruncate(&pDb->rollback, pMark->iRollback); 2339 2340 /* Restore the free-chunk list. */ 2341 assert( pMark->iWrite!=0 ); 2342 iChunk = treeOffsetToChunk(pMark->iWrite-1); 2343 pChunk = treeShmChunk(pDb, iChunk); 2344 iNext = pChunk->iNext; 2345 pChunk->iNext = 0; 2346 2347 pChunk = treeShmChunk(pDb, pDb->treehdr.iFirst); 2348 iShmid = pChunk->iShmid-1; 2349 2350 while( iNext ){ 2351 u32 iFree = iNext; /* Current chunk being rollback-freed */ 2352 ShmChunk *pFree; /* Pointer to chunk iFree */ 2353 2354 pFree = treeShmChunk(pDb, iFree); 2355 iNext = pFree->iNext; 2356 2357 if( iFree<pMark->nChunk ){ 2358 pFree->iNext = pDb->treehdr.iFirst; 2359 pFree->iShmid = iShmid--; 2360 pDb->treehdr.iFirst = iFree; 2361 } 2362 } 2363 2364 /* Restore the tree-header fields */ 2365 pDb->treehdr.root.iRoot = pMark->iRoot; 2366 pDb->treehdr.root.nHeight = pMark->nHeight; 2367 pDb->treehdr.iWrite = pMark->iWrite; 2368 pDb->treehdr.nChunk = pMark->nChunk; 2369 pDb->treehdr.iNextShmid = pMark->iNextShmid; 2370 } 2371 2372 /* 2373 ** Load the in-memory tree header from shared-memory into pDb->treehdr. 2374 ** If the header cannot be loaded, return LSM_PROTOCOL. 2375 ** 2376 ** If the header is successfully loaded and parameter piRead is not NULL, 2377 ** is is set to 1 if the header was loaded from ShmHeader.hdr1, or 2 if 2378 ** the header was loaded from ShmHeader.hdr2. 2379 */ 2380 int lsmTreeLoadHeader(lsm_db *pDb, int *piRead){ 2381 int nRem = LSM_ATTEMPTS_BEFORE_PROTOCOL; 2382 while( (nRem--)>0 ){ 2383 ShmHeader *pShm = pDb->pShmhdr; 2384 2385 memcpy(&pDb->treehdr, &pShm->hdr1, sizeof(TreeHeader)); 2386 if( treeHeaderChecksumOk(&pDb->treehdr) ){ 2387 if( piRead ) *piRead = 1; 2388 return LSM_OK; 2389 } 2390 memcpy(&pDb->treehdr, &pShm->hdr2, sizeof(TreeHeader)); 2391 if( treeHeaderChecksumOk(&pDb->treehdr) ){ 2392 if( piRead ) *piRead = 2; 2393 return LSM_OK; 2394 } 2395 2396 lsmShmBarrier(pDb); 2397 } 2398 return LSM_PROTOCOL_BKPT; 2399 } 2400 2401 int lsmTreeLoadHeaderOk(lsm_db *pDb, int iRead){ 2402 TreeHeader *p = (iRead==1) ? &pDb->pShmhdr->hdr1 : &pDb->pShmhdr->hdr2; 2403 assert( iRead==1 || iRead==2 ); 2404 return (0==memcmp(pDb->treehdr.aCksum, p->aCksum, sizeof(u32)*2)); 2405 } 2406 2407 /* 2408 ** This function is called to conclude a transaction. If argument bCommit 2409 ** is true, the transaction is committed. Otherwise it is rolled back. 2410 */ 2411 int lsmTreeEndTransaction(lsm_db *pDb, int bCommit){ 2412 ShmHeader *pShm = pDb->pShmhdr; 2413 2414 treeHeaderChecksum(&pDb->treehdr, pDb->treehdr.aCksum); 2415 memcpy(&pShm->hdr2, &pDb->treehdr, sizeof(TreeHeader)); 2416 lsmShmBarrier(pDb); 2417 memcpy(&pShm->hdr1, &pDb->treehdr, sizeof(TreeHeader)); 2418 pShm->bWriter = 0; 2419 intArrayFree(pDb->pEnv, &pDb->rollback); 2420 2421 return LSM_OK; 2422 } 2423 2424 #ifndef NDEBUG 2425 static int assert_delete_ranges_match(lsm_db *db){ 2426 int prev = 0; 2427 TreeBlob blob = {0, 0}; 2428 TreeCursor csr; /* Cursor used to iterate through tree */ 2429 int rc; 2430 2431 treeCursorInit(db, 0, &csr); 2432 for( rc = lsmTreeCursorEnd(&csr, 0); 2433 rc==LSM_OK && lsmTreeCursorValid(&csr); 2434 rc = lsmTreeCursorNext(&csr) 2435 ){ 2436 TreeKey *pKey = csrGetKey(&csr, &blob, &rc); 2437 if( rc!=LSM_OK ) break; 2438 assert( ((prev&LSM_START_DELETE)==0)==((pKey->flags&LSM_END_DELETE)==0) ); 2439 prev = pKey->flags; 2440 } 2441 2442 tblobFree(csr.pDb, &csr.blob); 2443 tblobFree(csr.pDb, &blob); 2444 2445 return 1; 2446 } 2447 2448 static int treeCountEntries(lsm_db *db){ 2449 TreeCursor csr; /* Cursor used to iterate through tree */ 2450 int rc; 2451 int nEntry = 0; 2452 2453 treeCursorInit(db, 0, &csr); 2454 for( rc = lsmTreeCursorEnd(&csr, 0); 2455 rc==LSM_OK && lsmTreeCursorValid(&csr); 2456 rc = lsmTreeCursorNext(&csr) 2457 ){ 2458 nEntry++; 2459 } 2460 2461 tblobFree(csr.pDb, &csr.blob); 2462 2463 return nEntry; 2464 } 2465 #endif