modernc.org/cc@v1.0.1/v2/testdata/_sqlite/ext/lsm1/lsm_sorted.c (about)

/*
** 2011-08-14
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
**
** PAGE FORMAT:
**
**   The maximum page size is 65536 bytes.
**
**   Since all records are equal to or larger than 2 bytes in size, and
**   some space within the page is consumed by the page footer, there must
**   be less than 2^15 records on each page.
**
**   Each page ends with a footer that describes the page's contents. This
**   footer serves a similar purpose to the page header in an SQLite database.
**   A footer is used instead of a header because it makes it easier to
**   populate a new page based on a sorted list of key/value pairs.
**
**   The footer consists of the following values (starting at the end of
**   the page and continuing backwards towards the start). All values are
**   stored as unsigned big-endian integers.
**
**     * Number of records on page (2 bytes).
**     * Flags field (2 bytes).
**     * Left-hand pointer value (8 bytes).
**     * The starting offset of each record (2 bytes per record).
**
**   Records may span pages. Unless it happens to be an exact fit, the part
**   of the final record that starts on page X that does not fit on page X
**   is stored at the start of page (X+1). This means there may be pages on
**   which no record starts (N==0). And on most pages the first record that
**   starts on the page will not start at byte offset 0. For example:
**
**      aaaaa bbbbb ccc <footer>    cc eeeee fffff g <footer>    gggg....
**
** RECORD FORMAT:
**
**   The first byte of the record is a flags byte.
It is a combination 46 ** of the following flags (defined in lsmInt.h): 47 ** 48 ** LSM_START_DELETE 49 ** LSM_END_DELETE 50 ** LSM_POINT_DELETE 51 ** LSM_INSERT 52 ** LSM_SEPARATOR 53 ** LSM_SYSTEMKEY 54 ** 55 ** Immediately following the type byte is a pointer to the smallest key 56 ** in the next file that is larger than the key in the current record. The 57 ** pointer is encoded as a varint. When added to the 32-bit page number 58 ** stored in the footer, it is the page number of the page that contains the 59 ** smallest key in the next sorted file that is larger than this key. 60 ** 61 ** Next is the number of bytes in the key, encoded as a varint. 62 ** 63 ** If the LSM_INSERT flag is set, the number of bytes in the value, as 64 ** a varint, is next. 65 ** 66 ** Finally, the blob of data containing the key, and for LSM_INSERT 67 ** records, the value as well. 68 */ 69 70 #ifndef _LSM_INT_H 71 # include "lsmInt.h" 72 #endif 73 74 #define LSM_LOG_STRUCTURE 0 75 #define LSM_LOG_DATA 0 76 77 /* 78 ** Macros to help decode record types. 79 */ 80 #define rtTopic(eType) ((eType) & LSM_SYSTEMKEY) 81 #define rtIsDelete(eType) (((eType) & 0x0F)==LSM_POINT_DELETE) 82 83 #define rtIsSeparator(eType) (((eType) & LSM_SEPARATOR)!=0) 84 #define rtIsWrite(eType) (((eType) & LSM_INSERT)!=0) 85 #define rtIsSystem(eType) (((eType) & LSM_SYSTEMKEY)!=0) 86 87 /* 88 ** The following macros are used to access a page footer. 
89 */ 90 #define SEGMENT_NRECORD_OFFSET(pgsz) ((pgsz) - 2) 91 #define SEGMENT_FLAGS_OFFSET(pgsz) ((pgsz) - 2 - 2) 92 #define SEGMENT_POINTER_OFFSET(pgsz) ((pgsz) - 2 - 2 - 8) 93 #define SEGMENT_CELLPTR_OFFSET(pgsz, iCell) ((pgsz) - 2 - 2 - 8 - 2 - (iCell)*2) 94 95 #define SEGMENT_EOF(pgsz, nEntry) SEGMENT_CELLPTR_OFFSET(pgsz, nEntry) 96 97 #define SEGMENT_BTREE_FLAG 0x0001 98 #define PGFTR_SKIP_NEXT_FLAG 0x0002 99 #define PGFTR_SKIP_THIS_FLAG 0x0004 100 101 102 #ifndef LSM_SEGMENTPTR_FREE_THRESHOLD 103 # define LSM_SEGMENTPTR_FREE_THRESHOLD 1024 104 #endif 105 106 typedef struct SegmentPtr SegmentPtr; 107 typedef struct Blob Blob; 108 109 struct Blob { 110 lsm_env *pEnv; 111 void *pData; 112 int nData; 113 int nAlloc; 114 }; 115 116 /* 117 ** A SegmentPtr object may be used for one of two purposes: 118 ** 119 ** * To iterate and/or seek within a single Segment (the combination of a 120 ** main run and an optional sorted run). 121 ** 122 ** * To iterate through the separators array of a segment. 123 */ 124 struct SegmentPtr { 125 Level *pLevel; /* Level object segment is part of */ 126 Segment *pSeg; /* Segment to access */ 127 128 /* Current page. See segmentPtrLoadPage(). */ 129 Page *pPg; /* Current page */ 130 u16 flags; /* Copy of page flags field */ 131 int nCell; /* Number of cells on pPg */ 132 Pgno iPtr; /* Base cascade pointer */ 133 134 /* Current cell. See segmentPtrLoadCell() */ 135 int iCell; /* Current record within page pPg */ 136 int eType; /* Type of current record */ 137 Pgno iPgPtr; /* Cascade pointer offset */ 138 void *pKey; int nKey; /* Key associated with current record */ 139 void *pVal; int nVal; /* Current record value (eType==WRITE only) */ 140 141 /* Blobs used to allocate buffers for pKey and pVal as required */ 142 Blob blob1; 143 Blob blob2; 144 }; 145 146 /* 147 ** Used to iterate through the keys stored in a b-tree hierarchy from start 148 ** to finish. Only First() and Next() operations are required. 
149 ** 150 ** btreeCursorNew() 151 ** btreeCursorFirst() 152 ** btreeCursorNext() 153 ** btreeCursorFree() 154 ** btreeCursorPosition() 155 ** btreeCursorRestore() 156 */ 157 typedef struct BtreePg BtreePg; 158 typedef struct BtreeCursor BtreeCursor; 159 struct BtreePg { 160 Page *pPage; 161 int iCell; 162 }; 163 struct BtreeCursor { 164 Segment *pSeg; /* Iterate through this segments btree */ 165 FileSystem *pFS; /* File system to read pages from */ 166 int nDepth; /* Allocated size of aPg[] */ 167 int iPg; /* Current entry in aPg[]. -1 -> EOF. */ 168 BtreePg *aPg; /* Pages from root to current location */ 169 170 /* Cache of current entry. pKey==0 for EOF. */ 171 void *pKey; 172 int nKey; 173 int eType; 174 Pgno iPtr; 175 176 /* Storage for key, if not local */ 177 Blob blob; 178 }; 179 180 181 /* 182 ** A cursor used for merged searches or iterations through up to one 183 ** Tree structure and any number of sorted files. 184 ** 185 ** lsmMCursorNew() 186 ** lsmMCursorSeek() 187 ** lsmMCursorNext() 188 ** lsmMCursorPrev() 189 ** lsmMCursorFirst() 190 ** lsmMCursorLast() 191 ** lsmMCursorKey() 192 ** lsmMCursorValue() 193 ** lsmMCursorValid() 194 ** 195 ** iFree: 196 ** This variable is only used by cursors providing input data for a 197 ** new top-level segment. Such cursors only ever iterate forwards, not 198 ** backwards. 
199 */ 200 struct MultiCursor { 201 lsm_db *pDb; /* Connection that owns this cursor */ 202 MultiCursor *pNext; /* Next cursor owned by connection pDb */ 203 int flags; /* Mask of CURSOR_XXX flags */ 204 205 int eType; /* Cache of current key type */ 206 Blob key; /* Cache of current key (or NULL) */ 207 Blob val; /* Cache of current value */ 208 209 /* All the component cursors: */ 210 TreeCursor *apTreeCsr[2]; /* Up to two tree cursors */ 211 int iFree; /* Next element of free-list (-ve for eof) */ 212 SegmentPtr *aPtr; /* Array of segment pointers */ 213 int nPtr; /* Size of array aPtr[] */ 214 BtreeCursor *pBtCsr; /* b-tree cursor (db writes only) */ 215 216 /* Comparison results */ 217 int nTree; /* Size of aTree[] array */ 218 int *aTree; /* Array of comparison results */ 219 220 /* Used by cursors flushing the in-memory tree only */ 221 void *pSystemVal; /* Pointer to buffer to free */ 222 223 /* Used by worker cursors only */ 224 Pgno *pPrevMergePtr; 225 }; 226 227 /* 228 ** The following constants are used to assign integers to each component 229 ** cursor of a multi-cursor. 230 */ 231 #define CURSOR_DATA_TREE0 0 /* Current tree cursor (apTreeCsr[0]) */ 232 #define CURSOR_DATA_TREE1 1 /* The "old" tree, if any (apTreeCsr[1]) */ 233 #define CURSOR_DATA_SYSTEM 2 /* Free-list entries (new-toplevel only) */ 234 #define CURSOR_DATA_SEGMENT 3 /* First segment pointer (aPtr[0]) */ 235 236 /* 237 ** CURSOR_IGNORE_DELETE 238 ** If set, this cursor will not visit SORTED_DELETE keys. 239 ** 240 ** CURSOR_FLUSH_FREELIST 241 ** This cursor is being used to create a new toplevel. It should also 242 ** iterate through the contents of the in-memory free block list. 243 ** 244 ** CURSOR_IGNORE_SYSTEM 245 ** If set, this cursor ignores system keys. 246 ** 247 ** CURSOR_NEXT_OK 248 ** Set if it is Ok to call lsm_csr_next(). 249 ** 250 ** CURSOR_PREV_OK 251 ** Set if it is Ok to call lsm_csr_prev(). 
252 ** 253 ** CURSOR_READ_SEPARATORS 254 ** Set if this cursor should visit the separator keys in segment 255 ** aPtr[nPtr-1]. 256 ** 257 ** CURSOR_SEEK_EQ 258 ** Cursor has undergone a successful lsm_csr_seek(LSM_SEEK_EQ) operation. 259 ** The key and value are stored in MultiCursor.key and MultiCursor.val 260 ** respectively. 261 */ 262 #define CURSOR_IGNORE_DELETE 0x00000001 263 #define CURSOR_FLUSH_FREELIST 0x00000002 264 #define CURSOR_IGNORE_SYSTEM 0x00000010 265 #define CURSOR_NEXT_OK 0x00000020 266 #define CURSOR_PREV_OK 0x00000040 267 #define CURSOR_READ_SEPARATORS 0x00000080 268 #define CURSOR_SEEK_EQ 0x00000100 269 270 typedef struct MergeWorker MergeWorker; 271 typedef struct Hierarchy Hierarchy; 272 273 struct Hierarchy { 274 Page **apHier; 275 int nHier; 276 }; 277 278 /* 279 ** aSave: 280 ** When mergeWorkerNextPage() is called to advance to the next page in 281 ** the output segment, if the bStore flag for an element of aSave[] is 282 ** true, it is cleared and the corresponding iPgno value is set to the 283 ** page number of the page just completed. 284 ** 285 ** aSave[0] is used to record the pointer value to be pushed into the 286 ** b-tree hierarchy. aSave[1] is used to save the page number of the 287 ** page containing the indirect key most recently written to the b-tree. 288 ** see mergeWorkerPushHierarchy() for details. 
289 */ 290 struct MergeWorker { 291 lsm_db *pDb; /* Database handle */ 292 Level *pLevel; /* Worker snapshot Level being merged */ 293 MultiCursor *pCsr; /* Cursor to read new segment contents from */ 294 int bFlush; /* True if this is an in-memory tree flush */ 295 Hierarchy hier; /* B-tree hierarchy under construction */ 296 Page *pPage; /* Current output page */ 297 int nWork; /* Number of calls to mergeWorkerNextPage() */ 298 Pgno *aGobble; /* Gobble point for each input segment */ 299 300 Pgno iIndirect; 301 struct SavedPgno { 302 Pgno iPgno; 303 int bStore; 304 } aSave[2]; 305 }; 306 307 #ifdef LSM_DEBUG_EXPENSIVE 308 static int assertPointersOk(lsm_db *, Segment *, Segment *, int); 309 static int assertBtreeOk(lsm_db *, Segment *); 310 static void assertRunInOrder(lsm_db *pDb, Segment *pSeg); 311 #else 312 #define assertRunInOrder(x,y) 313 #define assertBtreeOk(x,y) 314 #endif 315 316 317 struct FilePage { u8 *aData; int nData; }; 318 static u8 *fsPageData(Page *pPg, int *pnData){ 319 *pnData = ((struct FilePage *)(pPg))->nData; 320 return ((struct FilePage *)(pPg))->aData; 321 } 322 /*UNUSED static u8 *fsPageDataPtr(Page *pPg){ 323 return ((struct FilePage *)(pPg))->aData; 324 }*/ 325 326 /* 327 ** Write nVal as a 16-bit unsigned big-endian integer into buffer aOut. 
328 */ 329 void lsmPutU16(u8 *aOut, u16 nVal){ 330 aOut[0] = (u8)((nVal>>8) & 0xFF); 331 aOut[1] = (u8)(nVal & 0xFF); 332 } 333 334 void lsmPutU32(u8 *aOut, u32 nVal){ 335 aOut[0] = (u8)((nVal>>24) & 0xFF); 336 aOut[1] = (u8)((nVal>>16) & 0xFF); 337 aOut[2] = (u8)((nVal>> 8) & 0xFF); 338 aOut[3] = (u8)((nVal ) & 0xFF); 339 } 340 341 int lsmGetU16(u8 *aOut){ 342 return (aOut[0] << 8) + aOut[1]; 343 } 344 345 u32 lsmGetU32(u8 *aOut){ 346 return ((u32)aOut[0] << 24) 347 + ((u32)aOut[1] << 16) 348 + ((u32)aOut[2] << 8) 349 + ((u32)aOut[3]); 350 } 351 352 u64 lsmGetU64(u8 *aOut){ 353 return ((u64)aOut[0] << 56) 354 + ((u64)aOut[1] << 48) 355 + ((u64)aOut[2] << 40) 356 + ((u64)aOut[3] << 32) 357 + ((u64)aOut[4] << 24) 358 + ((u32)aOut[5] << 16) 359 + ((u32)aOut[6] << 8) 360 + ((u32)aOut[7]); 361 } 362 363 void lsmPutU64(u8 *aOut, u64 nVal){ 364 aOut[0] = (u8)((nVal>>56) & 0xFF); 365 aOut[1] = (u8)((nVal>>48) & 0xFF); 366 aOut[2] = (u8)((nVal>>40) & 0xFF); 367 aOut[3] = (u8)((nVal>>32) & 0xFF); 368 aOut[4] = (u8)((nVal>>24) & 0xFF); 369 aOut[5] = (u8)((nVal>>16) & 0xFF); 370 aOut[6] = (u8)((nVal>> 8) & 0xFF); 371 aOut[7] = (u8)((nVal ) & 0xFF); 372 } 373 374 static int sortedBlobGrow(lsm_env *pEnv, Blob *pBlob, int nData){ 375 assert( pBlob->pEnv==pEnv || (pBlob->pEnv==0 && pBlob->pData==0) ); 376 if( pBlob->nAlloc<nData ){ 377 pBlob->pData = lsmReallocOrFree(pEnv, pBlob->pData, nData); 378 if( !pBlob->pData ) return LSM_NOMEM_BKPT; 379 pBlob->nAlloc = nData; 380 pBlob->pEnv = pEnv; 381 } 382 return LSM_OK; 383 } 384 385 static int sortedBlobSet(lsm_env *pEnv, Blob *pBlob, void *pData, int nData){ 386 if( sortedBlobGrow(pEnv, pBlob, nData) ) return LSM_NOMEM; 387 memcpy(pBlob->pData, pData, nData); 388 pBlob->nData = nData; 389 return LSM_OK; 390 } 391 392 #if 0 393 static int sortedBlobCopy(Blob *pDest, Blob *pSrc){ 394 return sortedBlobSet(pDest, pSrc->pData, pSrc->nData); 395 } 396 #endif 397 398 static void sortedBlobFree(Blob *pBlob){ 399 assert( pBlob->pEnv || 
pBlob->pData==0 ); 400 if( pBlob->pData ) lsmFree(pBlob->pEnv, pBlob->pData); 401 memset(pBlob, 0, sizeof(Blob)); 402 } 403 404 static int sortedReadData( 405 Segment *pSeg, 406 Page *pPg, 407 int iOff, 408 int nByte, 409 void **ppData, 410 Blob *pBlob 411 ){ 412 int rc = LSM_OK; 413 int iEnd; 414 int nData; 415 int nCell; 416 u8 *aData; 417 418 aData = fsPageData(pPg, &nData); 419 nCell = lsmGetU16(&aData[SEGMENT_NRECORD_OFFSET(nData)]); 420 iEnd = SEGMENT_EOF(nData, nCell); 421 assert( iEnd>0 && iEnd<nData ); 422 423 if( iOff+nByte<=iEnd ){ 424 *ppData = (void *)&aData[iOff]; 425 }else{ 426 int nRem = nByte; 427 int i = iOff; 428 u8 *aDest; 429 430 /* Make sure the blob is big enough to store the value being loaded. */ 431 rc = sortedBlobGrow(lsmPageEnv(pPg), pBlob, nByte); 432 if( rc!=LSM_OK ) return rc; 433 pBlob->nData = nByte; 434 aDest = (u8 *)pBlob->pData; 435 *ppData = pBlob->pData; 436 437 /* Increment the pointer pages ref-count. */ 438 lsmFsPageRef(pPg); 439 440 while( rc==LSM_OK ){ 441 Page *pNext; 442 int flags; 443 444 /* Copy data from pPg into the output buffer. 
*/ 445 int nCopy = LSM_MIN(nRem, iEnd-i); 446 if( nCopy>0 ){ 447 memcpy(&aDest[nByte-nRem], &aData[i], nCopy); 448 nRem -= nCopy; 449 i += nCopy; 450 assert( nRem==0 || i==iEnd ); 451 } 452 assert( nRem>=0 ); 453 if( nRem==0 ) break; 454 i -= iEnd; 455 456 /* Grab the next page in the segment */ 457 458 do { 459 rc = lsmFsDbPageNext(pSeg, pPg, 1, &pNext); 460 if( rc==LSM_OK && pNext==0 ){ 461 rc = LSM_CORRUPT_BKPT; 462 } 463 if( rc ) break; 464 lsmFsPageRelease(pPg); 465 pPg = pNext; 466 aData = fsPageData(pPg, &nData); 467 flags = lsmGetU16(&aData[SEGMENT_FLAGS_OFFSET(nData)]); 468 }while( flags&SEGMENT_BTREE_FLAG ); 469 470 iEnd = SEGMENT_EOF(nData, lsmGetU16(&aData[nData-2])); 471 assert( iEnd>0 && iEnd<nData ); 472 } 473 474 lsmFsPageRelease(pPg); 475 } 476 477 return rc; 478 } 479 480 static int pageGetNRec(u8 *aData, int nData){ 481 return (int)lsmGetU16(&aData[SEGMENT_NRECORD_OFFSET(nData)]); 482 } 483 484 static Pgno pageGetPtr(u8 *aData, int nData){ 485 return (Pgno)lsmGetU64(&aData[SEGMENT_POINTER_OFFSET(nData)]); 486 } 487 488 static int pageGetFlags(u8 *aData, int nData){ 489 return (int)lsmGetU16(&aData[SEGMENT_FLAGS_OFFSET(nData)]); 490 } 491 492 static u8 *pageGetCell(u8 *aData, int nData, int iCell){ 493 return &aData[lsmGetU16(&aData[SEGMENT_CELLPTR_OFFSET(nData, iCell)])]; 494 } 495 496 /* 497 ** Return the number of cells on page pPg. 498 */ 499 static int pageObjGetNRec(Page *pPg){ 500 int nData; 501 u8 *aData = lsmFsPageData(pPg, &nData); 502 return pageGetNRec(aData, nData); 503 } 504 505 /* 506 ** Return the decoded (possibly relative) pointer value stored in cell 507 ** iCell from page aData/nData. 
508 */ 509 static Pgno pageGetRecordPtr(u8 *aData, int nData, int iCell){ 510 Pgno iRet; /* Return value */ 511 u8 *aCell; /* Pointer to cell iCell */ 512 513 assert( iCell<pageGetNRec(aData, nData) && iCell>=0 ); 514 aCell = pageGetCell(aData, nData, iCell); 515 lsmVarintGet64(&aCell[1], &iRet); 516 return iRet; 517 } 518 519 static u8 *pageGetKey( 520 Segment *pSeg, /* Segment pPg belongs to */ 521 Page *pPg, /* Page to read from */ 522 int iCell, /* Index of cell on page to read */ 523 int *piTopic, /* OUT: Topic associated with this key */ 524 int *pnKey, /* OUT: Size of key in bytes */ 525 Blob *pBlob /* If required, use this for dynamic memory */ 526 ){ 527 u8 *pKey; 528 int nDummy; 529 int eType; 530 u8 *aData; 531 int nData; 532 533 aData = fsPageData(pPg, &nData); 534 535 assert( !(pageGetFlags(aData, nData) & SEGMENT_BTREE_FLAG) ); 536 assert( iCell<pageGetNRec(aData, nData) ); 537 538 pKey = pageGetCell(aData, nData, iCell); 539 eType = *pKey++; 540 pKey += lsmVarintGet32(pKey, &nDummy); 541 pKey += lsmVarintGet32(pKey, pnKey); 542 if( rtIsWrite(eType) ){ 543 pKey += lsmVarintGet32(pKey, &nDummy); 544 } 545 *piTopic = rtTopic(eType); 546 547 sortedReadData(pSeg, pPg, pKey-aData, *pnKey, (void **)&pKey, pBlob); 548 return pKey; 549 } 550 551 static int pageGetKeyCopy( 552 lsm_env *pEnv, /* Environment handle */ 553 Segment *pSeg, /* Segment pPg belongs to */ 554 Page *pPg, /* Page to read from */ 555 int iCell, /* Index of cell on page to read */ 556 int *piTopic, /* OUT: Topic associated with this key */ 557 Blob *pBlob /* If required, use this for dynamic memory */ 558 ){ 559 int rc = LSM_OK; 560 int nKey; 561 u8 *aKey; 562 563 aKey = pageGetKey(pSeg, pPg, iCell, piTopic, &nKey, pBlob); 564 assert( (void *)aKey!=pBlob->pData || nKey==pBlob->nData ); 565 if( (void *)aKey!=pBlob->pData ){ 566 rc = sortedBlobSet(pEnv, pBlob, aKey, nKey); 567 } 568 569 return rc; 570 } 571 572 static Pgno pageGetBtreeRef(Page *pPg, int iKey){ 573 Pgno iRef; 574 u8 *aData; 
575 int nData; 576 u8 *aCell; 577 578 aData = fsPageData(pPg, &nData); 579 aCell = pageGetCell(aData, nData, iKey); 580 assert( aCell[0]==0 ); 581 aCell++; 582 aCell += lsmVarintGet64(aCell, &iRef); 583 lsmVarintGet64(aCell, &iRef); 584 assert( iRef>0 ); 585 return iRef; 586 } 587 588 #define GETVARINT64(a, i) (((i)=((u8*)(a))[0])<=240?1:lsmVarintGet64((a), &(i))) 589 #define GETVARINT32(a, i) (((i)=((u8*)(a))[0])<=240?1:lsmVarintGet32((a), &(i))) 590 591 static int pageGetBtreeKey( 592 Segment *pSeg, /* Segment page pPg belongs to */ 593 Page *pPg, 594 int iKey, 595 Pgno *piPtr, 596 int *piTopic, 597 void **ppKey, 598 int *pnKey, 599 Blob *pBlob 600 ){ 601 u8 *aData; 602 int nData; 603 u8 *aCell; 604 int eType; 605 606 aData = fsPageData(pPg, &nData); 607 assert( SEGMENT_BTREE_FLAG & pageGetFlags(aData, nData) ); 608 assert( iKey>=0 && iKey<pageGetNRec(aData, nData) ); 609 610 aCell = pageGetCell(aData, nData, iKey); 611 eType = *aCell++; 612 aCell += GETVARINT64(aCell, *piPtr); 613 614 if( eType==0 ){ 615 int rc; 616 Pgno iRef; /* Page number of referenced page */ 617 Page *pRef; 618 aCell += GETVARINT64(aCell, iRef); 619 rc = lsmFsDbPageGet(lsmPageFS(pPg), pSeg, iRef, &pRef); 620 if( rc!=LSM_OK ) return rc; 621 pageGetKeyCopy(lsmPageEnv(pPg), pSeg, pRef, 0, &eType, pBlob); 622 lsmFsPageRelease(pRef); 623 *ppKey = pBlob->pData; 624 *pnKey = pBlob->nData; 625 }else{ 626 aCell += GETVARINT32(aCell, *pnKey); 627 *ppKey = aCell; 628 } 629 if( piTopic ) *piTopic = rtTopic(eType); 630 631 return LSM_OK; 632 } 633 634 static int btreeCursorLoadKey(BtreeCursor *pCsr){ 635 int rc = LSM_OK; 636 if( pCsr->iPg<0 ){ 637 pCsr->pKey = 0; 638 pCsr->nKey = 0; 639 pCsr->eType = 0; 640 }else{ 641 Pgno dummy; 642 int iPg = pCsr->iPg; 643 int iCell = pCsr->aPg[iPg].iCell; 644 while( iCell<0 && (--iPg)>=0 ){ 645 iCell = pCsr->aPg[iPg].iCell-1; 646 } 647 if( iPg<0 || iCell<0 ) return LSM_CORRUPT_BKPT; 648 649 rc = pageGetBtreeKey( 650 pCsr->pSeg, 651 pCsr->aPg[iPg].pPage, iCell, 652 
&dummy, &pCsr->eType, &pCsr->pKey, &pCsr->nKey, &pCsr->blob 653 ); 654 pCsr->eType |= LSM_SEPARATOR; 655 } 656 657 return rc; 658 } 659 660 static int btreeCursorPtr(u8 *aData, int nData, int iCell){ 661 int nCell; 662 663 nCell = pageGetNRec(aData, nData); 664 if( iCell>=nCell ){ 665 return (int)pageGetPtr(aData, nData); 666 } 667 return (int)pageGetRecordPtr(aData, nData, iCell); 668 } 669 670 static int btreeCursorNext(BtreeCursor *pCsr){ 671 int rc = LSM_OK; 672 673 BtreePg *pPg = &pCsr->aPg[pCsr->iPg]; 674 int nCell; 675 u8 *aData; 676 int nData; 677 678 assert( pCsr->iPg>=0 ); 679 assert( pCsr->iPg==pCsr->nDepth-1 ); 680 681 aData = fsPageData(pPg->pPage, &nData); 682 nCell = pageGetNRec(aData, nData); 683 assert( pPg->iCell<=nCell ); 684 pPg->iCell++; 685 if( pPg->iCell==nCell ){ 686 Pgno iLoad; 687 688 /* Up to parent. */ 689 lsmFsPageRelease(pPg->pPage); 690 pPg->pPage = 0; 691 pCsr->iPg--; 692 while( pCsr->iPg>=0 ){ 693 pPg = &pCsr->aPg[pCsr->iPg]; 694 aData = fsPageData(pPg->pPage, &nData); 695 if( pPg->iCell<pageGetNRec(aData, nData) ) break; 696 lsmFsPageRelease(pPg->pPage); 697 pCsr->iPg--; 698 } 699 700 /* Read the key */ 701 rc = btreeCursorLoadKey(pCsr); 702 703 /* Unless the cursor is at EOF, descend to cell -1 (yes, negative one) of 704 ** the left-most most descendent. 
*/ 705 if( pCsr->iPg>=0 ){ 706 pCsr->aPg[pCsr->iPg].iCell++; 707 708 iLoad = btreeCursorPtr(aData, nData, pPg->iCell); 709 do { 710 Page *pLoad; 711 pCsr->iPg++; 712 rc = lsmFsDbPageGet(pCsr->pFS, pCsr->pSeg, iLoad, &pLoad); 713 pCsr->aPg[pCsr->iPg].pPage = pLoad; 714 pCsr->aPg[pCsr->iPg].iCell = 0; 715 if( rc==LSM_OK ){ 716 if( pCsr->iPg==(pCsr->nDepth-1) ) break; 717 aData = fsPageData(pLoad, &nData); 718 iLoad = btreeCursorPtr(aData, nData, 0); 719 } 720 }while( rc==LSM_OK && pCsr->iPg<(pCsr->nDepth-1) ); 721 pCsr->aPg[pCsr->iPg].iCell = -1; 722 } 723 724 }else{ 725 rc = btreeCursorLoadKey(pCsr); 726 } 727 728 if( rc==LSM_OK && pCsr->iPg>=0 ){ 729 aData = fsPageData(pCsr->aPg[pCsr->iPg].pPage, &nData); 730 pCsr->iPtr = btreeCursorPtr(aData, nData, pCsr->aPg[pCsr->iPg].iCell+1); 731 } 732 733 return rc; 734 } 735 736 static void btreeCursorFree(BtreeCursor *pCsr){ 737 if( pCsr ){ 738 int i; 739 lsm_env *pEnv = lsmFsEnv(pCsr->pFS); 740 for(i=0; i<=pCsr->iPg; i++){ 741 lsmFsPageRelease(pCsr->aPg[i].pPage); 742 } 743 sortedBlobFree(&pCsr->blob); 744 lsmFree(pEnv, pCsr->aPg); 745 lsmFree(pEnv, pCsr); 746 } 747 } 748 749 static int btreeCursorFirst(BtreeCursor *pCsr){ 750 int rc; 751 752 Page *pPg = 0; 753 FileSystem *pFS = pCsr->pFS; 754 int iPg = (int)pCsr->pSeg->iRoot; 755 756 do { 757 rc = lsmFsDbPageGet(pFS, pCsr->pSeg, iPg, &pPg); 758 assert( (rc==LSM_OK)==(pPg!=0) ); 759 if( rc==LSM_OK ){ 760 u8 *aData; 761 int nData; 762 int flags; 763 764 aData = fsPageData(pPg, &nData); 765 flags = pageGetFlags(aData, nData); 766 if( (flags & SEGMENT_BTREE_FLAG)==0 ) break; 767 768 if( (pCsr->nDepth % 8)==0 ){ 769 int nNew = pCsr->nDepth + 8; 770 pCsr->aPg = (BtreePg *)lsmReallocOrFreeRc( 771 lsmFsEnv(pFS), pCsr->aPg, sizeof(BtreePg) * nNew, &rc 772 ); 773 if( rc==LSM_OK ){ 774 memset(&pCsr->aPg[pCsr->nDepth], 0, sizeof(BtreePg) * 8); 775 } 776 } 777 778 if( rc==LSM_OK ){ 779 assert( pCsr->aPg[pCsr->nDepth].iCell==0 ); 780 pCsr->aPg[pCsr->nDepth].pPage = pPg; 781 
pCsr->nDepth++; 782 iPg = (int)pageGetRecordPtr(aData, nData, 0); 783 } 784 } 785 }while( rc==LSM_OK ); 786 lsmFsPageRelease(pPg); 787 pCsr->iPg = pCsr->nDepth-1; 788 789 if( rc==LSM_OK && pCsr->nDepth ){ 790 pCsr->aPg[pCsr->iPg].iCell = -1; 791 rc = btreeCursorNext(pCsr); 792 } 793 794 return rc; 795 } 796 797 static void btreeCursorPosition(BtreeCursor *pCsr, MergeInput *p){ 798 if( pCsr->iPg>=0 ){ 799 p->iPg = lsmFsPageNumber(pCsr->aPg[pCsr->iPg].pPage); 800 p->iCell = ((pCsr->aPg[pCsr->iPg].iCell + 1) << 8) + pCsr->nDepth; 801 }else{ 802 p->iPg = 0; 803 p->iCell = 0; 804 } 805 } 806 807 static void btreeCursorSplitkey(BtreeCursor *pCsr, MergeInput *p){ 808 int iCell = pCsr->aPg[pCsr->iPg].iCell; 809 if( iCell>=0 ){ 810 p->iCell = iCell; 811 p->iPg = lsmFsPageNumber(pCsr->aPg[pCsr->iPg].pPage); 812 }else{ 813 int i; 814 for(i=pCsr->iPg-1; i>=0; i--){ 815 if( pCsr->aPg[i].iCell>0 ) break; 816 } 817 assert( i>=0 ); 818 p->iCell = pCsr->aPg[i].iCell-1; 819 p->iPg = lsmFsPageNumber(pCsr->aPg[i].pPage); 820 } 821 } 822 823 static int sortedKeyCompare( 824 int (*xCmp)(void *, int, void *, int), 825 int iLhsTopic, void *pLhsKey, int nLhsKey, 826 int iRhsTopic, void *pRhsKey, int nRhsKey 827 ){ 828 int res = iLhsTopic - iRhsTopic; 829 if( res==0 ){ 830 res = xCmp(pLhsKey, nLhsKey, pRhsKey, nRhsKey); 831 } 832 return res; 833 } 834 835 static int btreeCursorRestore( 836 BtreeCursor *pCsr, 837 int (*xCmp)(void *, int, void *, int), 838 MergeInput *p 839 ){ 840 int rc = LSM_OK; 841 842 if( p->iPg ){ 843 lsm_env *pEnv = lsmFsEnv(pCsr->pFS); 844 int iCell; /* Current cell number on leaf page */ 845 Pgno iLeaf; /* Page number of current leaf page */ 846 int nDepth; /* Depth of b-tree structure */ 847 Segment *pSeg = pCsr->pSeg; 848 849 /* Decode the MergeInput structure */ 850 iLeaf = p->iPg; 851 nDepth = (p->iCell & 0x00FF); 852 iCell = (p->iCell >> 8) - 1; 853 854 /* Allocate the BtreeCursor.aPg[] array */ 855 assert( pCsr->aPg==0 ); 856 pCsr->aPg = (BtreePg 
*)lsmMallocZeroRc(pEnv, sizeof(BtreePg) * nDepth, &rc); 857 858 /* Populate the last entry of the aPg[] array */ 859 if( rc==LSM_OK ){ 860 Page **pp = &pCsr->aPg[nDepth-1].pPage; 861 pCsr->iPg = nDepth-1; 862 pCsr->nDepth = nDepth; 863 pCsr->aPg[pCsr->iPg].iCell = iCell; 864 rc = lsmFsDbPageGet(pCsr->pFS, pSeg, iLeaf, pp); 865 } 866 867 /* Populate any other aPg[] array entries */ 868 if( rc==LSM_OK && nDepth>1 ){ 869 Blob blob = {0,0,0}; 870 void *pSeek; 871 int nSeek; 872 int iTopicSeek; 873 int iPg = 0; 874 int iLoad = (int)pSeg->iRoot; 875 Page *pPg = pCsr->aPg[nDepth-1].pPage; 876 877 if( pageObjGetNRec(pPg)==0 ){ 878 /* This can happen when pPg is the right-most leaf in the b-tree. 879 ** In this case, set the iTopicSeek/pSeek/nSeek key to a value 880 ** greater than any real key. */ 881 assert( iCell==-1 ); 882 iTopicSeek = 1000; 883 pSeek = 0; 884 nSeek = 0; 885 }else{ 886 Pgno dummy; 887 rc = pageGetBtreeKey(pSeg, pPg, 888 0, &dummy, &iTopicSeek, &pSeek, &nSeek, &pCsr->blob 889 ); 890 } 891 892 do { 893 Page *pPg2; 894 rc = lsmFsDbPageGet(pCsr->pFS, pSeg, iLoad, &pPg2); 895 assert( rc==LSM_OK || pPg2==0 ); 896 if( rc==LSM_OK ){ 897 u8 *aData; /* Buffer containing page data */ 898 int nData; /* Size of aData[] in bytes */ 899 int iMin; 900 int iMax; 901 int iCell2; 902 903 aData = fsPageData(pPg2, &nData); 904 assert( (pageGetFlags(aData, nData) & SEGMENT_BTREE_FLAG) ); 905 906 iLoad = (int)pageGetPtr(aData, nData); 907 iCell2 = pageGetNRec(aData, nData); 908 iMax = iCell2-1; 909 iMin = 0; 910 911 while( iMax>=iMin ){ 912 int iTry = (iMin+iMax)/2; 913 void *pKey; int nKey; /* Key for cell iTry */ 914 int iTopic; /* Topic for key pKeyT/nKeyT */ 915 Pgno iPtr; /* Pointer for cell iTry */ 916 int res; /* (pSeek - pKeyT) */ 917 918 rc = pageGetBtreeKey( 919 pSeg, pPg2, iTry, &iPtr, &iTopic, &pKey, &nKey, &blob 920 ); 921 if( rc!=LSM_OK ) break; 922 923 res = sortedKeyCompare( 924 xCmp, iTopicSeek, pSeek, nSeek, iTopic, pKey, nKey 925 ); 926 assert( res!=0 ); 
927 928 if( res<0 ){ 929 iLoad = (int)iPtr; 930 iCell2 = iTry; 931 iMax = iTry-1; 932 }else{ 933 iMin = iTry+1; 934 } 935 } 936 937 pCsr->aPg[iPg].pPage = pPg2; 938 pCsr->aPg[iPg].iCell = iCell2; 939 iPg++; 940 assert( iPg!=nDepth-1 941 || lsmFsRedirectPage(pCsr->pFS, pSeg->pRedirect, iLoad)==iLeaf 942 ); 943 } 944 }while( rc==LSM_OK && iPg<(nDepth-1) ); 945 sortedBlobFree(&blob); 946 } 947 948 /* Load the current key and pointer */ 949 if( rc==LSM_OK ){ 950 BtreePg *pBtreePg; 951 u8 *aData; 952 int nData; 953 954 pBtreePg = &pCsr->aPg[pCsr->iPg]; 955 aData = fsPageData(pBtreePg->pPage, &nData); 956 pCsr->iPtr = btreeCursorPtr(aData, nData, pBtreePg->iCell+1); 957 if( pBtreePg->iCell<0 ){ 958 Pgno dummy; 959 int i; 960 for(i=pCsr->iPg-1; i>=0; i--){ 961 if( pCsr->aPg[i].iCell>0 ) break; 962 } 963 assert( i>=0 ); 964 rc = pageGetBtreeKey(pSeg, 965 pCsr->aPg[i].pPage, pCsr->aPg[i].iCell-1, 966 &dummy, &pCsr->eType, &pCsr->pKey, &pCsr->nKey, &pCsr->blob 967 ); 968 pCsr->eType |= LSM_SEPARATOR; 969 970 }else{ 971 rc = btreeCursorLoadKey(pCsr); 972 } 973 } 974 } 975 return rc; 976 } 977 978 static int btreeCursorNew( 979 lsm_db *pDb, 980 Segment *pSeg, 981 BtreeCursor **ppCsr 982 ){ 983 int rc = LSM_OK; 984 BtreeCursor *pCsr; 985 986 assert( pSeg->iRoot ); 987 pCsr = lsmMallocZeroRc(pDb->pEnv, sizeof(BtreeCursor), &rc); 988 if( pCsr ){ 989 pCsr->pFS = pDb->pFS; 990 pCsr->pSeg = pSeg; 991 pCsr->iPg = -1; 992 } 993 994 *ppCsr = pCsr; 995 return rc; 996 } 997 998 static void segmentPtrSetPage(SegmentPtr *pPtr, Page *pNext){ 999 lsmFsPageRelease(pPtr->pPg); 1000 if( pNext ){ 1001 int nData; 1002 u8 *aData = fsPageData(pNext, &nData); 1003 pPtr->nCell = pageGetNRec(aData, nData); 1004 pPtr->flags = (u16)pageGetFlags(aData, nData); 1005 pPtr->iPtr = pageGetPtr(aData, nData); 1006 } 1007 pPtr->pPg = pNext; 1008 } 1009 1010 /* 1011 ** Load a new page into the SegmentPtr object pPtr. 
1012 */ 1013 static int segmentPtrLoadPage( 1014 FileSystem *pFS, 1015 SegmentPtr *pPtr, /* Load page into this SegmentPtr object */ 1016 int iNew /* Page number of new page */ 1017 ){ 1018 Page *pPg = 0; /* The new page */ 1019 int rc; /* Return Code */ 1020 1021 rc = lsmFsDbPageGet(pFS, pPtr->pSeg, iNew, &pPg); 1022 assert( rc==LSM_OK || pPg==0 ); 1023 segmentPtrSetPage(pPtr, pPg); 1024 1025 return rc; 1026 } 1027 1028 static int segmentPtrReadData( 1029 SegmentPtr *pPtr, 1030 int iOff, 1031 int nByte, 1032 void **ppData, 1033 Blob *pBlob 1034 ){ 1035 return sortedReadData(pPtr->pSeg, pPtr->pPg, iOff, nByte, ppData, pBlob); 1036 } 1037 1038 static int segmentPtrNextPage( 1039 SegmentPtr *pPtr, /* Load page into this SegmentPtr object */ 1040 int eDir /* +1 for next(), -1 for prev() */ 1041 ){ 1042 Page *pNext; /* New page to load */ 1043 int rc; /* Return code */ 1044 1045 assert( eDir==1 || eDir==-1 ); 1046 assert( pPtr->pPg ); 1047 assert( pPtr->pSeg || eDir>0 ); 1048 1049 rc = lsmFsDbPageNext(pPtr->pSeg, pPtr->pPg, eDir, &pNext); 1050 assert( rc==LSM_OK || pNext==0 ); 1051 segmentPtrSetPage(pPtr, pNext); 1052 return rc; 1053 } 1054 1055 static int segmentPtrLoadCell( 1056 SegmentPtr *pPtr, /* Load page into this SegmentPtr object */ 1057 int iNew /* Cell number of new cell */ 1058 ){ 1059 int rc = LSM_OK; 1060 if( pPtr->pPg ){ 1061 u8 *aData; /* Pointer to page data buffer */ 1062 int iOff; /* Offset in aData[] to read from */ 1063 int nPgsz; /* Size of page (aData[]) in bytes */ 1064 1065 assert( iNew<pPtr->nCell ); 1066 pPtr->iCell = iNew; 1067 aData = fsPageData(pPtr->pPg, &nPgsz); 1068 iOff = lsmGetU16(&aData[SEGMENT_CELLPTR_OFFSET(nPgsz, pPtr->iCell)]); 1069 pPtr->eType = aData[iOff]; 1070 iOff++; 1071 iOff += GETVARINT64(&aData[iOff], pPtr->iPgPtr); 1072 iOff += GETVARINT32(&aData[iOff], pPtr->nKey); 1073 if( rtIsWrite(pPtr->eType) ){ 1074 iOff += GETVARINT32(&aData[iOff], pPtr->nVal); 1075 } 1076 assert( pPtr->nKey>=0 ); 1077 1078 rc = 
segmentPtrReadData( 1079 pPtr, iOff, pPtr->nKey, &pPtr->pKey, &pPtr->blob1 1080 ); 1081 if( rc==LSM_OK && rtIsWrite(pPtr->eType) ){ 1082 rc = segmentPtrReadData( 1083 pPtr, iOff+pPtr->nKey, pPtr->nVal, &pPtr->pVal, &pPtr->blob2 1084 ); 1085 }else{ 1086 pPtr->nVal = 0; 1087 pPtr->pVal = 0; 1088 } 1089 } 1090 1091 return rc; 1092 } 1093 1094 1095 static Segment *sortedSplitkeySegment(Level *pLevel){ 1096 Merge *pMerge = pLevel->pMerge; 1097 MergeInput *p = &pMerge->splitkey; 1098 Segment *pSeg; 1099 int i; 1100 1101 for(i=0; i<pMerge->nInput; i++){ 1102 if( p->iPg==pMerge->aInput[i].iPg ) break; 1103 } 1104 if( pMerge->nInput==(pLevel->nRight+1) && i>=(pMerge->nInput-1) ){ 1105 pSeg = &pLevel->pNext->lhs; 1106 }else{ 1107 pSeg = &pLevel->aRhs[i]; 1108 } 1109 1110 return pSeg; 1111 } 1112 1113 static void sortedSplitkey(lsm_db *pDb, Level *pLevel, int *pRc){ 1114 Segment *pSeg; 1115 Page *pPg = 0; 1116 lsm_env *pEnv = pDb->pEnv; /* Environment handle */ 1117 int rc = *pRc; 1118 Merge *pMerge = pLevel->pMerge; 1119 1120 pSeg = sortedSplitkeySegment(pLevel); 1121 if( rc==LSM_OK ){ 1122 rc = lsmFsDbPageGet(pDb->pFS, pSeg, pMerge->splitkey.iPg, &pPg); 1123 } 1124 if( rc==LSM_OK ){ 1125 int iTopic; 1126 Blob blob = {0, 0, 0, 0}; 1127 u8 *aData; 1128 int nData; 1129 1130 aData = lsmFsPageData(pPg, &nData); 1131 if( pageGetFlags(aData, nData) & SEGMENT_BTREE_FLAG ){ 1132 void *pKey; 1133 int nKey; 1134 Pgno dummy; 1135 rc = pageGetBtreeKey(pSeg, 1136 pPg, pMerge->splitkey.iCell, &dummy, &iTopic, &pKey, &nKey, &blob 1137 ); 1138 if( rc==LSM_OK && blob.pData!=pKey ){ 1139 rc = sortedBlobSet(pEnv, &blob, pKey, nKey); 1140 } 1141 }else{ 1142 rc = pageGetKeyCopy( 1143 pEnv, pSeg, pPg, pMerge->splitkey.iCell, &iTopic, &blob 1144 ); 1145 } 1146 1147 pLevel->iSplitTopic = iTopic; 1148 pLevel->pSplitKey = blob.pData; 1149 pLevel->nSplitKey = blob.nData; 1150 lsmFsPageRelease(pPg); 1151 } 1152 1153 *pRc = rc; 1154 } 1155 1156 /* 1157 ** Reset a segment cursor. 
Also free its buffers if they are nThreshold 1158 ** bytes or larger in size. 1159 */ 1160 static void segmentPtrReset(SegmentPtr *pPtr, int nThreshold){ 1161 lsmFsPageRelease(pPtr->pPg); 1162 pPtr->pPg = 0; 1163 pPtr->nCell = 0; 1164 pPtr->pKey = 0; 1165 pPtr->nKey = 0; 1166 pPtr->pVal = 0; 1167 pPtr->nVal = 0; 1168 pPtr->eType = 0; 1169 pPtr->iCell = 0; 1170 if( pPtr->blob1.nAlloc>=nThreshold ) sortedBlobFree(&pPtr->blob1); 1171 if( pPtr->blob2.nAlloc>=nThreshold ) sortedBlobFree(&pPtr->blob2); 1172 } 1173 1174 static int segmentPtrIgnoreSeparators(MultiCursor *pCsr, SegmentPtr *pPtr){ 1175 return (pCsr->flags & CURSOR_READ_SEPARATORS)==0 1176 || (pPtr!=&pCsr->aPtr[pCsr->nPtr-1]); 1177 } 1178 1179 static int segmentPtrAdvance( 1180 MultiCursor *pCsr, 1181 SegmentPtr *pPtr, 1182 int bReverse 1183 ){ 1184 int eDir = (bReverse ? -1 : 1); 1185 Level *pLvl = pPtr->pLevel; 1186 do { 1187 int rc; 1188 int iCell; /* Number of new cell in page */ 1189 int svFlags = 0; /* SegmentPtr.eType before advance */ 1190 1191 iCell = pPtr->iCell + eDir; 1192 assert( pPtr->pPg ); 1193 assert( iCell<=pPtr->nCell && iCell>=-1 ); 1194 1195 if( bReverse && pPtr->pSeg!=&pPtr->pLevel->lhs ){ 1196 svFlags = pPtr->eType; 1197 assert( svFlags ); 1198 } 1199 1200 if( iCell>=pPtr->nCell || iCell<0 ){ 1201 do { 1202 rc = segmentPtrNextPage(pPtr, eDir); 1203 }while( rc==LSM_OK 1204 && pPtr->pPg 1205 && (pPtr->nCell==0 || (pPtr->flags & SEGMENT_BTREE_FLAG) ) 1206 ); 1207 if( rc!=LSM_OK ) return rc; 1208 iCell = bReverse ? 
(pPtr->nCell-1) : 0; 1209 } 1210 rc = segmentPtrLoadCell(pPtr, iCell); 1211 if( rc!=LSM_OK ) return rc; 1212 1213 if( svFlags && pPtr->pPg ){ 1214 int res = sortedKeyCompare(pCsr->pDb->xCmp, 1215 rtTopic(pPtr->eType), pPtr->pKey, pPtr->nKey, 1216 pLvl->iSplitTopic, pLvl->pSplitKey, pLvl->nSplitKey 1217 ); 1218 if( res<0 ) segmentPtrReset(pPtr, LSM_SEGMENTPTR_FREE_THRESHOLD); 1219 } 1220 1221 if( pPtr->pPg==0 && (svFlags & LSM_END_DELETE) ){ 1222 Segment *pSeg = pPtr->pSeg; 1223 rc = lsmFsDbPageGet(pCsr->pDb->pFS, pSeg, pSeg->iFirst, &pPtr->pPg); 1224 if( rc!=LSM_OK ) return rc; 1225 pPtr->eType = LSM_START_DELETE | LSM_POINT_DELETE; 1226 pPtr->eType |= (pLvl->iSplitTopic ? LSM_SYSTEMKEY : 0); 1227 pPtr->pKey = pLvl->pSplitKey; 1228 pPtr->nKey = pLvl->nSplitKey; 1229 } 1230 1231 }while( pCsr 1232 && pPtr->pPg 1233 && segmentPtrIgnoreSeparators(pCsr, pPtr) 1234 && rtIsSeparator(pPtr->eType) 1235 ); 1236 1237 return LSM_OK; 1238 } 1239 1240 static void segmentPtrEndPage( 1241 FileSystem *pFS, 1242 SegmentPtr *pPtr, 1243 int bLast, 1244 int *pRc 1245 ){ 1246 if( *pRc==LSM_OK ){ 1247 Segment *pSeg = pPtr->pSeg; 1248 Page *pNew = 0; 1249 if( bLast ){ 1250 *pRc = lsmFsDbPageLast(pFS, pSeg, &pNew); 1251 }else{ 1252 *pRc = lsmFsDbPageGet(pFS, pSeg, pSeg->iFirst, &pNew); 1253 } 1254 segmentPtrSetPage(pPtr, pNew); 1255 } 1256 } 1257 1258 1259 /* 1260 ** Try to move the segment pointer passed as the second argument so that it 1261 ** points at either the first (bLast==0) or last (bLast==1) cell in the valid 1262 ** region of the segment defined by pPtr->iFirst and pPtr->iLast. 1263 ** 1264 ** Return LSM_OK if successful or an lsm error code if something goes 1265 ** wrong (IO error, OOM etc.). 
1266 */ 1267 static int segmentPtrEnd(MultiCursor *pCsr, SegmentPtr *pPtr, int bLast){ 1268 Level *pLvl = pPtr->pLevel; 1269 int rc = LSM_OK; 1270 FileSystem *pFS = pCsr->pDb->pFS; 1271 int bIgnore; 1272 1273 segmentPtrEndPage(pFS, pPtr, bLast, &rc); 1274 while( rc==LSM_OK && pPtr->pPg 1275 && (pPtr->nCell==0 || (pPtr->flags & SEGMENT_BTREE_FLAG)) 1276 ){ 1277 rc = segmentPtrNextPage(pPtr, (bLast ? -1 : 1)); 1278 } 1279 1280 if( rc==LSM_OK && pPtr->pPg ){ 1281 rc = segmentPtrLoadCell(pPtr, bLast ? (pPtr->nCell-1) : 0); 1282 if( rc==LSM_OK && bLast && pPtr->pSeg!=&pLvl->lhs ){ 1283 int res = sortedKeyCompare(pCsr->pDb->xCmp, 1284 rtTopic(pPtr->eType), pPtr->pKey, pPtr->nKey, 1285 pLvl->iSplitTopic, pLvl->pSplitKey, pLvl->nSplitKey 1286 ); 1287 if( res<0 ) segmentPtrReset(pPtr, LSM_SEGMENTPTR_FREE_THRESHOLD); 1288 } 1289 } 1290 1291 bIgnore = segmentPtrIgnoreSeparators(pCsr, pPtr); 1292 if( rc==LSM_OK && pPtr->pPg && bIgnore && rtIsSeparator(pPtr->eType) ){ 1293 rc = segmentPtrAdvance(pCsr, pPtr, bLast); 1294 } 1295 1296 #if 0 1297 if( bLast && rc==LSM_OK && pPtr->pPg 1298 && pPtr->pSeg==&pLvl->lhs 1299 && pLvl->nRight && (pPtr->eType & LSM_START_DELETE) 1300 ){ 1301 pPtr->iCell++; 1302 pPtr->eType = LSM_END_DELETE | (pLvl->iSplitTopic); 1303 pPtr->pKey = pLvl->pSplitKey; 1304 pPtr->nKey = pLvl->nSplitKey; 1305 pPtr->pVal = 0; 1306 pPtr->nVal = 0; 1307 } 1308 #endif 1309 1310 return rc; 1311 } 1312 1313 static void segmentPtrKey(SegmentPtr *pPtr, void **ppKey, int *pnKey){ 1314 assert( pPtr->pPg ); 1315 *ppKey = pPtr->pKey; 1316 *pnKey = pPtr->nKey; 1317 } 1318 1319 #if 0 /* NOT USED */ 1320 static char *keyToString(lsm_env *pEnv, void *pKey, int nKey){ 1321 int i; 1322 u8 *aKey = (u8 *)pKey; 1323 char *zRet = (char *)lsmMalloc(pEnv, nKey+1); 1324 1325 for(i=0; i<nKey; i++){ 1326 zRet[i] = (char)(isalnum(aKey[i]) ? 
aKey[i] : '.'); 1327 } 1328 zRet[nKey] = '\0'; 1329 return zRet; 1330 } 1331 #endif 1332 1333 #if 0 /* NOT USED */ 1334 /* 1335 ** Check that the page that pPtr currently has loaded is the correct page 1336 ** to search for key (pKey/nKey). If it is, return 1. Otherwise, an assert 1337 ** fails and this function does not return. 1338 */ 1339 static int assertKeyLocation( 1340 MultiCursor *pCsr, 1341 SegmentPtr *pPtr, 1342 void *pKey, int nKey 1343 ){ 1344 lsm_env *pEnv = lsmFsEnv(pCsr->pDb->pFS); 1345 Blob blob = {0, 0, 0}; 1346 int eDir; 1347 int iTopic = 0; /* TODO: Fix me */ 1348 1349 for(eDir=-1; eDir<=1; eDir+=2){ 1350 Page *pTest = pPtr->pPg; 1351 1352 lsmFsPageRef(pTest); 1353 while( pTest ){ 1354 Segment *pSeg = pPtr->pSeg; 1355 Page *pNext; 1356 1357 int rc = lsmFsDbPageNext(pSeg, pTest, eDir, &pNext); 1358 lsmFsPageRelease(pTest); 1359 if( rc ) return 1; 1360 pTest = pNext; 1361 1362 if( pTest ){ 1363 int nData; 1364 u8 *aData = fsPageData(pTest, &nData); 1365 int nCell = pageGetNRec(aData, nData); 1366 int flags = pageGetFlags(aData, nData); 1367 if( nCell && 0==(flags&SEGMENT_BTREE_FLAG) ){ 1368 int nPgKey; 1369 int iPgTopic; 1370 u8 *pPgKey; 1371 int res; 1372 int iCell; 1373 1374 iCell = ((eDir < 0) ? (nCell-1) : 0); 1375 pPgKey = pageGetKey(pSeg, pTest, iCell, &iPgTopic, &nPgKey, &blob); 1376 res = iTopic - iPgTopic; 1377 if( res==0 ) res = pCsr->pDb->xCmp(pKey, nKey, pPgKey, nPgKey); 1378 if( (eDir==1 && res>0) || (eDir==-1 && res<0) ){ 1379 /* Taking this branch means something has gone wrong. 
*/ 1380 char *zMsg = lsmMallocPrintf(pEnv, "Key \"%s\" is not on page %d", 1381 keyToString(pEnv, pKey, nKey), lsmFsPageNumber(pPtr->pPg) 1382 ); 1383 fprintf(stderr, "%s\n", zMsg); 1384 assert( !"assertKeyLocation() failed" ); 1385 } 1386 lsmFsPageRelease(pTest); 1387 pTest = 0; 1388 } 1389 } 1390 } 1391 } 1392 1393 sortedBlobFree(&blob); 1394 return 1; 1395 } 1396 #endif 1397 1398 #ifndef NDEBUG 1399 static int assertSeekResult( 1400 MultiCursor *pCsr, 1401 SegmentPtr *pPtr, 1402 int iTopic, 1403 void *pKey, 1404 int nKey, 1405 int eSeek 1406 ){ 1407 if( pPtr->pPg ){ 1408 int res; 1409 res = sortedKeyCompare(pCsr->pDb->xCmp, iTopic, pKey, nKey, 1410 rtTopic(pPtr->eType), pPtr->pKey, pPtr->nKey 1411 ); 1412 1413 if( eSeek==LSM_SEEK_EQ ) return (res==0); 1414 if( eSeek==LSM_SEEK_LE ) return (res>=0); 1415 if( eSeek==LSM_SEEK_GE ) return (res<=0); 1416 } 1417 1418 return 1; 1419 } 1420 #endif 1421 1422 static int segmentPtrSearchOversized( 1423 MultiCursor *pCsr, /* Cursor context */ 1424 SegmentPtr *pPtr, /* Pointer to seek */ 1425 int iTopic, /* Topic of key to search for */ 1426 void *pKey, int nKey /* Key to seek to */ 1427 ){ 1428 int (*xCmp)(void *, int, void *, int) = pCsr->pDb->xCmp; 1429 int rc = LSM_OK; 1430 1431 /* If the OVERSIZED flag is set, then there is no pointer in the 1432 ** upper level to the next page in the segment that contains at least 1433 ** one key. So compare the largest key on the current page with the 1434 ** key being sought (pKey/nKey). If (pKey/nKey) is larger, advance 1435 ** to the next page in the segment that contains at least one key. 1436 */ 1437 while( rc==LSM_OK && (pPtr->flags & PGFTR_SKIP_NEXT_FLAG) ){ 1438 u8 *pLastKey; 1439 int nLastKey; 1440 int iLastTopic; 1441 int res; /* Result of comparison */ 1442 Page *pNext; 1443 1444 /* Load the last key on the current page. 
*/ 1445 pLastKey = pageGetKey(pPtr->pSeg, 1446 pPtr->pPg, pPtr->nCell-1, &iLastTopic, &nLastKey, &pPtr->blob1 1447 ); 1448 1449 /* If the loaded key is >= than (pKey/nKey), break out of the loop. 1450 ** If (pKey/nKey) is present in this array, it must be on the current 1451 ** page. */ 1452 res = sortedKeyCompare( 1453 xCmp, iLastTopic, pLastKey, nLastKey, iTopic, pKey, nKey 1454 ); 1455 if( res>=0 ) break; 1456 1457 /* Advance to the next page that contains at least one key. */ 1458 pNext = pPtr->pPg; 1459 lsmFsPageRef(pNext); 1460 while( 1 ){ 1461 Page *pLoad; 1462 u8 *aData; int nData; 1463 1464 rc = lsmFsDbPageNext(pPtr->pSeg, pNext, 1, &pLoad); 1465 lsmFsPageRelease(pNext); 1466 pNext = pLoad; 1467 if( pNext==0 ) break; 1468 1469 assert( rc==LSM_OK ); 1470 aData = lsmFsPageData(pNext, &nData); 1471 if( (pageGetFlags(aData, nData) & SEGMENT_BTREE_FLAG)==0 1472 && pageGetNRec(aData, nData)>0 1473 ){ 1474 break; 1475 } 1476 } 1477 if( pNext==0 ) break; 1478 segmentPtrSetPage(pPtr, pNext); 1479 1480 /* This should probably be an LSM_CORRUPT error. 
*/ 1481 assert( rc!=LSM_OK || (pPtr->flags & PGFTR_SKIP_THIS_FLAG) ); 1482 } 1483 1484 return rc; 1485 } 1486 1487 static int ptrFwdPointer( 1488 Page *pPage, 1489 int iCell, 1490 Segment *pSeg, 1491 Pgno *piPtr, 1492 int *pbFound 1493 ){ 1494 Page *pPg = pPage; 1495 int iFirst = iCell; 1496 int rc = LSM_OK; 1497 1498 do { 1499 Page *pNext = 0; 1500 u8 *aData; 1501 int nData; 1502 1503 aData = lsmFsPageData(pPg, &nData); 1504 if( (pageGetFlags(aData, nData) & SEGMENT_BTREE_FLAG)==0 ){ 1505 int i; 1506 int nCell = pageGetNRec(aData, nData); 1507 for(i=iFirst; i<nCell; i++){ 1508 u8 eType = *pageGetCell(aData, nData, i); 1509 if( (eType & LSM_START_DELETE)==0 ){ 1510 *pbFound = 1; 1511 *piPtr = pageGetRecordPtr(aData, nData, i) + pageGetPtr(aData, nData); 1512 lsmFsPageRelease(pPg); 1513 return LSM_OK; 1514 } 1515 } 1516 } 1517 1518 rc = lsmFsDbPageNext(pSeg, pPg, 1, &pNext); 1519 lsmFsPageRelease(pPg); 1520 pPg = pNext; 1521 iFirst = 0; 1522 }while( pPg && rc==LSM_OK ); 1523 lsmFsPageRelease(pPg); 1524 1525 *pbFound = 0; 1526 return rc; 1527 } 1528 1529 static int sortedRhsFirst(MultiCursor *pCsr, Level *pLvl, SegmentPtr *pPtr){ 1530 int rc; 1531 rc = segmentPtrEnd(pCsr, pPtr, 0); 1532 while( pPtr->pPg && rc==LSM_OK ){ 1533 int res = sortedKeyCompare(pCsr->pDb->xCmp, 1534 pLvl->iSplitTopic, pLvl->pSplitKey, pLvl->nSplitKey, 1535 rtTopic(pPtr->eType), pPtr->pKey, pPtr->nKey 1536 ); 1537 if( res<=0 ) break; 1538 rc = segmentPtrAdvance(pCsr, pPtr, 0); 1539 } 1540 return rc; 1541 } 1542 1543 1544 /* 1545 ** This function is called as part of a SEEK_GE op on a multi-cursor if the 1546 ** FC pointer read from segment *pPtr comes from an entry with the 1547 ** LSM_START_DELETE flag set. In this case the pointer value cannot be 1548 ** trusted. Instead, the pointer that should be followed is that associated 1549 ** with the next entry in *pPtr that does not have LSM_START_DELETE set. 
1550 ** 1551 ** Why the pointers can't be trusted: 1552 ** 1553 ** 1554 ** 1555 ** TODO: This is a stop-gap solution: 1556 ** 1557 ** At the moment, this function is called from within segmentPtrSeek(), 1558 ** as part of the initial lsmMCursorSeek() call. However, consider a 1559 ** database where the following has occurred: 1560 ** 1561 ** 1. A range delete removes keys 1..9999 using a range delete. 1562 ** 2. Keys 1 through 9999 are reinserted. 1563 ** 3. The levels containing the ops in 1. and 2. above are merged. Call 1564 ** this level N. Level N contains FC pointers to level N+1. 1565 ** 1566 ** Then, if the user attempts to query for (key>=2 LIMIT 10), the 1567 ** lsmMCursorSeek() call will iterate through 9998 entries searching for a 1568 ** pointer down to the level N+1 that is never actually used. It would be 1569 ** much better if the multi-cursor could do this lazily - only seek to the 1570 ** level (N+1) page after the user has moved the cursor on level N passed 1571 ** the big range-delete. 1572 */ 1573 static int segmentPtrFwdPointer( 1574 MultiCursor *pCsr, /* Multi-cursor pPtr belongs to */ 1575 SegmentPtr *pPtr, /* Segment-pointer to extract FC ptr from */ 1576 Pgno *piPtr /* OUT: FC pointer value */ 1577 ){ 1578 Level *pLvl = pPtr->pLevel; 1579 Level *pNext = pLvl->pNext; 1580 Page *pPg = pPtr->pPg; 1581 int rc; 1582 int bFound; 1583 Pgno iOut = 0; 1584 1585 if( pPtr->pSeg==&pLvl->lhs || pPtr->pSeg==&pLvl->aRhs[pLvl->nRight-1] ){ 1586 if( pNext==0 1587 || (pNext->nRight==0 && pNext->lhs.iRoot) 1588 || (pNext->nRight!=0 && pNext->aRhs[0].iRoot) 1589 ){ 1590 /* Do nothing. The pointer will not be used anyway. */ 1591 return LSM_OK; 1592 } 1593 }else{ 1594 if( pPtr[1].pSeg->iRoot ){ 1595 return LSM_OK; 1596 } 1597 } 1598 1599 /* Search for a pointer within the current segment. 
*/ 1600 lsmFsPageRef(pPg); 1601 rc = ptrFwdPointer(pPg, pPtr->iCell, pPtr->pSeg, &iOut, &bFound); 1602 1603 if( rc==LSM_OK && bFound==0 ){ 1604 /* This case happens when pPtr points to the left-hand-side of a segment 1605 ** currently undergoing an incremental merge. In this case, jump to the 1606 ** oldest segment in the right-hand-side of the same level and continue 1607 ** searching. But - do not consider any keys smaller than the levels 1608 ** split-key. */ 1609 SegmentPtr ptr; 1610 1611 if( pPtr->pLevel->nRight==0 || pPtr->pSeg!=&pPtr->pLevel->lhs ){ 1612 return LSM_CORRUPT_BKPT; 1613 } 1614 1615 memset(&ptr, 0, sizeof(SegmentPtr)); 1616 ptr.pLevel = pPtr->pLevel; 1617 ptr.pSeg = &ptr.pLevel->aRhs[ptr.pLevel->nRight-1]; 1618 rc = sortedRhsFirst(pCsr, ptr.pLevel, &ptr); 1619 if( rc==LSM_OK ){ 1620 rc = ptrFwdPointer(ptr.pPg, ptr.iCell, ptr.pSeg, &iOut, &bFound); 1621 ptr.pPg = 0; 1622 } 1623 segmentPtrReset(&ptr, 0); 1624 } 1625 1626 *piPtr = iOut; 1627 return rc; 1628 } 1629 1630 static int segmentPtrSeek( 1631 MultiCursor *pCsr, /* Cursor context */ 1632 SegmentPtr *pPtr, /* Pointer to seek */ 1633 int iTopic, /* Key topic to seek to */ 1634 void *pKey, int nKey, /* Key to seek to */ 1635 int eSeek, /* Search bias - see above */ 1636 int *piPtr, /* OUT: FC pointer */ 1637 int *pbStop 1638 ){ 1639 int (*xCmp)(void *, int, void *, int) = pCsr->pDb->xCmp; 1640 int res = 0; /* Result of comparison operation */ 1641 int rc = LSM_OK; 1642 int iMin; 1643 int iMax; 1644 Pgno iPtrOut = 0; 1645 1646 /* If the current page contains an oversized entry, then there are no 1647 ** pointers to one or more of the subsequent pages in the sorted run. 1648 ** The following call ensures that the segment-ptr points to the correct 1649 ** page in this case. */ 1650 rc = segmentPtrSearchOversized(pCsr, pPtr, iTopic, pKey, nKey); 1651 iPtrOut = pPtr->iPtr; 1652 1653 /* Assert that this page is the right page of this segment for the key 1654 ** that we are searching for. 
Do this by loading page (iPg-1) and testing 1655 ** that pKey/nKey is greater than all keys on that page, and then by 1656 ** loading (iPg+1) and testing that pKey/nKey is smaller than all 1657 ** the keys it houses. 1658 ** 1659 ** TODO: With range-deletes in the tree, the test described above may fail. 1660 */ 1661 #if 0 1662 assert( assertKeyLocation(pCsr, pPtr, pKey, nKey) ); 1663 #endif 1664 1665 assert( pPtr->nCell>0 1666 || pPtr->pSeg->nSize==1 1667 || lsmFsDbPageIsLast(pPtr->pSeg, pPtr->pPg) 1668 ); 1669 if( pPtr->nCell==0 ){ 1670 segmentPtrReset(pPtr, LSM_SEGMENTPTR_FREE_THRESHOLD); 1671 }else{ 1672 iMin = 0; 1673 iMax = pPtr->nCell-1; 1674 1675 while( 1 ){ 1676 int iTry = (iMin+iMax)/2; 1677 void *pKeyT; int nKeyT; /* Key for cell iTry */ 1678 int iTopicT; 1679 1680 assert( iTry<iMax || iMin==iMax ); 1681 1682 rc = segmentPtrLoadCell(pPtr, iTry); 1683 if( rc!=LSM_OK ) break; 1684 1685 segmentPtrKey(pPtr, &pKeyT, &nKeyT); 1686 iTopicT = rtTopic(pPtr->eType); 1687 1688 res = sortedKeyCompare(xCmp, iTopicT, pKeyT, nKeyT, iTopic, pKey, nKey); 1689 if( res<=0 ){ 1690 iPtrOut = pPtr->iPtr + pPtr->iPgPtr; 1691 } 1692 1693 if( res==0 || iMin==iMax ){ 1694 break; 1695 }else if( res>0 ){ 1696 iMax = LSM_MAX(iTry-1, iMin); 1697 }else{ 1698 iMin = iTry+1; 1699 } 1700 } 1701 1702 if( rc==LSM_OK ){ 1703 assert( res==0 || (iMin==iMax && iMin>=0 && iMin<pPtr->nCell) ); 1704 if( res ){ 1705 rc = segmentPtrLoadCell(pPtr, iMin); 1706 } 1707 assert( rc!=LSM_OK || res>0 || iPtrOut==(pPtr->iPtr + pPtr->iPgPtr) ); 1708 1709 if( rc==LSM_OK ){ 1710 switch( eSeek ){ 1711 case LSM_SEEK_EQ: { 1712 int eType = pPtr->eType; 1713 if( (res<0 && (eType & LSM_START_DELETE)) 1714 || (res>0 && (eType & LSM_END_DELETE)) 1715 || (res==0 && (eType & LSM_POINT_DELETE)) 1716 ){ 1717 *pbStop = 1; 1718 }else if( res==0 && (eType & LSM_INSERT) ){ 1719 lsm_env *pEnv = pCsr->pDb->pEnv; 1720 *pbStop = 1; 1721 pCsr->eType = pPtr->eType; 1722 rc = sortedBlobSet(pEnv, &pCsr->key, pPtr->pKey, pPtr->nKey); 
1723 if( rc==LSM_OK ){ 1724 rc = sortedBlobSet(pEnv, &pCsr->val, pPtr->pVal, pPtr->nVal); 1725 } 1726 pCsr->flags |= CURSOR_SEEK_EQ; 1727 } 1728 segmentPtrReset(pPtr, LSM_SEGMENTPTR_FREE_THRESHOLD); 1729 break; 1730 } 1731 case LSM_SEEK_LE: 1732 if( res>0 ) rc = segmentPtrAdvance(pCsr, pPtr, 1); 1733 break; 1734 case LSM_SEEK_GE: { 1735 /* Figure out if we need to 'skip' the pointer forward or not */ 1736 if( (res<=0 && (pPtr->eType & LSM_START_DELETE)) 1737 || (res>0 && (pPtr->eType & LSM_END_DELETE)) 1738 ){ 1739 rc = segmentPtrFwdPointer(pCsr, pPtr, &iPtrOut); 1740 } 1741 if( res<0 && rc==LSM_OK ){ 1742 rc = segmentPtrAdvance(pCsr, pPtr, 0); 1743 } 1744 break; 1745 } 1746 } 1747 } 1748 } 1749 1750 /* If the cursor seek has found a separator key, and this cursor is 1751 ** supposed to ignore separators keys, advance to the next entry. */ 1752 if( rc==LSM_OK && pPtr->pPg 1753 && segmentPtrIgnoreSeparators(pCsr, pPtr) 1754 && rtIsSeparator(pPtr->eType) 1755 ){ 1756 assert( eSeek!=LSM_SEEK_EQ ); 1757 rc = segmentPtrAdvance(pCsr, pPtr, eSeek==LSM_SEEK_LE); 1758 } 1759 } 1760 1761 assert( rc!=LSM_OK || assertSeekResult(pCsr,pPtr,iTopic,pKey,nKey,eSeek) ); 1762 *piPtr = (int)iPtrOut; 1763 return rc; 1764 } 1765 1766 static int seekInBtree( 1767 MultiCursor *pCsr, /* Multi-cursor object */ 1768 Segment *pSeg, /* Seek within this segment */ 1769 int iTopic, 1770 void *pKey, int nKey, /* Key to seek to */ 1771 Pgno *aPg, /* OUT: Page numbers */ 1772 Page **ppPg /* OUT: Leaf (sorted-run) page reference */ 1773 ){ 1774 int i = 0; 1775 int rc; 1776 int iPg; 1777 Page *pPg = 0; 1778 Blob blob = {0, 0, 0}; 1779 1780 iPg = (int)pSeg->iRoot; 1781 do { 1782 Pgno *piFirst = 0; 1783 if( aPg ){ 1784 aPg[i++] = iPg; 1785 piFirst = &aPg[i]; 1786 } 1787 1788 rc = lsmFsDbPageGet(pCsr->pDb->pFS, pSeg, iPg, &pPg); 1789 assert( rc==LSM_OK || pPg==0 ); 1790 if( rc==LSM_OK ){ 1791 u8 *aData; /* Buffer containing page data */ 1792 int nData; /* Size of aData[] in bytes */ 1793 int iMin; 1794 
int iMax; 1795 int nRec; 1796 int flags; 1797 1798 aData = fsPageData(pPg, &nData); 1799 flags = pageGetFlags(aData, nData); 1800 if( (flags & SEGMENT_BTREE_FLAG)==0 ) break; 1801 1802 iPg = (int)pageGetPtr(aData, nData); 1803 nRec = pageGetNRec(aData, nData); 1804 1805 iMin = 0; 1806 iMax = nRec-1; 1807 while( iMax>=iMin ){ 1808 int iTry = (iMin+iMax)/2; 1809 void *pKeyT; int nKeyT; /* Key for cell iTry */ 1810 int iTopicT; /* Topic for key pKeyT/nKeyT */ 1811 Pgno iPtr; /* Pointer associated with cell iTry */ 1812 int res; /* (pKey - pKeyT) */ 1813 1814 rc = pageGetBtreeKey( 1815 pSeg, pPg, iTry, &iPtr, &iTopicT, &pKeyT, &nKeyT, &blob 1816 ); 1817 if( rc!=LSM_OK ) break; 1818 if( piFirst && pKeyT==blob.pData ){ 1819 *piFirst = pageGetBtreeRef(pPg, iTry); 1820 piFirst = 0; 1821 i++; 1822 } 1823 1824 res = sortedKeyCompare( 1825 pCsr->pDb->xCmp, iTopic, pKey, nKey, iTopicT, pKeyT, nKeyT 1826 ); 1827 if( res<0 ){ 1828 iPg = (int)iPtr; 1829 iMax = iTry-1; 1830 }else{ 1831 iMin = iTry+1; 1832 } 1833 } 1834 lsmFsPageRelease(pPg); 1835 pPg = 0; 1836 } 1837 }while( rc==LSM_OK ); 1838 1839 sortedBlobFree(&blob); 1840 assert( (rc==LSM_OK)==(pPg!=0) ); 1841 if( ppPg ){ 1842 *ppPg = pPg; 1843 }else{ 1844 lsmFsPageRelease(pPg); 1845 } 1846 return rc; 1847 } 1848 1849 static int seekInSegment( 1850 MultiCursor *pCsr, 1851 SegmentPtr *pPtr, 1852 int iTopic, 1853 void *pKey, int nKey, 1854 int iPg, /* Page to search */ 1855 int eSeek, /* Search bias - see above */ 1856 int *piPtr, /* OUT: FC pointer */ 1857 int *pbStop /* OUT: Stop search flag */ 1858 ){ 1859 int iPtr = iPg; 1860 int rc = LSM_OK; 1861 1862 if( pPtr->pSeg->iRoot ){ 1863 Page *pPg; 1864 assert( pPtr->pSeg->iRoot!=0 ); 1865 rc = seekInBtree(pCsr, pPtr->pSeg, iTopic, pKey, nKey, 0, &pPg); 1866 if( rc==LSM_OK ) segmentPtrSetPage(pPtr, pPg); 1867 }else{ 1868 if( iPtr==0 ){ 1869 iPtr = (int)pPtr->pSeg->iFirst; 1870 } 1871 if( rc==LSM_OK ){ 1872 rc = segmentPtrLoadPage(pCsr->pDb->pFS, pPtr, iPtr); 1873 } 1874 } 1875 
1876 if( rc==LSM_OK ){ 1877 rc = segmentPtrSeek(pCsr, pPtr, iTopic, pKey, nKey, eSeek, piPtr, pbStop); 1878 } 1879 return rc; 1880 } 1881 1882 /* 1883 ** Seek each segment pointer in the array of (pLvl->nRight+1) at aPtr[]. 1884 ** 1885 ** pbStop: 1886 ** This parameter is only significant if parameter eSeek is set to 1887 ** LSM_SEEK_EQ. In this case, it is set to true before returning if 1888 ** the seek operation is finished. This can happen in two ways: 1889 ** 1890 ** a) A key matching (pKey/nKey) is found, or 1891 ** b) A point-delete or range-delete deleting the key is found. 1892 ** 1893 ** In case (a), the multi-cursor CURSOR_SEEK_EQ flag is set and the pCsr->key 1894 ** and pCsr->val blobs populated before returning. 1895 */ 1896 static int seekInLevel( 1897 MultiCursor *pCsr, /* Sorted cursor object to seek */ 1898 SegmentPtr *aPtr, /* Pointer to array of (nRhs+1) SPs */ 1899 int eSeek, /* Search bias - see above */ 1900 int iTopic, /* Key topic to search for */ 1901 void *pKey, int nKey, /* Key to search for */ 1902 Pgno *piPgno, /* IN/OUT: fraction cascade pointer (or 0) */ 1903 int *pbStop /* OUT: See above */ 1904 ){ 1905 Level *pLvl = aPtr[0].pLevel; /* Level to seek within */ 1906 int rc = LSM_OK; /* Return code */ 1907 int iOut = 0; /* Pointer to return to caller */ 1908 int res = -1; /* Result of xCmp(pKey, split) */ 1909 int nRhs = pLvl->nRight; /* Number of right-hand-side segments */ 1910 int bStop = 0; 1911 1912 /* If this is a composite level (one currently undergoing an incremental 1913 ** merge), figure out if the search key is larger or smaller than the 1914 ** levels split-key. */ 1915 if( nRhs ){ 1916 res = sortedKeyCompare(pCsr->pDb->xCmp, iTopic, pKey, nKey, 1917 pLvl->iSplitTopic, pLvl->pSplitKey, pLvl->nSplitKey 1918 ); 1919 } 1920 1921 /* If (res<0), then key pKey/nKey is smaller than the split-key (or this 1922 ** is not a composite level and there is no split-key). Search the 1923 ** left-hand-side of the level in this case. 
*/ 1924 if( res<0 ){ 1925 int iPtr = 0; 1926 if( nRhs==0 ) iPtr = (int)*piPgno; 1927 1928 rc = seekInSegment( 1929 pCsr, &aPtr[0], iTopic, pKey, nKey, iPtr, eSeek, &iOut, &bStop 1930 ); 1931 if( rc==LSM_OK && nRhs>0 && eSeek==LSM_SEEK_GE && aPtr[0].pPg==0 ){ 1932 res = 0; 1933 } 1934 } 1935 1936 if( res>=0 ){ 1937 int bHit = 0; /* True if at least one rhs is not EOF */ 1938 int iPtr = (int)*piPgno; 1939 int i; 1940 for(i=1; rc==LSM_OK && i<=nRhs && bStop==0; i++){ 1941 SegmentPtr *pPtr = &aPtr[i]; 1942 iOut = 0; 1943 rc = seekInSegment( 1944 pCsr, pPtr, iTopic, pKey, nKey, iPtr, eSeek, &iOut, &bStop 1945 ); 1946 iPtr = iOut; 1947 1948 /* If the segment-pointer has settled on a key that is smaller than 1949 ** the splitkey, invalidate the segment-pointer. */ 1950 if( pPtr->pPg ){ 1951 res = sortedKeyCompare(pCsr->pDb->xCmp, 1952 rtTopic(pPtr->eType), pPtr->pKey, pPtr->nKey, 1953 pLvl->iSplitTopic, pLvl->pSplitKey, pLvl->nSplitKey 1954 ); 1955 if( res<0 ){ 1956 if( pPtr->eType & LSM_START_DELETE ){ 1957 pPtr->eType &= ~LSM_INSERT; 1958 pPtr->pKey = pLvl->pSplitKey; 1959 pPtr->nKey = pLvl->nSplitKey; 1960 pPtr->pVal = 0; 1961 pPtr->nVal = 0; 1962 }else{ 1963 segmentPtrReset(pPtr, LSM_SEGMENTPTR_FREE_THRESHOLD); 1964 } 1965 } 1966 } 1967 1968 if( aPtr[i].pKey ) bHit = 1; 1969 } 1970 1971 if( rc==LSM_OK && eSeek==LSM_SEEK_LE && bHit==0 ){ 1972 rc = segmentPtrEnd(pCsr, &aPtr[0], 1); 1973 } 1974 } 1975 1976 assert( eSeek==LSM_SEEK_EQ || bStop==0 ); 1977 *piPgno = iOut; 1978 *pbStop = bStop; 1979 return rc; 1980 } 1981 1982 static void multiCursorGetKey( 1983 MultiCursor *pCsr, 1984 int iKey, 1985 int *peType, /* OUT: Key type (SORTED_WRITE etc.) 
*/ 1986 void **ppKey, /* OUT: Pointer to buffer containing key */ 1987 int *pnKey /* OUT: Size of *ppKey in bytes */ 1988 ){ 1989 int nKey = 0; 1990 void *pKey = 0; 1991 int eType = 0; 1992 1993 switch( iKey ){ 1994 case CURSOR_DATA_TREE0: 1995 case CURSOR_DATA_TREE1: { 1996 TreeCursor *pTreeCsr = pCsr->apTreeCsr[iKey-CURSOR_DATA_TREE0]; 1997 if( lsmTreeCursorValid(pTreeCsr) ){ 1998 lsmTreeCursorKey(pTreeCsr, &eType, &pKey, &nKey); 1999 } 2000 break; 2001 } 2002 2003 case CURSOR_DATA_SYSTEM: { 2004 Snapshot *pWorker = pCsr->pDb->pWorker; 2005 if( pWorker && (pCsr->flags & CURSOR_FLUSH_FREELIST) ){ 2006 int nEntry = pWorker->freelist.nEntry; 2007 if( pCsr->iFree < (nEntry*2) ){ 2008 FreelistEntry *aEntry = pWorker->freelist.aEntry; 2009 int i = nEntry - 1 - (pCsr->iFree / 2); 2010 u32 iKey2 = 0; 2011 2012 if( (pCsr->iFree % 2) ){ 2013 eType = LSM_END_DELETE|LSM_SYSTEMKEY; 2014 iKey2 = aEntry[i].iBlk-1; 2015 }else if( aEntry[i].iId>=0 ){ 2016 eType = LSM_INSERT|LSM_SYSTEMKEY; 2017 iKey2 = aEntry[i].iBlk; 2018 2019 /* If the in-memory entry immediately before this one was a 2020 ** DELETE, and the block number is one greater than the current 2021 ** block number, mark this entry as an "end-delete-range". */ 2022 if( i<(nEntry-1) && aEntry[i+1].iBlk==iKey2+1 && aEntry[i+1].iId<0 ){ 2023 eType |= LSM_END_DELETE; 2024 } 2025 2026 }else{ 2027 eType = LSM_START_DELETE|LSM_SYSTEMKEY; 2028 iKey2 = aEntry[i].iBlk + 1; 2029 } 2030 2031 /* If the in-memory entry immediately after this one is a 2032 ** DELETE, and the block number is one less than the current 2033 ** key, mark this entry as an "start-delete-range". 
*/ 2034 if( i>0 && aEntry[i-1].iBlk==iKey2-1 && aEntry[i-1].iId<0 ){ 2035 eType |= LSM_START_DELETE; 2036 } 2037 2038 pKey = pCsr->pSystemVal; 2039 nKey = 4; 2040 lsmPutU32(pKey, ~iKey2); 2041 } 2042 } 2043 break; 2044 } 2045 2046 default: { 2047 int iPtr = iKey - CURSOR_DATA_SEGMENT; 2048 assert( iPtr>=0 ); 2049 if( iPtr==pCsr->nPtr ){ 2050 if( pCsr->pBtCsr ){ 2051 pKey = pCsr->pBtCsr->pKey; 2052 nKey = pCsr->pBtCsr->nKey; 2053 eType = pCsr->pBtCsr->eType; 2054 } 2055 }else if( iPtr<pCsr->nPtr ){ 2056 SegmentPtr *pPtr = &pCsr->aPtr[iPtr]; 2057 if( pPtr->pPg ){ 2058 pKey = pPtr->pKey; 2059 nKey = pPtr->nKey; 2060 eType = pPtr->eType; 2061 } 2062 } 2063 break; 2064 } 2065 } 2066 2067 if( peType ) *peType = eType; 2068 if( pnKey ) *pnKey = nKey; 2069 if( ppKey ) *ppKey = pKey; 2070 } 2071 2072 static int sortedDbKeyCompare( 2073 MultiCursor *pCsr, 2074 int iLhsFlags, void *pLhsKey, int nLhsKey, 2075 int iRhsFlags, void *pRhsKey, int nRhsKey 2076 ){ 2077 int (*xCmp)(void *, int, void *, int) = pCsr->pDb->xCmp; 2078 int res; 2079 2080 /* Compare the keys, including the system flag. */ 2081 res = sortedKeyCompare(xCmp, 2082 rtTopic(iLhsFlags), pLhsKey, nLhsKey, 2083 rtTopic(iRhsFlags), pRhsKey, nRhsKey 2084 ); 2085 2086 /* If a key has the LSM_START_DELETE flag set, but not the LSM_INSERT or 2087 ** LSM_POINT_DELETE flags, it is considered a delta larger. This prevents 2088 ** the beginning of an open-ended set from masking a database entry or 2089 ** delete at a lower level. 
*/ 2090 if( res==0 && (pCsr->flags & CURSOR_IGNORE_DELETE) ){ 2091 const int m = LSM_POINT_DELETE|LSM_INSERT|LSM_END_DELETE |LSM_START_DELETE; 2092 int iDel1 = 0; 2093 int iDel2 = 0; 2094 2095 if( LSM_START_DELETE==(iLhsFlags & m) ) iDel1 = +1; 2096 if( LSM_END_DELETE ==(iLhsFlags & m) ) iDel1 = -1; 2097 if( LSM_START_DELETE==(iRhsFlags & m) ) iDel2 = +1; 2098 if( LSM_END_DELETE ==(iRhsFlags & m) ) iDel2 = -1; 2099 2100 res = (iDel1 - iDel2); 2101 } 2102 2103 return res; 2104 } 2105 2106 static void multiCursorDoCompare(MultiCursor *pCsr, int iOut, int bReverse){ 2107 int i1; 2108 int i2; 2109 int iRes; 2110 void *pKey1; int nKey1; int eType1; 2111 void *pKey2; int nKey2; int eType2; 2112 const int mul = (bReverse ? -1 : 1); 2113 2114 assert( pCsr->aTree && iOut<pCsr->nTree ); 2115 if( iOut>=(pCsr->nTree/2) ){ 2116 i1 = (iOut - pCsr->nTree/2) * 2; 2117 i2 = i1 + 1; 2118 }else{ 2119 i1 = pCsr->aTree[iOut*2]; 2120 i2 = pCsr->aTree[iOut*2+1]; 2121 } 2122 2123 multiCursorGetKey(pCsr, i1, &eType1, &pKey1, &nKey1); 2124 multiCursorGetKey(pCsr, i2, &eType2, &pKey2, &nKey2); 2125 2126 if( pKey1==0 ){ 2127 iRes = i2; 2128 }else if( pKey2==0 ){ 2129 iRes = i1; 2130 }else{ 2131 int res; 2132 2133 /* Compare the keys */ 2134 res = sortedDbKeyCompare(pCsr, 2135 eType1, pKey1, nKey1, eType2, pKey2, nKey2 2136 ); 2137 2138 res = res * mul; 2139 if( res==0 ){ 2140 /* The two keys are identical. Normally, this means that the key from 2141 ** the newer run clobbers the old. However, if the newer key is a 2142 ** separator key, or a range-delete-boundary only, do not allow it 2143 ** to clobber an older entry. */ 2144 int nc1 = (eType1 & (LSM_INSERT|LSM_POINT_DELETE))==0; 2145 int nc2 = (eType2 & (LSM_INSERT|LSM_POINT_DELETE))==0; 2146 iRes = (nc1 > nc2) ? 
i2 : i1;
    }else if( res<0 ){
      iRes = i1;
    }else{
      iRes = i2;
    }
  }

  pCsr->aTree[iOut] = iRes;
}

/*
** This function advances segment pointer iPtr belonging to multi-cursor
** pCsr forward (bReverse==0) or backward (bReverse!=0).
**
** If the segment pointer points to a segment that is part of a composite
** level, then the following special cases are handled once the pointer
** has reached EOF (pPtr->pPg==0):
**
**   * If iPtr is the lhs of a composite level, and the cursor is being
**     advanced forwards, move all pointers that correspond to rhs
**     segments of the same level to the first key in their respective
**     data (sortedRhsFirst()).
**
**   * If iPtr is a rhs segment of a composite level, the cursor is being
**     advanced backwards, and every rhs pointer of the level is at EOF,
**     position the lhs pointer using segmentPtrEnd(..., 1).
**
** Whenever one of these fix-ups is applied the entire aTree[] comparison
** tree is recomputed, since several sub-cursors may have moved.
**
** Returns LSM_OK, or the error code returned by a helper.
*/
static int segmentCursorAdvance(
  MultiCursor *pCsr,
  int iPtr,
  int bReverse
){
  int rc;
  SegmentPtr *pPtr = &pCsr->aPtr[iPtr];
  Level *pLvl = pPtr->pLevel;
  int bComposite;                 /* True if pPtr is part of composite level */

  /* Advance the segment-pointer object. */
  rc = segmentPtrAdvance(pCsr, pPtr, bReverse);
  if( rc!=LSM_OK ) return rc;

  /* A cursor populated by multiCursorAddRhs() has nPtr==nRight and holds
  ** no lhs pointer, so it is not treated as composite here. */
  bComposite = (pLvl->nRight>0 && pCsr->nPtr>pLvl->nRight);
  if( bComposite && pPtr->pPg==0 ){
    int bFix = 0;

    /* True for (forwards && lhs) or (backwards && rhs). */
    if( (bReverse==0)==(pPtr->pSeg==&pLvl->lhs) ){
      int i;
      if( bReverse ){
        /* (pPtr->pSeg - pLvl->aRhs) is the index of this segment within
        ** aRhs[], so pLhs is the lhs pointer of the same level and
        ** pLhs[1..nRight] are its rhs pointers. */
        SegmentPtr *pLhs = &pCsr->aPtr[iPtr - 1 - (pPtr->pSeg - pLvl->aRhs)];
        for(i=0; i<pLvl->nRight; i++){
          if( pLhs[i+1].pPg ) break;
        }
        if( i==pLvl->nRight ){
          /* No rhs pointer still has a page loaded. */
          bFix = 1;
          rc = segmentPtrEnd(pCsr, pLhs, 1);
        }
      }else{
        bFix = 1;
        for(i=0; rc==LSM_OK && i<pLvl->nRight; i++){
          rc = sortedRhsFirst(pCsr, pLvl, &pCsr->aPtr[iPtr+1+i]);
        }
      }
    }

    if( bFix ){
      int i;
      for(i=pCsr->nTree-1; i>0; i--){
        multiCursorDoCompare(pCsr, i, bReverse);
      }
    }
  }

  /* Compiled out: forward-only form of the fix-up performed above. */
#if 0
  if( bComposite && pPtr->pSeg==&pLvl->lhs       /* lhs of composite level */
   && bReverse==0                                /* csr advanced forwards */
   && pPtr->pPg==0                               /* segment at EOF */
  ){
    int i;
    for(i=0; rc==LSM_OK && i<pLvl->nRight; i++){
      rc = sortedRhsFirst(pCsr, pLvl, &pCsr->aPtr[iPtr+1+i]);
    }
    for(i=pCsr->nTree-1; i>0; i--){
      multiCursorDoCompare(pCsr, i, 0);
    }
  }
#endif

  return rc;
}

/*
** Release every sub-cursor and allocation owned by pCsr: both tree
** cursors, all segment pointers, the b-tree cursor and the aPtr[],
** aTree[] and pSystemVal buffers. The corresponding fields are zeroed.
**
** The MultiCursor structure itself and its cached key/val blobs are not
** touched by this function.
*/
static void mcursorFreeComponents(MultiCursor *pCsr){
  int i;
  lsm_env *pEnv = pCsr->pDb->pEnv;

  /* Close the tree cursor, if any. */
  lsmTreeCursorDestroy(pCsr->apTreeCsr[0]);
  lsmTreeCursorDestroy(pCsr->apTreeCsr[1]);

  /* Reset the segment pointers */
  for(i=0; i<pCsr->nPtr; i++){
    segmentPtrReset(&pCsr->aPtr[i], 0);
  }

  /* And the b-tree cursor, if any */
  btreeCursorFree(pCsr->pBtCsr);

  /* Free allocations */
  lsmFree(pEnv, pCsr->aPtr);
  lsmFree(pEnv, pCsr->aTree);
  lsmFree(pEnv, pCsr->pSystemVal);

  /* Zero fields */
  pCsr->nPtr = 0;
  pCsr->aPtr = 0;
  pCsr->nTree = 0;
  pCsr->aTree = 0;
  pCsr->pSystemVal = 0;
  pCsr->apTreeCsr[0] = 0;
  pCsr->apTreeCsr[1] = 0;
  pCsr->pBtCsr = 0;
}

/*
** Close (without caching) every cursor on the pDb->pCsrCache list and
** leave the list empty.
*/
void lsmMCursorFreeCache(lsm_db *pDb){
  MultiCursor *p;
  MultiCursor *pNext;
  for(p=pDb->pCsrCache; p; p=pNext){
    /* Save the link first: lsmMCursorClose(p, 0) frees p. */
    pNext = p->pNext;
    lsmMCursorClose(p, 0);
  }
  pDb->pCsrCache = 0;
}

/*
** Close the cursor passed as the first argument. Passing a NULL pointer
** is a harmless no-op.
**
** If the bCache parameter is true, then shift the cursor to the pCsrCache
** list for possible reuse instead of actually deleting it.
*/
void lsmMCursorClose(MultiCursor *pCsr, int bCache){
  if( pCsr ){
    lsm_db *pDb = pCsr->pDb;
    MultiCursor **pp;             /* Iterator variable */

    /* The cursor may or may not be currently part of the linked list
    ** starting at lsm_db.pCsr. If it is, extract it.  */
    for(pp=&pDb->pCsr; *pp; pp=&((*pp)->pNext)){
      if( *pp==pCsr ){
        *pp = pCsr->pNext;
        break;
      }
    }

    if( bCache ){
      int i;                      /* Used to iterate through segment-pointers */

      /* Release any page references held by this cursor. */
      assert( !pCsr->pBtCsr );
      for(i=0; i<pCsr->nPtr; i++){
        SegmentPtr *pPtr = &pCsr->aPtr[i];
        lsmFsPageRelease(pPtr->pPg);
        pPtr->pPg = 0;
      }

      /* Reset the tree cursors */
      lsmTreeCursorReset(pCsr->apTreeCsr[0]);
      lsmTreeCursorReset(pCsr->apTreeCsr[1]);

      /* Add the cursor to the pCsrCache list */
      pCsr->pNext = pDb->pCsrCache;
      pDb->pCsrCache = pCsr;
    }else{
      /* Free the allocation used to cache the current key, if any. */
      sortedBlobFree(&pCsr->key);
      sortedBlobFree(&pCsr->val);

      /* Free the component cursors */
      mcursorFreeComponents(pCsr);

      /* Free the cursor structure itself */
      lsmFree(pDb->pEnv, pCsr);
    }
  }
}

/*
** Values for the eTree parameter of multiCursorAddTree().
*/
#define TREE_NONE 0
#define TREE_OLD  1
#define TREE_BOTH 2

/*
** Open in-memory tree cursors for pCsr.
**
** Parameter eTree is one of TREE_NONE, TREE_OLD or TREE_BOTH:
**
**   * Unless eTree is TREE_NONE, a cursor on the 'old' tree is opened in
**     apTreeCsr[1], provided lsmTreeHasOld() is true and the old tree's
**     log offset (treehdr.iOldLog) differs from pSnap->iLogOff.
**
**   * If eTree is TREE_BOTH, a cursor on the 'current' tree is also
**     opened in apTreeCsr[0].
**
** Returns LSM_OK or the error code from lsmTreeCursorNew().
*/
static int multiCursorAddTree(MultiCursor *pCsr, Snapshot *pSnap, int eTree){
  int rc = LSM_OK;
  lsm_db *db = pCsr->pDb;

  /* Add a tree cursor on the 'old' tree, if it exists. */
  if( eTree!=TREE_NONE
   && lsmTreeHasOld(db)
   && db->treehdr.iOldLog!=pSnap->iLogOff
  ){
    rc = lsmTreeCursorNew(db, 1, &pCsr->apTreeCsr[1]);
  }

  /* Add a tree cursor on the 'current' tree, if required. */
  if( rc==LSM_OK && eTree==TREE_BOTH ){
    rc = lsmTreeCursorNew(db, 0, &pCsr->apTreeCsr[0]);
  }

  return rc;
}

/*
** Allocate pCsr->aPtr[] with one segment pointer for each rhs segment of
** level pLvl. No pointer is created for the lhs segment. The level must
** have at least one rhs segment and pCsr->aPtr must not yet be allocated.
**
** Returns LSM_OK, or LSM_NOMEM_BKPT if the allocation fails.
*/
static int multiCursorAddRhs(MultiCursor *pCsr, Level *pLvl){
  int i;
  int nRhs = pLvl->nRight;

  assert( pLvl->nRight>0 );
  assert( pCsr->aPtr==0 );
  pCsr->aPtr = lsmMallocZero(pCsr->pDb->pEnv, sizeof(SegmentPtr) * nRhs);
  if( !pCsr->aPtr ) return LSM_NOMEM_BKPT;
  pCsr->nPtr = nRhs;

  for(i=0; i<nRhs; i++){
    pCsr->aPtr[i].pSeg = &pLvl->aRhs[i];
    pCsr->aPtr[i].pLevel = pLvl;
  }

  return LSM_OK;
}

/*
** Append segment pointers for level pLvl to pCsr->aPtr[]: first the lhs
** segment, then each rhs segment in order. No allocation is performed
** here, so the caller must already have sized aPtr[] to hold
** (1 + pLvl->nRight) further entries.
**
** If the level has rhs segments but no split-key loaded, sortedSplitkey()
** is invoked to obtain it.
**
** This is a no-op if *pRc is not LSM_OK on entry.
*/
static void multiCursorAddOne(MultiCursor *pCsr, Level *pLvl, int *pRc){
  if( *pRc==LSM_OK ){
    int iPtr = pCsr->nPtr;
    int i;
    pCsr->aPtr[iPtr].pLevel = pLvl;
    pCsr->aPtr[iPtr].pSeg = &pLvl->lhs;
    iPtr++;
    for(i=0; i<pLvl->nRight; i++){
      pCsr->aPtr[iPtr].pLevel = pLvl;
      pCsr->aPtr[iPtr].pSeg = &pLvl->aRhs[i];
      iPtr++;
    }

    if( pLvl->nRight && pLvl->pSplitKey==0 ){
      sortedSplitkey(pCsr->pDb, pLvl, pRc);
    }
    pCsr->nPtr = iPtr;
  }
}

/*
** Allocate pCsr->aPtr[] and populate it with segment pointers for every
** level of snapshot pSnap that does not have the LEVEL_INCOMPLETE flag
** set. pCsr->aPtr must not yet be allocated.
**
** Returns LSM_OK or an error code.
*/
static int multiCursorAddAll(MultiCursor *pCsr, Snapshot *pSnap){
  Level *pLvl;
  int nPtr = 0;
  int rc = LSM_OK;

  /* First pass: count the segment pointers required. */
  for(pLvl=pSnap->pLevel; pLvl; pLvl=pLvl->pNext){
    /* If the LEVEL_INCOMPLETE flag is set, then this function is being
    ** called (indirectly) from within a sortedNewToplevel() call to
    ** construct pLvl. In this case ignore pLvl - this cursor is going to
    ** be used to retrieve a freelist entry from the LSM, and the partially
    ** complete level may confuse it.  */
    if( pLvl->flags & LEVEL_INCOMPLETE ) continue;
    nPtr += (1 + pLvl->nRight);
  }

  assert( pCsr->aPtr==0 );
  pCsr->aPtr = lsmMallocZeroRc(pCsr->pDb->pEnv, sizeof(SegmentPtr) * nPtr, &rc);

  /* Second pass: fill in the pointers. multiCursorAddOne() does nothing
  ** if rc is already an error code. */
  for(pLvl=pSnap->pLevel; pLvl; pLvl=pLvl->pNext){
    if( (pLvl->flags & LEVEL_INCOMPLETE)==0 ){
      multiCursorAddOne(pCsr, pLvl, &rc);
    }
  }

  return rc;
}

/*
** Populate pCsr with segment pointers for all levels of pSnap and, if
** that succeeds, with cursors on both in-memory trees (TREE_BOTH). The
** CURSOR_IGNORE_SYSTEM and CURSOR_IGNORE_DELETE flags are set whether or
** not an error occurs.
*/
static int multiCursorInit(MultiCursor *pCsr, Snapshot *pSnap){
  int rc;
  rc = multiCursorAddAll(pCsr, pSnap);
  if( rc==LSM_OK ){
    rc = multiCursorAddTree(pCsr, pSnap, TREE_BOTH);
  }
  pCsr->flags |= (CURSOR_IGNORE_SYSTEM | CURSOR_IGNORE_DELETE);
  return rc;
}

/*
** Allocate a zeroed MultiCursor and link it at the head of the db->pCsr
** list. Returns the new cursor, or NULL if the allocation fails (in which
** case the error is presumably reported through *pRc by
** lsmMallocZeroRc() - confirm against its definition).
*/
static MultiCursor *multiCursorNew(lsm_db *db, int *pRc){
  MultiCursor *pCsr;
  pCsr = (MultiCursor *)lsmMallocZeroRc(db->pEnv, sizeof(MultiCursor), pRc);
  if( pCsr ){
    pCsr->pNext = db->pCsr;
    db->pCsr = pCsr;
    pCsr->pDb = db;
  }
  return pCsr;
}


/*
** For every cursor on the pDb->pCsr list, reload the current key of the
** b-tree cursor (if any) and the current cell of each segment pointer.
**
** NOTE(review): presumably invoked after the database file mapping has
** changed so that cached pointers are refreshed - confirm with callers.
*/
void lsmSortedRemap(lsm_db *pDb){
  MultiCursor *pCsr;
  for(pCsr=pDb->pCsr; pCsr; pCsr=pCsr->pNext){
    int iPtr;
    if( pCsr->pBtCsr ){
      btreeCursorLoadKey(pCsr->pBtCsr);
    }
    for(iPtr=0; iPtr<pCsr->nPtr; iPtr++){
      segmentPtrLoadCell(&pCsr->aPtr[iPtr], pCsr->aPtr[iPtr].iCell);
    }
  }
}

/*
** Set the CURSOR_READ_SEPARATORS flag on pCsr, provided the cursor has at
** least one segment pointer.
*/
static void multiCursorReadSeparators(MultiCursor *pCsr){
  if( pCsr->nPtr>0 ){
    pCsr->flags |= CURSOR_READ_SEPARATORS;
  }
}

/*
** Have this cursor skip over SORTED_DELETE entries. A NULL pCsr is
** ignored.
*/
static void multiCursorIgnoreDelete(MultiCursor *pCsr){
  if( pCsr ) pCsr->flags |= CURSOR_IGNORE_DELETE;
}

/*
** If the free-block list is not empty, then have this cursor visit a key
** with (a) the system bit set, and (b) the key "FREELIST" and (c) a value
** blob containing the serialized free-block list.
**
** This sets CURSOR_FLUSH_FREELIST and allocates the (4 + 8) byte
** pSystemVal buffer. multiCursorGetVal() writes each 8-byte value at
** offset 4 of that buffer. Returns LSM_OK or an allocation error.
*/
static int multiCursorVisitFreelist(MultiCursor *pCsr){
  int rc = LSM_OK;
  pCsr->flags |= CURSOR_FLUSH_FREELIST;
  pCsr->pSystemVal = lsmMallocRc(pCsr->pDb->pEnv, 4 + 8, &rc);
  return rc;
}

/*
** Allocate and return a new database cursor.
**
** This method should only be called to allocate user cursors, as it may
** recycle a cursor from lsm_db.pCsrCache.
**
** On success LSM_OK is returned and *ppCsr points to the cursor. On
** failure an error code is returned and *ppCsr is set to NULL.
*/
int lsmMCursorNew(
  lsm_db *pDb,                    /* Database handle */
  MultiCursor **ppCsr             /* OUT: Allocated cursor */
){
  MultiCursor *pCsr = 0;
  int rc = LSM_OK;

  if( pDb->pCsrCache ){
    int bOld;                     /* True if there is an old in-memory tree */

    /* Remove a cursor from the pCsrCache list and add it to the open list. */
    pCsr = pDb->pCsrCache;
    pDb->pCsrCache = pCsr->pNext;
    pCsr->pNext = pDb->pCsr;
    pDb->pCsr = pCsr;

    /* The cursor can almost be used as is, except that the old in-memory
    ** tree cursor may be present and not required, or required and not
    ** present. Fix this if required.  */
    bOld = (lsmTreeHasOld(pDb) && pDb->treehdr.iOldLog!=pDb->pClient->iLogOff);
    if( !bOld && pCsr->apTreeCsr[1] ){
      lsmTreeCursorDestroy(pCsr->apTreeCsr[1]);
      pCsr->apTreeCsr[1] = 0;
    }else if( bOld && !pCsr->apTreeCsr[1] ){
      rc = lsmTreeCursorNew(pDb, 1, &pCsr->apTreeCsr[1]);
    }

    pCsr->flags = (CURSOR_IGNORE_SYSTEM | CURSOR_IGNORE_DELETE);

  }else{
    pCsr = multiCursorNew(pDb, &rc);
    if( rc==LSM_OK ) rc = multiCursorInit(pCsr, pDb->pClient);
  }

  /* lsmMCursorClose() accepts NULL, so this also covers the case where
  ** the allocation in multiCursorNew() itself failed. */
  if( rc!=LSM_OK ){
    lsmMCursorClose(pCsr, 0);
    pCsr = 0;
  }
  assert( (rc==LSM_OK)==(pCsr!=0) );
  *ppCsr = pCsr;
  return rc;
}

/*
** Retrieve the value associated with the current entry of sub-cursor
** iVal of multi-cursor pCsr. *ppVal and *pnVal are set to the value blob
** and its size, or to 0 if the sub-cursor has no value to offer:
**
**   * CURSOR_DATA_TREE0/TREE1: the value of the in-memory tree cursor, if
**     that cursor is valid.
**
**   * CURSOR_DATA_SYSTEM: if there is a worker snapshot, iFree is even
**     and within range, the iId field of the selected free-list entry
**     encoded as an 8-byte integer in the pSystemVal buffer.
**
**   * Otherwise: the value of segment pointer (iVal-CURSOR_DATA_SEGMENT),
**     if that index is less than nPtr and the pointer has a page loaded.
**
** As written this function always returns LSM_OK.
*/
static int multiCursorGetVal(
  MultiCursor *pCsr,
  int iVal,
  void **ppVal,
  int *pnVal
){
  int rc = LSM_OK;

  *ppVal = 0;
  *pnVal = 0;

  switch( iVal ){
    case CURSOR_DATA_TREE0:
    case CURSOR_DATA_TREE1: {
      TreeCursor *pTreeCsr = pCsr->apTreeCsr[iVal-CURSOR_DATA_TREE0];
      if( lsmTreeCursorValid(pTreeCsr) ){
        lsmTreeCursorValue(pTreeCsr, ppVal, pnVal);
      }else{
        *ppVal = 0;
        *pnVal = 0;
      }
      break;
    }

    case CURSOR_DATA_SYSTEM: {
      Snapshot *pWorker = pCsr->pDb->pWorker;
      if( pWorker
       && (pCsr->iFree % 2)==0
       && pCsr->iFree < (pWorker->freelist.nEntry*2)
      ){
        /* Even values of iFree select free-list entries starting from
        ** the end of the aEntry[] array. */
        int iEntry = pWorker->freelist.nEntry - 1 - (pCsr->iFree / 2);
        u8 *aVal = &((u8 *)(pCsr->pSystemVal))[4];
        lsmPutU64(aVal, pWorker->freelist.aEntry[iEntry].iId);
        *ppVal = aVal;
        *pnVal = 8;
      }
      break;
    }

    default: {
      int iPtr = iVal-CURSOR_DATA_SEGMENT;
      if( iPtr<pCsr->nPtr ){
        SegmentPtr *pPtr = &pCsr->aPtr[iPtr];
        if( pPtr->pPg ){
          *ppVal = pPtr->pVal;
          *pnVal = pPtr->nVal;
        }
      }
    }
  }

  assert( rc==LSM_OK || (*ppVal==0 && *pnVal==0) );
  return rc;
}

/* Defined further down; needed by the functions in between. */
static int multiCursorAdvance(MultiCursor *pCsr, int bReverse);

/* 2584 ** This function is called by worker connections to walk the part of the 2585 ** free-list stored within the LSM data structure. 2586 */ 2587 int lsmSortedWalkFreelist( 2588 lsm_db *pDb, /* Database handle */ 2589 int bReverse, /* True to iterate from largest to smallest */ 2590 int (*x)(void *, int, i64), /* Callback function */ 2591 void *pCtx /* First argument to pass to callback */ 2592 ){ 2593 MultiCursor *pCsr; /* Cursor used to read db */ 2594 int rc = LSM_OK; /* Return Code */ 2595 Snapshot *pSnap = 0; 2596 2597 assert( pDb->pWorker ); 2598 if( pDb->bIncrMerge ){ 2599 rc = lsmCheckpointDeserialize(pDb, 0, pDb->pShmhdr->aSnap1, &pSnap); 2600 if( rc!=LSM_OK ) return rc; 2601 }else{ 2602 pSnap = pDb->pWorker; 2603 } 2604 2605 pCsr = multiCursorNew(pDb, &rc); 2606 if( pCsr ){ 2607 rc = multiCursorAddAll(pCsr, pSnap); 2608 pCsr->flags |= CURSOR_IGNORE_DELETE; 2609 } 2610 2611 if( rc==LSM_OK ){ 2612 if( bReverse==0 ){ 2613 rc = lsmMCursorLast(pCsr); 2614 }else{ 2615 rc = lsmMCursorSeek(pCsr, 1, "", 0, LSM_SEEK_GE); 2616 } 2617 2618 while( rc==LSM_OK && lsmMCursorValid(pCsr) && rtIsSystem(pCsr->eType) ){ 2619 void *pKey; int nKey; 2620 void *pVal = 0; int nVal = 0; 2621 2622 rc = lsmMCursorKey(pCsr, &pKey, &nKey); 2623 if( rc==LSM_OK ) rc = lsmMCursorValue(pCsr, &pVal, &nVal); 2624 if( rc==LSM_OK && (nKey!=4 || nVal!=8) ) rc = LSM_CORRUPT_BKPT; 2625 2626 if( rc==LSM_OK ){ 2627 int iBlk; 2628 i64 iSnap; 2629 iBlk = (int)(~(lsmGetU32((u8 *)pKey))); 2630 iSnap = (i64)lsmGetU64((u8 *)pVal); 2631 if( x(pCtx, iBlk, iSnap) ) break; 2632 rc = multiCursorAdvance(pCsr, !bReverse); 2633 } 2634 } 2635 } 2636 2637 lsmMCursorClose(pCsr, 0); 2638 if( pSnap!=pDb->pWorker ){ 2639 lsmFreeSnapshot(pDb->pEnv, pSnap); 2640 } 2641 2642 return rc; 2643 } 2644 2645 int lsmSortedLoadFreelist( 2646 lsm_db *pDb, /* Database handle (must be worker) */ 2647 void **ppVal, /* OUT: Blob containing LSM free-list */ 2648 int *pnVal /* OUT: Size of *ppVal blob in bytes */ 2649 ){ 2650 
MultiCursor *pCsr; /* Cursor used to retreive free-list */ 2651 int rc = LSM_OK; /* Return Code */ 2652 2653 assert( pDb->pWorker ); 2654 assert( *ppVal==0 && *pnVal==0 ); 2655 2656 pCsr = multiCursorNew(pDb, &rc); 2657 if( pCsr ){ 2658 rc = multiCursorAddAll(pCsr, pDb->pWorker); 2659 pCsr->flags |= CURSOR_IGNORE_DELETE; 2660 } 2661 2662 if( rc==LSM_OK ){ 2663 rc = lsmMCursorLast(pCsr); 2664 if( rc==LSM_OK 2665 && rtIsWrite(pCsr->eType) && rtIsSystem(pCsr->eType) 2666 && pCsr->key.nData==8 2667 && 0==memcmp(pCsr->key.pData, "FREELIST", 8) 2668 ){ 2669 void *pVal; int nVal; /* Value read from database */ 2670 rc = lsmMCursorValue(pCsr, &pVal, &nVal); 2671 if( rc==LSM_OK ){ 2672 *ppVal = lsmMallocRc(pDb->pEnv, nVal, &rc); 2673 if( *ppVal ){ 2674 memcpy(*ppVal, pVal, nVal); 2675 *pnVal = nVal; 2676 } 2677 } 2678 } 2679 2680 lsmMCursorClose(pCsr, 0); 2681 } 2682 2683 return rc; 2684 } 2685 2686 static int multiCursorAllocTree(MultiCursor *pCsr){ 2687 int rc = LSM_OK; 2688 if( pCsr->aTree==0 ){ 2689 int nByte; /* Bytes of space to allocate */ 2690 int nMin; /* Total number of cursors being merged */ 2691 2692 nMin = CURSOR_DATA_SEGMENT + pCsr->nPtr + (pCsr->pBtCsr!=0); 2693 pCsr->nTree = 2; 2694 while( pCsr->nTree<nMin ){ 2695 pCsr->nTree = pCsr->nTree*2; 2696 } 2697 2698 nByte = sizeof(int)*pCsr->nTree*2; 2699 pCsr->aTree = (int *)lsmMallocZeroRc(pCsr->pDb->pEnv, nByte, &rc); 2700 } 2701 return rc; 2702 } 2703 2704 static void multiCursorCacheKey(MultiCursor *pCsr, int *pRc){ 2705 if( *pRc==LSM_OK ){ 2706 void *pKey; 2707 int nKey; 2708 multiCursorGetKey(pCsr, pCsr->aTree[1], &pCsr->eType, &pKey, &nKey); 2709 *pRc = sortedBlobSet(pCsr->pDb->pEnv, &pCsr->key, pKey, nKey); 2710 } 2711 } 2712 2713 #ifdef LSM_DEBUG_EXPENSIVE 2714 static void assertCursorTree(MultiCursor *pCsr){ 2715 int bRev = !!(pCsr->flags & CURSOR_PREV_OK); 2716 int *aSave = pCsr->aTree; 2717 int nSave = pCsr->nTree; 2718 int rc; 2719 2720 pCsr->aTree = 0; 2721 pCsr->nTree = 0; 2722 rc = 
multiCursorAllocTree(pCsr); 2723 if( rc==LSM_OK ){ 2724 int i; 2725 for(i=pCsr->nTree-1; i>0; i--){ 2726 multiCursorDoCompare(pCsr, i, bRev); 2727 } 2728 2729 assert( nSave==pCsr->nTree 2730 && 0==memcmp(aSave, pCsr->aTree, sizeof(int)*nSave) 2731 ); 2732 2733 lsmFree(pCsr->pDb->pEnv, pCsr->aTree); 2734 } 2735 2736 pCsr->aTree = aSave; 2737 pCsr->nTree = nSave; 2738 } 2739 #else 2740 # define assertCursorTree(x) 2741 #endif 2742 2743 static int mcursorLocationOk(MultiCursor *pCsr, int bDeleteOk){ 2744 int eType = pCsr->eType; 2745 int iKey; 2746 int i; 2747 int rdmask; 2748 2749 assert( pCsr->flags & (CURSOR_NEXT_OK|CURSOR_PREV_OK) ); 2750 assertCursorTree(pCsr); 2751 2752 rdmask = (pCsr->flags & CURSOR_NEXT_OK) ? LSM_END_DELETE : LSM_START_DELETE; 2753 2754 /* If the cursor does not currently point to an actual database key (i.e. 2755 ** it points to a delete key, or the start or end of a range-delete), and 2756 ** the CURSOR_IGNORE_DELETE flag is set, skip past this entry. */ 2757 if( (pCsr->flags & CURSOR_IGNORE_DELETE) && bDeleteOk==0 ){ 2758 if( (eType & LSM_INSERT)==0 ) return 0; 2759 } 2760 2761 /* If the cursor points to a system key (free-list entry), and the 2762 ** CURSOR_IGNORE_SYSTEM flag is set, skip thie entry. */ 2763 if( (pCsr->flags & CURSOR_IGNORE_SYSTEM) && rtTopic(eType)!=0 ){ 2764 return 0; 2765 } 2766 2767 #ifndef NDEBUG 2768 /* This block fires assert() statements to check one of the assumptions 2769 ** in the comment below - that if the lhs sub-cursor of a level undergoing 2770 ** a merge is valid, then all the rhs sub-cursors must be at EOF. 2771 ** 2772 ** Also assert that all rhs sub-cursors are either at EOF or point to 2773 ** a key that is not less than the level split-key. 
*/ 2774 for(i=0; i<pCsr->nPtr; i++){ 2775 SegmentPtr *pPtr = &pCsr->aPtr[i]; 2776 Level *pLvl = pPtr->pLevel; 2777 if( pLvl->nRight && pPtr->pPg ){ 2778 if( pPtr->pSeg==&pLvl->lhs ){ 2779 int j; 2780 for(j=0; j<pLvl->nRight; j++) assert( pPtr[j+1].pPg==0 ); 2781 }else{ 2782 int res = sortedKeyCompare(pCsr->pDb->xCmp, 2783 rtTopic(pPtr->eType), pPtr->pKey, pPtr->nKey, 2784 pLvl->iSplitTopic, pLvl->pSplitKey, pLvl->nSplitKey 2785 ); 2786 assert( res>=0 ); 2787 } 2788 } 2789 } 2790 #endif 2791 2792 /* Now check if this key has already been deleted by a range-delete. If 2793 ** so, skip past it. 2794 ** 2795 ** Assume, for the moment, that the tree contains no levels currently 2796 ** undergoing incremental merge, and that this cursor is iterating forwards 2797 ** through the database keys. The cursor currently points to a key in 2798 ** level L. This key has already been deleted if any of the sub-cursors 2799 ** that point to levels newer than L (or to the in-memory tree) point to 2800 ** a key greater than the current key with the LSM_END_DELETE flag set. 2801 ** 2802 ** Or, if the cursor is iterating backwards through data keys, if any 2803 ** such sub-cursor points to a key smaller than the current key with the 2804 ** LSM_START_DELETE flag set. 2805 ** 2806 ** Why it works with levels undergoing a merge too: 2807 ** 2808 ** When a cursor iterates forwards, the sub-cursors for the rhs of a 2809 ** level are only activated once the lhs reaches EOF. So when iterating 2810 ** forwards, the keys visited are the same as if the level was completely 2811 ** merged. 2812 ** 2813 ** If the cursor is iterating backwards, then the lhs sub-cursor is not 2814 ** initialized until the last of the rhs sub-cursors has reached EOF. 
2815 ** Additionally, if the START_DELETE flag is set on the last entry (in 2816 ** reverse order - so the entry with the smallest key) of a rhs sub-cursor, 2817 ** then a pseudo-key equal to the levels split-key with the END_DELETE 2818 ** flag set is visited by the sub-cursor. 2819 */ 2820 iKey = pCsr->aTree[1]; 2821 for(i=0; i<iKey; i++){ 2822 int csrflags; 2823 multiCursorGetKey(pCsr, i, &csrflags, 0, 0); 2824 if( (rdmask & csrflags) ){ 2825 const int SD_ED = (LSM_START_DELETE|LSM_END_DELETE); 2826 if( (csrflags & SD_ED)==SD_ED 2827 || (pCsr->flags & CURSOR_IGNORE_DELETE)==0 2828 ){ 2829 void *pKey; int nKey; 2830 multiCursorGetKey(pCsr, i, 0, &pKey, &nKey); 2831 if( 0==sortedKeyCompare(pCsr->pDb->xCmp, 2832 rtTopic(eType), pCsr->key.pData, pCsr->key.nData, 2833 rtTopic(csrflags), pKey, nKey 2834 )){ 2835 continue; 2836 } 2837 } 2838 return 0; 2839 } 2840 } 2841 2842 /* The current cursor position is one this cursor should visit. Return 1. */ 2843 return 1; 2844 } 2845 2846 static int multiCursorSetupTree(MultiCursor *pCsr, int bRev){ 2847 int rc; 2848 2849 rc = multiCursorAllocTree(pCsr); 2850 if( rc==LSM_OK ){ 2851 int i; 2852 for(i=pCsr->nTree-1; i>0; i--){ 2853 multiCursorDoCompare(pCsr, i, bRev); 2854 } 2855 } 2856 2857 assertCursorTree(pCsr); 2858 multiCursorCacheKey(pCsr, &rc); 2859 2860 if( rc==LSM_OK && mcursorLocationOk(pCsr, 0)==0 ){ 2861 rc = multiCursorAdvance(pCsr, bRev); 2862 } 2863 return rc; 2864 } 2865 2866 2867 static int multiCursorEnd(MultiCursor *pCsr, int bLast){ 2868 int rc = LSM_OK; 2869 int i; 2870 2871 pCsr->flags &= ~(CURSOR_NEXT_OK | CURSOR_PREV_OK); 2872 pCsr->flags |= (bLast ? 
CURSOR_PREV_OK : CURSOR_NEXT_OK); 2873 pCsr->iFree = 0; 2874 2875 /* Position the two in-memory tree cursors */ 2876 for(i=0; rc==LSM_OK && i<2; i++){ 2877 if( pCsr->apTreeCsr[i] ){ 2878 rc = lsmTreeCursorEnd(pCsr->apTreeCsr[i], bLast); 2879 } 2880 } 2881 2882 for(i=0; rc==LSM_OK && i<pCsr->nPtr; i++){ 2883 SegmentPtr *pPtr = &pCsr->aPtr[i]; 2884 Level *pLvl = pPtr->pLevel; 2885 int iRhs; 2886 int bHit = 0; 2887 2888 if( bLast ){ 2889 for(iRhs=0; iRhs<pLvl->nRight && rc==LSM_OK; iRhs++){ 2890 rc = segmentPtrEnd(pCsr, &pPtr[iRhs+1], 1); 2891 if( pPtr[iRhs+1].pPg ) bHit = 1; 2892 } 2893 if( bHit==0 && rc==LSM_OK ){ 2894 rc = segmentPtrEnd(pCsr, pPtr, 1); 2895 }else{ 2896 segmentPtrReset(pPtr, LSM_SEGMENTPTR_FREE_THRESHOLD); 2897 } 2898 }else{ 2899 int bLhs = (pPtr->pSeg==&pLvl->lhs); 2900 assert( pPtr->pSeg==&pLvl->lhs || pPtr->pSeg==&pLvl->aRhs[0] ); 2901 2902 if( bLhs ){ 2903 rc = segmentPtrEnd(pCsr, pPtr, 0); 2904 if( pPtr->pKey ) bHit = 1; 2905 } 2906 for(iRhs=0; iRhs<pLvl->nRight && rc==LSM_OK; iRhs++){ 2907 if( bHit ){ 2908 segmentPtrReset(&pPtr[iRhs+1], LSM_SEGMENTPTR_FREE_THRESHOLD); 2909 }else{ 2910 rc = sortedRhsFirst(pCsr, pLvl, &pPtr[iRhs+bLhs]); 2911 } 2912 } 2913 } 2914 i += pLvl->nRight; 2915 } 2916 2917 /* And the b-tree cursor, if applicable */ 2918 if( rc==LSM_OK && pCsr->pBtCsr ){ 2919 assert( bLast==0 ); 2920 rc = btreeCursorFirst(pCsr->pBtCsr); 2921 } 2922 2923 if( rc==LSM_OK ){ 2924 rc = multiCursorSetupTree(pCsr, bLast); 2925 } 2926 2927 return rc; 2928 } 2929 2930 2931 int mcursorSave(MultiCursor *pCsr){ 2932 int rc = LSM_OK; 2933 if( pCsr->aTree ){ 2934 int iTree = pCsr->aTree[1]; 2935 if( iTree==CURSOR_DATA_TREE0 || iTree==CURSOR_DATA_TREE1 ){ 2936 multiCursorCacheKey(pCsr, &rc); 2937 } 2938 } 2939 mcursorFreeComponents(pCsr); 2940 return rc; 2941 } 2942 2943 int mcursorRestore(lsm_db *pDb, MultiCursor *pCsr){ 2944 int rc; 2945 rc = multiCursorInit(pCsr, pDb->pClient); 2946 if( rc==LSM_OK && pCsr->key.pData ){ 2947 rc = lsmMCursorSeek(pCsr, 
2948 rtTopic(pCsr->eType), pCsr->key.pData, pCsr->key.nData, +1 2949 ); 2950 } 2951 return rc; 2952 } 2953 2954 int lsmSaveCursors(lsm_db *pDb){ 2955 int rc = LSM_OK; 2956 MultiCursor *pCsr; 2957 2958 for(pCsr=pDb->pCsr; rc==LSM_OK && pCsr; pCsr=pCsr->pNext){ 2959 rc = mcursorSave(pCsr); 2960 } 2961 return rc; 2962 } 2963 2964 int lsmRestoreCursors(lsm_db *pDb){ 2965 int rc = LSM_OK; 2966 MultiCursor *pCsr; 2967 2968 for(pCsr=pDb->pCsr; rc==LSM_OK && pCsr; pCsr=pCsr->pNext){ 2969 rc = mcursorRestore(pDb, pCsr); 2970 } 2971 return rc; 2972 } 2973 2974 int lsmMCursorFirst(MultiCursor *pCsr){ 2975 return multiCursorEnd(pCsr, 0); 2976 } 2977 2978 int lsmMCursorLast(MultiCursor *pCsr){ 2979 return multiCursorEnd(pCsr, 1); 2980 } 2981 2982 lsm_db *lsmMCursorDb(MultiCursor *pCsr){ 2983 return pCsr->pDb; 2984 } 2985 2986 void lsmMCursorReset(MultiCursor *pCsr){ 2987 int i; 2988 lsmTreeCursorReset(pCsr->apTreeCsr[0]); 2989 lsmTreeCursorReset(pCsr->apTreeCsr[1]); 2990 for(i=0; i<pCsr->nPtr; i++){ 2991 segmentPtrReset(&pCsr->aPtr[i], LSM_SEGMENTPTR_FREE_THRESHOLD); 2992 } 2993 pCsr->key.nData = 0; 2994 } 2995 2996 static int treeCursorSeek( 2997 MultiCursor *pCsr, 2998 TreeCursor *pTreeCsr, 2999 void *pKey, int nKey, 3000 int eSeek, 3001 int *pbStop 3002 ){ 3003 int rc = LSM_OK; 3004 if( pTreeCsr ){ 3005 int res = 0; 3006 lsmTreeCursorSeek(pTreeCsr, pKey, nKey, &res); 3007 switch( eSeek ){ 3008 case LSM_SEEK_EQ: { 3009 int eType = lsmTreeCursorFlags(pTreeCsr); 3010 if( (res<0 && (eType & LSM_START_DELETE)) 3011 || (res>0 && (eType & LSM_END_DELETE)) 3012 || (res==0 && (eType & LSM_POINT_DELETE)) 3013 ){ 3014 *pbStop = 1; 3015 }else if( res==0 && (eType & LSM_INSERT) ){ 3016 lsm_env *pEnv = pCsr->pDb->pEnv; 3017 void *p; int n; /* Key/value from tree-cursor */ 3018 *pbStop = 1; 3019 pCsr->flags |= CURSOR_SEEK_EQ; 3020 rc = lsmTreeCursorKey(pTreeCsr, &pCsr->eType, &p, &n); 3021 if( rc==LSM_OK ) rc = sortedBlobSet(pEnv, &pCsr->key, p, n); 3022 if( rc==LSM_OK ) rc = 
lsmTreeCursorValue(pTreeCsr, &p, &n); 3023 if( rc==LSM_OK ) rc = sortedBlobSet(pEnv, &pCsr->val, p, n); 3024 } 3025 lsmTreeCursorReset(pTreeCsr); 3026 break; 3027 } 3028 case LSM_SEEK_GE: 3029 if( res<0 && lsmTreeCursorValid(pTreeCsr) ){ 3030 lsmTreeCursorNext(pTreeCsr); 3031 } 3032 break; 3033 default: 3034 if( res>0 ){ 3035 assert( lsmTreeCursorValid(pTreeCsr) ); 3036 lsmTreeCursorPrev(pTreeCsr); 3037 } 3038 break; 3039 } 3040 } 3041 return rc; 3042 } 3043 3044 3045 /* 3046 ** Seek the cursor. 3047 */ 3048 int lsmMCursorSeek( 3049 MultiCursor *pCsr, 3050 int iTopic, 3051 void *pKey, int nKey, 3052 int eSeek 3053 ){ 3054 int eESeek = eSeek; /* Effective eSeek parameter */ 3055 int bStop = 0; /* Set to true to halt search operation */ 3056 int rc = LSM_OK; /* Return code */ 3057 int iPtr = 0; /* Used to iterate through pCsr->aPtr[] */ 3058 Pgno iPgno = 0; /* FC pointer value */ 3059 3060 assert( pCsr->apTreeCsr[0]==0 || iTopic==0 ); 3061 assert( pCsr->apTreeCsr[1]==0 || iTopic==0 ); 3062 3063 if( eESeek==LSM_SEEK_LEFAST ) eESeek = LSM_SEEK_LE; 3064 3065 assert( eESeek==LSM_SEEK_EQ || eESeek==LSM_SEEK_LE || eESeek==LSM_SEEK_GE ); 3066 assert( (pCsr->flags & CURSOR_FLUSH_FREELIST)==0 ); 3067 assert( pCsr->nPtr==0 || pCsr->aPtr[0].pLevel ); 3068 3069 pCsr->flags &= ~(CURSOR_NEXT_OK | CURSOR_PREV_OK | CURSOR_SEEK_EQ); 3070 rc = treeCursorSeek(pCsr, pCsr->apTreeCsr[0], pKey, nKey, eESeek, &bStop); 3071 if( rc==LSM_OK && bStop==0 ){ 3072 rc = treeCursorSeek(pCsr, pCsr->apTreeCsr[1], pKey, nKey, eESeek, &bStop); 3073 } 3074 3075 /* Seek all segment pointers. 
*/ 3076 for(iPtr=0; iPtr<pCsr->nPtr && rc==LSM_OK && bStop==0; iPtr++){ 3077 SegmentPtr *pPtr = &pCsr->aPtr[iPtr]; 3078 assert( pPtr->pSeg==&pPtr->pLevel->lhs ); 3079 rc = seekInLevel(pCsr, pPtr, eESeek, iTopic, pKey, nKey, &iPgno, &bStop); 3080 iPtr += pPtr->pLevel->nRight; 3081 } 3082 3083 if( eSeek!=LSM_SEEK_EQ ){ 3084 if( rc==LSM_OK ){ 3085 rc = multiCursorAllocTree(pCsr); 3086 } 3087 if( rc==LSM_OK ){ 3088 int i; 3089 for(i=pCsr->nTree-1; i>0; i--){ 3090 multiCursorDoCompare(pCsr, i, eESeek==LSM_SEEK_LE); 3091 } 3092 if( eSeek==LSM_SEEK_GE ) pCsr->flags |= CURSOR_NEXT_OK; 3093 if( eSeek==LSM_SEEK_LE ) pCsr->flags |= CURSOR_PREV_OK; 3094 } 3095 3096 multiCursorCacheKey(pCsr, &rc); 3097 if( rc==LSM_OK && eSeek!=LSM_SEEK_LEFAST && 0==mcursorLocationOk(pCsr, 0) ){ 3098 switch( eESeek ){ 3099 case LSM_SEEK_EQ: 3100 lsmMCursorReset(pCsr); 3101 break; 3102 case LSM_SEEK_GE: 3103 rc = lsmMCursorNext(pCsr); 3104 break; 3105 default: 3106 rc = lsmMCursorPrev(pCsr); 3107 break; 3108 } 3109 } 3110 } 3111 3112 return rc; 3113 } 3114 3115 int lsmMCursorValid(MultiCursor *pCsr){ 3116 int res = 0; 3117 if( pCsr->flags & CURSOR_SEEK_EQ ){ 3118 res = 1; 3119 }else if( pCsr->aTree ){ 3120 int iKey = pCsr->aTree[1]; 3121 if( iKey==CURSOR_DATA_TREE0 || iKey==CURSOR_DATA_TREE1 ){ 3122 res = lsmTreeCursorValid(pCsr->apTreeCsr[iKey-CURSOR_DATA_TREE0]); 3123 }else{ 3124 void *pKey; 3125 multiCursorGetKey(pCsr, iKey, 0, &pKey, 0); 3126 res = pKey!=0; 3127 } 3128 } 3129 return res; 3130 } 3131 3132 static int mcursorAdvanceOk( 3133 MultiCursor *pCsr, 3134 int bReverse, 3135 int *pRc 3136 ){ 3137 void *pNew; /* Pointer to buffer containing new key */ 3138 int nNew; /* Size of buffer pNew in bytes */ 3139 int eNewType; /* Type of new record */ 3140 3141 if( *pRc ) return 1; 3142 3143 /* Check the current key value. 
If it is not greater than (if bReverse==0) 3144 ** or less than (if bReverse!=0) the key currently cached in pCsr->key, 3145 ** then the cursor has not yet been successfully advanced. 3146 */ 3147 multiCursorGetKey(pCsr, pCsr->aTree[1], &eNewType, &pNew, &nNew); 3148 if( pNew ){ 3149 int typemask = (pCsr->flags & CURSOR_IGNORE_DELETE) ? ~(0) : LSM_SYSTEMKEY; 3150 int res = sortedDbKeyCompare(pCsr, 3151 eNewType & typemask, pNew, nNew, 3152 pCsr->eType & typemask, pCsr->key.pData, pCsr->key.nData 3153 ); 3154 3155 if( (bReverse==0 && res<=0) || (bReverse!=0 && res>=0) ){ 3156 return 0; 3157 } 3158 3159 multiCursorCacheKey(pCsr, pRc); 3160 assert( pCsr->eType==eNewType ); 3161 3162 /* If this cursor is configured to skip deleted keys, and the current 3163 ** cursor points to a SORTED_DELETE entry, then the cursor has not been 3164 ** successfully advanced. 3165 ** 3166 ** Similarly, if the cursor is configured to skip system keys and the 3167 ** current cursor points to a system key, it has not yet been advanced. 3168 */ 3169 if( *pRc==LSM_OK && 0==mcursorLocationOk(pCsr, 0) ) return 0; 3170 } 3171 return 1; 3172 } 3173 3174 static void flCsrAdvance(MultiCursor *pCsr){ 3175 assert( pCsr->flags & CURSOR_FLUSH_FREELIST ); 3176 if( pCsr->iFree % 2 ){ 3177 pCsr->iFree++; 3178 }else{ 3179 int nEntry = pCsr->pDb->pWorker->freelist.nEntry; 3180 FreelistEntry *aEntry = pCsr->pDb->pWorker->freelist.aEntry; 3181 3182 int i = nEntry - 1 - (pCsr->iFree / 2); 3183 3184 /* If the current entry is a delete and the "end-delete" key will not 3185 ** be attached to the next entry, increment iFree by 1 only. 
*/ 3186 if( aEntry[i].iId<0 ){ 3187 while( 1 ){ 3188 if( i==0 || aEntry[i-1].iBlk!=aEntry[i].iBlk-1 ){ 3189 pCsr->iFree--; 3190 break; 3191 } 3192 if( aEntry[i-1].iId>=0 ) break; 3193 pCsr->iFree += 2; 3194 i--; 3195 } 3196 } 3197 pCsr->iFree += 2; 3198 } 3199 } 3200 3201 static int multiCursorAdvance(MultiCursor *pCsr, int bReverse){ 3202 int rc = LSM_OK; /* Return Code */ 3203 if( lsmMCursorValid(pCsr) ){ 3204 do { 3205 int iKey = pCsr->aTree[1]; 3206 3207 assertCursorTree(pCsr); 3208 3209 /* If this multi-cursor is advancing forwards, and the sub-cursor 3210 ** being advanced is the one that separator keys may be being read 3211 ** from, record the current absolute pointer value. */ 3212 if( pCsr->pPrevMergePtr ){ 3213 if( iKey==(CURSOR_DATA_SEGMENT+pCsr->nPtr) ){ 3214 assert( pCsr->pBtCsr ); 3215 *pCsr->pPrevMergePtr = pCsr->pBtCsr->iPtr; 3216 }else if( pCsr->pBtCsr==0 && pCsr->nPtr>0 3217 && iKey==(CURSOR_DATA_SEGMENT+pCsr->nPtr-1) 3218 ){ 3219 SegmentPtr *pPtr = &pCsr->aPtr[iKey-CURSOR_DATA_SEGMENT]; 3220 *pCsr->pPrevMergePtr = pPtr->iPtr+pPtr->iPgPtr; 3221 } 3222 } 3223 3224 if( iKey==CURSOR_DATA_TREE0 || iKey==CURSOR_DATA_TREE1 ){ 3225 TreeCursor *pTreeCsr = pCsr->apTreeCsr[iKey-CURSOR_DATA_TREE0]; 3226 if( bReverse ){ 3227 rc = lsmTreeCursorPrev(pTreeCsr); 3228 }else{ 3229 rc = lsmTreeCursorNext(pTreeCsr); 3230 } 3231 }else if( iKey==CURSOR_DATA_SYSTEM ){ 3232 assert( pCsr->flags & CURSOR_FLUSH_FREELIST ); 3233 assert( bReverse==0 ); 3234 flCsrAdvance(pCsr); 3235 }else if( iKey==(CURSOR_DATA_SEGMENT+pCsr->nPtr) ){ 3236 assert( bReverse==0 && pCsr->pBtCsr ); 3237 rc = btreeCursorNext(pCsr->pBtCsr); 3238 }else{ 3239 rc = segmentCursorAdvance(pCsr, iKey-CURSOR_DATA_SEGMENT, bReverse); 3240 } 3241 if( rc==LSM_OK ){ 3242 int i; 3243 for(i=(iKey+pCsr->nTree)/2; i>0; i=i/2){ 3244 multiCursorDoCompare(pCsr, i, bReverse); 3245 } 3246 assertCursorTree(pCsr); 3247 } 3248 }while( mcursorAdvanceOk(pCsr, bReverse, &rc)==0 ); 3249 } 3250 return rc; 3251 } 3252 3253 int 
lsmMCursorNext(MultiCursor *pCsr){ 3254 if( (pCsr->flags & CURSOR_NEXT_OK)==0 ) return LSM_MISUSE_BKPT; 3255 return multiCursorAdvance(pCsr, 0); 3256 } 3257 3258 int lsmMCursorPrev(MultiCursor *pCsr){ 3259 if( (pCsr->flags & CURSOR_PREV_OK)==0 ) return LSM_MISUSE_BKPT; 3260 return multiCursorAdvance(pCsr, 1); 3261 } 3262 3263 int lsmMCursorKey(MultiCursor *pCsr, void **ppKey, int *pnKey){ 3264 if( (pCsr->flags & CURSOR_SEEK_EQ) || pCsr->aTree==0 ){ 3265 *pnKey = pCsr->key.nData; 3266 *ppKey = pCsr->key.pData; 3267 }else{ 3268 int iKey = pCsr->aTree[1]; 3269 3270 if( iKey==CURSOR_DATA_TREE0 || iKey==CURSOR_DATA_TREE1 ){ 3271 TreeCursor *pTreeCsr = pCsr->apTreeCsr[iKey-CURSOR_DATA_TREE0]; 3272 lsmTreeCursorKey(pTreeCsr, 0, ppKey, pnKey); 3273 }else{ 3274 int nKey; 3275 3276 #ifndef NDEBUG 3277 void *pKey; 3278 int eType; 3279 multiCursorGetKey(pCsr, iKey, &eType, &pKey, &nKey); 3280 assert( eType==pCsr->eType ); 3281 assert( nKey==pCsr->key.nData ); 3282 assert( memcmp(pKey, pCsr->key.pData, nKey)==0 ); 3283 #endif 3284 3285 nKey = pCsr->key.nData; 3286 if( nKey==0 ){ 3287 *ppKey = 0; 3288 }else{ 3289 *ppKey = pCsr->key.pData; 3290 } 3291 *pnKey = nKey; 3292 } 3293 } 3294 return LSM_OK; 3295 } 3296 3297 /* 3298 ** Compare the current key that cursor csr points to with pKey/nKey. Set 3299 ** *piRes to the result and return LSM_OK. 
3300 */ 3301 int lsm_csr_cmp(lsm_cursor *csr, const void *pKey, int nKey, int *piRes){ 3302 MultiCursor *pCsr = (MultiCursor *)csr; 3303 void *pCsrkey; int nCsrkey; 3304 int rc; 3305 rc = lsmMCursorKey(pCsr, &pCsrkey, &nCsrkey); 3306 if( rc==LSM_OK ){ 3307 int (*xCmp)(void *, int, void *, int) = pCsr->pDb->xCmp; 3308 *piRes = sortedKeyCompare(xCmp, 0, pCsrkey, nCsrkey, 0, (void *)pKey, nKey); 3309 } 3310 return rc; 3311 } 3312 3313 int lsmMCursorValue(MultiCursor *pCsr, void **ppVal, int *pnVal){ 3314 void *pVal; 3315 int nVal; 3316 int rc; 3317 if( (pCsr->flags & CURSOR_SEEK_EQ) || pCsr->aTree==0 ){ 3318 rc = LSM_OK; 3319 nVal = pCsr->val.nData; 3320 pVal = pCsr->val.pData; 3321 }else{ 3322 3323 assert( pCsr->aTree ); 3324 assert( mcursorLocationOk(pCsr, (pCsr->flags & CURSOR_IGNORE_DELETE)) ); 3325 3326 rc = multiCursorGetVal(pCsr, pCsr->aTree[1], &pVal, &nVal); 3327 if( pVal && rc==LSM_OK ){ 3328 rc = sortedBlobSet(pCsr->pDb->pEnv, &pCsr->val, pVal, nVal); 3329 pVal = pCsr->val.pData; 3330 } 3331 3332 if( rc!=LSM_OK ){ 3333 pVal = 0; 3334 nVal = 0; 3335 } 3336 } 3337 *ppVal = pVal; 3338 *pnVal = nVal; 3339 return rc; 3340 } 3341 3342 int lsmMCursorType(MultiCursor *pCsr, int *peType){ 3343 assert( pCsr->aTree ); 3344 multiCursorGetKey(pCsr, pCsr->aTree[1], peType, 0, 0); 3345 return LSM_OK; 3346 } 3347 3348 /* 3349 ** Buffer aData[], size nData, is assumed to contain a valid b-tree 3350 ** hierarchy page image. Return the offset in aData[] of the next free 3351 ** byte in the data area (where a new cell may be written if there is 3352 ** space). 
3353 */ 3354 static int mergeWorkerPageOffset(u8 *aData, int nData){ 3355 int nRec; 3356 int iOff; 3357 int nKey; 3358 int eType; 3359 3360 nRec = lsmGetU16(&aData[SEGMENT_NRECORD_OFFSET(nData)]); 3361 iOff = lsmGetU16(&aData[SEGMENT_CELLPTR_OFFSET(nData, nRec-1)]); 3362 eType = aData[iOff++]; 3363 assert( eType==0 3364 || eType==(LSM_SYSTEMKEY|LSM_SEPARATOR) 3365 || eType==(LSM_SEPARATOR) 3366 ); 3367 3368 iOff += lsmVarintGet32(&aData[iOff], &nKey); 3369 iOff += lsmVarintGet32(&aData[iOff], &nKey); 3370 3371 return iOff + (eType ? nKey : 0); 3372 } 3373 3374 /* 3375 ** Following a checkpoint operation, database pages that are part of the 3376 ** checkpointed state of the LSM are deemed read-only. This includes the 3377 ** right-most page of the b-tree hierarchy of any separators array under 3378 ** construction, and all pages between it and the b-tree root, inclusive. 3379 ** This is a problem, as when further pages are appended to the separators 3380 ** array, entries must be added to the indicated b-tree hierarchy pages. 3381 ** 3382 ** This function copies all such b-tree pages to new locations, so that 3383 ** they can be modified as required. 3384 ** 3385 ** The complication is that not all database pages are the same size - due 3386 ** to the way the file.c module works some (the first and last in each block) 3387 ** are 4 bytes smaller than the others. 
3388 */ 3389 static int mergeWorkerMoveHierarchy( 3390 MergeWorker *pMW, /* Merge worker */ 3391 int bSep /* True for separators run */ 3392 ){ 3393 lsm_db *pDb = pMW->pDb; /* Database handle */ 3394 int rc = LSM_OK; /* Return code */ 3395 int i; 3396 Page **apHier = pMW->hier.apHier; 3397 int nHier = pMW->hier.nHier; 3398 3399 for(i=0; rc==LSM_OK && i<nHier; i++){ 3400 Page *pNew = 0; 3401 rc = lsmFsSortedAppend(pDb->pFS, pDb->pWorker, pMW->pLevel, 1, &pNew); 3402 assert( rc==LSM_OK ); 3403 3404 if( rc==LSM_OK ){ 3405 u8 *a1; int n1; 3406 u8 *a2; int n2; 3407 3408 a1 = fsPageData(pNew, &n1); 3409 a2 = fsPageData(apHier[i], &n2); 3410 3411 assert( n1==n2 || n1+4==n2 ); 3412 3413 if( n1==n2 ){ 3414 memcpy(a1, a2, n2); 3415 }else{ 3416 int nEntry = pageGetNRec(a2, n2); 3417 int iEof1 = SEGMENT_EOF(n1, nEntry); 3418 int iEof2 = SEGMENT_EOF(n2, nEntry); 3419 3420 memcpy(a1, a2, iEof2 - 4); 3421 memcpy(&a1[iEof1], &a2[iEof2], n2 - iEof2); 3422 } 3423 3424 lsmFsPageRelease(apHier[i]); 3425 apHier[i] = pNew; 3426 3427 #if 0 3428 assert( n1==n2 || n1+4==n2 || n2+4==n1 ); 3429 if( n1>=n2 ){ 3430 /* If n1 (size of the new page) is equal to or greater than n2 (the 3431 ** size of the old page), then copy the data into the new page. If 3432 ** n1==n2, this could be done with a single memcpy(). However, 3433 ** since sometimes n1>n2, the page content and footer must be copied 3434 ** separately. 
*/ 3435 int nEntry = pageGetNRec(a2, n2); 3436 int iEof1 = SEGMENT_EOF(n1, nEntry); 3437 int iEof2 = SEGMENT_EOF(n2, nEntry); 3438 memcpy(a1, a2, iEof2); 3439 memcpy(&a1[iEof1], &a2[iEof2], n2 - iEof2); 3440 lsmFsPageRelease(apHier[i]); 3441 apHier[i] = pNew; 3442 }else{ 3443 lsmPutU16(&a1[SEGMENT_FLAGS_OFFSET(n1)], SEGMENT_BTREE_FLAG); 3444 lsmPutU16(&a1[SEGMENT_NRECORD_OFFSET(n1)], 0); 3445 lsmPutU64(&a1[SEGMENT_POINTER_OFFSET(n1)], 0); 3446 i = i - 1; 3447 lsmFsPageRelease(pNew); 3448 } 3449 #endif 3450 } 3451 } 3452 3453 #ifdef LSM_DEBUG 3454 if( rc==LSM_OK ){ 3455 for(i=0; i<nHier; i++) assert( lsmFsPageWritable(apHier[i]) ); 3456 } 3457 #endif 3458 3459 return rc; 3460 } 3461 3462 /* 3463 ** Allocate and populate the MergeWorker.apHier[] array. 3464 */ 3465 static int mergeWorkerLoadHierarchy(MergeWorker *pMW){ 3466 int rc = LSM_OK; 3467 Segment *pSeg; 3468 Hierarchy *p; 3469 3470 pSeg = &pMW->pLevel->lhs; 3471 p = &pMW->hier; 3472 3473 if( p->apHier==0 && pSeg->iRoot!=0 ){ 3474 FileSystem *pFS = pMW->pDb->pFS; 3475 lsm_env *pEnv = pMW->pDb->pEnv; 3476 Page **apHier = 0; 3477 int nHier = 0; 3478 int iPg = (int)pSeg->iRoot; 3479 3480 do { 3481 Page *pPg = 0; 3482 u8 *aData; 3483 int nData; 3484 int flags; 3485 3486 rc = lsmFsDbPageGet(pFS, pSeg, iPg, &pPg); 3487 if( rc!=LSM_OK ) break; 3488 3489 aData = fsPageData(pPg, &nData); 3490 flags = pageGetFlags(aData, nData); 3491 if( flags&SEGMENT_BTREE_FLAG ){ 3492 Page **apNew = (Page **)lsmRealloc( 3493 pEnv, apHier, sizeof(Page *)*(nHier+1) 3494 ); 3495 if( apNew==0 ){ 3496 rc = LSM_NOMEM_BKPT; 3497 break; 3498 } 3499 apHier = apNew; 3500 memmove(&apHier[1], &apHier[0], sizeof(Page *) * nHier); 3501 nHier++; 3502 3503 apHier[0] = pPg; 3504 iPg = (int)pageGetPtr(aData, nData); 3505 }else{ 3506 lsmFsPageRelease(pPg); 3507 break; 3508 } 3509 }while( 1 ); 3510 3511 if( rc==LSM_OK ){ 3512 u8 *aData; 3513 int nData; 3514 aData = fsPageData(apHier[0], &nData); 3515 pMW->aSave[0].iPgno = pageGetPtr(aData, nData); 3516 
p->nHier = nHier; 3517 p->apHier = apHier; 3518 rc = mergeWorkerMoveHierarchy(pMW, 0); 3519 }else{ 3520 int i; 3521 for(i=0; i<nHier; i++){ 3522 lsmFsPageRelease(apHier[i]); 3523 } 3524 lsmFree(pEnv, apHier); 3525 } 3526 } 3527 3528 return rc; 3529 } 3530 3531 /* 3532 ** B-tree pages use almost the same format as regular pages. The 3533 ** differences are: 3534 ** 3535 ** 1. The record format is (usually, see below) as follows: 3536 ** 3537 ** + Type byte (always SORTED_SEPARATOR or SORTED_SYSTEM_SEPARATOR), 3538 ** + Absolute pointer value (varint), 3539 ** + Number of bytes in key (varint), 3540 ** + Blob containing key data. 3541 ** 3542 ** 2. All pointer values are stored as absolute values (not offsets 3543 ** relative to the footer pointer value). 3544 ** 3545 ** 3. Each pointer that is part of a record points to a page that 3546 ** contains keys smaller than the records key (note: not "equal to or 3547 ** smaller than - smaller than"). 3548 ** 3549 ** 4. The pointer in the page footer of a b-tree page points to a page 3550 ** that contains keys equal to or larger than the largest key on the 3551 ** b-tree page. 3552 ** 3553 ** The reason for having the page footer pointer point to the right-child 3554 ** (instead of the left) is that doing things this way makes the 3555 ** mergeWorkerMoveHierarchy() operation less complicated (since the pointers 3556 ** that need to be updated are all stored as fixed-size integers within the 3557 ** page footer, not varints in page records). 3558 ** 3559 ** Records may not span b-tree pages. If this function is called to add a 3560 ** record larger than (page-size / 4) bytes, then a pointer to the indexed 3561 ** array page that contains the main record is added to the b-tree instead. 3562 ** In this case the record format is: 3563 ** 3564 ** + 0x00 byte (1 byte) 3565 ** + Absolute pointer value (varint), 3566 ** + Absolute page number of page containing key (varint). 
3567 ** 3568 ** See function seekInBtree() for the code that traverses b-tree pages. 3569 */ 3570 3571 static int mergeWorkerBtreeWrite( 3572 MergeWorker *pMW, 3573 u8 eType, 3574 Pgno iPtr, 3575 Pgno iKeyPg, 3576 void *pKey, 3577 int nKey 3578 ){ 3579 Hierarchy *p = &pMW->hier; 3580 lsm_db *pDb = pMW->pDb; /* Database handle */ 3581 int rc = LSM_OK; /* Return Code */ 3582 int iLevel; /* Level of b-tree hierachy to write to */ 3583 int nData; /* Size of aData[] in bytes */ 3584 u8 *aData; /* Page data for level iLevel */ 3585 int iOff; /* Offset on b-tree page to write record to */ 3586 int nRec; /* Initial number of records on b-tree page */ 3587 3588 /* iKeyPg should be zero for an ordinary b-tree key, or non-zero for an 3589 ** indirect key. The flags byte for an indirect key is 0x00. */ 3590 assert( (eType==0)==(iKeyPg!=0) ); 3591 3592 /* The MergeWorker.apHier[] array contains the right-most leaf of the b-tree 3593 ** hierarchy, the root node, and all nodes that lie on the path between. 3594 ** apHier[0] is the right-most leaf and apHier[pMW->nHier-1] is the current 3595 ** root page. 3596 ** 3597 ** This loop searches for a node with enough space to store the key on, 3598 ** starting with the leaf and iterating up towards the root. When the loop 3599 ** exits, the key may be written to apHier[iLevel]. */ 3600 for(iLevel=0; iLevel<=p->nHier; iLevel++){ 3601 int nByte; /* Number of free bytes required */ 3602 3603 if( iLevel==p->nHier ){ 3604 /* Extend the array and allocate a new root page. */ 3605 Page **aNew; 3606 aNew = (Page **)lsmRealloc( 3607 pMW->pDb->pEnv, p->apHier, sizeof(Page *)*(p->nHier+1) 3608 ); 3609 if( !aNew ){ 3610 return LSM_NOMEM_BKPT; 3611 } 3612 p->apHier = aNew; 3613 }else{ 3614 Page *pOld; 3615 int nFree; 3616 3617 /* If the key will fit on this page, break out of the loop here. 3618 ** The new entry will be written to page apHier[iLevel]. 
*/ 3619 pOld = p->apHier[iLevel]; 3620 assert( lsmFsPageWritable(pOld) ); 3621 aData = fsPageData(pOld, &nData); 3622 if( eType==0 ){ 3623 nByte = 2 + 1 + lsmVarintLen32((int)iPtr) + lsmVarintLen32((int)iKeyPg); 3624 }else{ 3625 nByte = 2 + 1 + lsmVarintLen32((int)iPtr) + lsmVarintLen32(nKey) + nKey; 3626 } 3627 nRec = pageGetNRec(aData, nData); 3628 nFree = SEGMENT_EOF(nData, nRec) - mergeWorkerPageOffset(aData, nData); 3629 if( nByte<=nFree ) break; 3630 3631 /* Otherwise, this page is full. Set the right-hand-child pointer 3632 ** to iPtr and release it. */ 3633 lsmPutU64(&aData[SEGMENT_POINTER_OFFSET(nData)], iPtr); 3634 assert( lsmFsPageNumber(pOld)==0 ); 3635 rc = lsmFsPagePersist(pOld); 3636 if( rc==LSM_OK ){ 3637 iPtr = lsmFsPageNumber(pOld); 3638 lsmFsPageRelease(pOld); 3639 } 3640 } 3641 3642 /* Allocate a new page for apHier[iLevel]. */ 3643 p->apHier[iLevel] = 0; 3644 if( rc==LSM_OK ){ 3645 rc = lsmFsSortedAppend( 3646 pDb->pFS, pDb->pWorker, pMW->pLevel, 1, &p->apHier[iLevel] 3647 ); 3648 } 3649 if( rc!=LSM_OK ) return rc; 3650 3651 aData = fsPageData(p->apHier[iLevel], &nData); 3652 memset(aData, 0, nData); 3653 lsmPutU16(&aData[SEGMENT_FLAGS_OFFSET(nData)], SEGMENT_BTREE_FLAG); 3654 lsmPutU16(&aData[SEGMENT_NRECORD_OFFSET(nData)], 0); 3655 3656 if( iLevel==p->nHier ){ 3657 p->nHier++; 3658 break; 3659 } 3660 } 3661 3662 /* Write the key into page apHier[iLevel]. 
*/ 3663 aData = fsPageData(p->apHier[iLevel], &nData); 3664 iOff = mergeWorkerPageOffset(aData, nData); 3665 nRec = pageGetNRec(aData, nData); 3666 lsmPutU16(&aData[SEGMENT_CELLPTR_OFFSET(nData, nRec)], (u16)iOff); 3667 lsmPutU16(&aData[SEGMENT_NRECORD_OFFSET(nData)], (u16)(nRec+1)); 3668 if( eType==0 ){ 3669 aData[iOff++] = 0x00; 3670 iOff += lsmVarintPut32(&aData[iOff], (int)iPtr); 3671 iOff += lsmVarintPut32(&aData[iOff], (int)iKeyPg); 3672 }else{ 3673 aData[iOff++] = eType; 3674 iOff += lsmVarintPut32(&aData[iOff], (int)iPtr); 3675 iOff += lsmVarintPut32(&aData[iOff], nKey); 3676 memcpy(&aData[iOff], pKey, nKey); 3677 } 3678 3679 return rc; 3680 } 3681 3682 static int mergeWorkerBtreeIndirect(MergeWorker *pMW){ 3683 int rc = LSM_OK; 3684 if( pMW->iIndirect ){ 3685 Pgno iKeyPg = pMW->aSave[1].iPgno; 3686 rc = mergeWorkerBtreeWrite(pMW, 0, pMW->iIndirect, iKeyPg, 0, 0); 3687 pMW->iIndirect = 0; 3688 } 3689 return rc; 3690 } 3691 3692 /* 3693 ** Append the database key (iTopic/pKey/nKey) to the b-tree under 3694 ** construction. This key has not yet been written to a segment page. 3695 ** The pointer that will accompany the new key in the b-tree - that 3696 ** points to the completed segment page that contains keys smaller than 3697 ** (pKey/nKey) is currently stored in pMW->aSave[0].iPgno. 3698 */ 3699 static int mergeWorkerPushHierarchy( 3700 MergeWorker *pMW, /* Merge worker object */ 3701 int iTopic, /* Topic value for this key */ 3702 void *pKey, /* Pointer to key buffer */ 3703 int nKey /* Size of pKey buffer in bytes */ 3704 ){ 3705 int rc = LSM_OK; /* Return Code */ 3706 Pgno iPtr; /* Pointer value to accompany pKey/nKey */ 3707 3708 assert( pMW->aSave[0].bStore==0 ); 3709 assert( pMW->aSave[1].bStore==0 ); 3710 rc = mergeWorkerBtreeIndirect(pMW); 3711 3712 /* Obtain the absolute pointer value to store along with the key in the 3713 ** page body. This pointer points to a page that contains keys that are 3714 ** smaller than pKey/nKey. 
*/ 3715 iPtr = pMW->aSave[0].iPgno; 3716 assert( iPtr!=0 ); 3717 3718 /* Determine if the indirect format should be used. */ 3719 if( (nKey*4 > lsmFsPageSize(pMW->pDb->pFS)) ){ 3720 pMW->iIndirect = iPtr; 3721 pMW->aSave[1].bStore = 1; 3722 }else{ 3723 rc = mergeWorkerBtreeWrite( 3724 pMW, (u8)(iTopic | LSM_SEPARATOR), iPtr, 0, pKey, nKey 3725 ); 3726 } 3727 3728 /* Ensure that the SortedRun.iRoot field is correct. */ 3729 return rc; 3730 } 3731 3732 static int mergeWorkerFinishHierarchy( 3733 MergeWorker *pMW /* Merge worker object */ 3734 ){ 3735 int i; /* Used to loop through apHier[] */ 3736 int rc = LSM_OK; /* Return code */ 3737 Pgno iPtr; /* New right-hand-child pointer value */ 3738 3739 iPtr = pMW->aSave[0].iPgno; 3740 for(i=0; i<pMW->hier.nHier && rc==LSM_OK; i++){ 3741 Page *pPg = pMW->hier.apHier[i]; 3742 int nData; /* Size of aData[] in bytes */ 3743 u8 *aData; /* Page data for pPg */ 3744 3745 aData = fsPageData(pPg, &nData); 3746 lsmPutU64(&aData[SEGMENT_POINTER_OFFSET(nData)], iPtr); 3747 3748 rc = lsmFsPagePersist(pPg); 3749 iPtr = lsmFsPageNumber(pPg); 3750 lsmFsPageRelease(pPg); 3751 } 3752 3753 if( pMW->hier.nHier ){ 3754 pMW->pLevel->lhs.iRoot = iPtr; 3755 lsmFree(pMW->pDb->pEnv, pMW->hier.apHier); 3756 pMW->hier.apHier = 0; 3757 pMW->hier.nHier = 0; 3758 } 3759 3760 return rc; 3761 } 3762 3763 static int mergeWorkerAddPadding( 3764 MergeWorker *pMW /* Merge worker object */ 3765 ){ 3766 FileSystem *pFS = pMW->pDb->pFS; 3767 return lsmFsSortedPadding(pFS, pMW->pDb->pWorker, &pMW->pLevel->lhs); 3768 } 3769 3770 /* 3771 ** Release all page references currently held by the merge-worker passed 3772 ** as the only argument. Unless an error has occurred, all pages have 3773 ** already been released. 
3774 */ 3775 static void mergeWorkerReleaseAll(MergeWorker *pMW){ 3776 int i; 3777 lsmFsPageRelease(pMW->pPage); 3778 pMW->pPage = 0; 3779 3780 for(i=0; i<pMW->hier.nHier; i++){ 3781 lsmFsPageRelease(pMW->hier.apHier[i]); 3782 pMW->hier.apHier[i] = 0; 3783 } 3784 lsmFree(pMW->pDb->pEnv, pMW->hier.apHier); 3785 pMW->hier.apHier = 0; 3786 pMW->hier.nHier = 0; 3787 } 3788 3789 static int keyszToSkip(FileSystem *pFS, int nKey){ 3790 int nPgsz; /* Nominal database page size */ 3791 nPgsz = lsmFsPageSize(pFS); 3792 return LSM_MIN(((nKey * 4) / nPgsz), 3); 3793 } 3794 3795 /* 3796 ** Release the reference to the current output page of merge-worker *pMW 3797 ** (reference pMW->pPage). Set the page number values in aSave[] as 3798 ** required (see comments above struct MergeWorker for details). 3799 */ 3800 static int mergeWorkerPersistAndRelease(MergeWorker *pMW){ 3801 int rc; 3802 int i; 3803 3804 assert( pMW->pPage || (pMW->aSave[0].bStore==0 && pMW->aSave[1].bStore==0) ); 3805 3806 /* Persist the page */ 3807 rc = lsmFsPagePersist(pMW->pPage); 3808 3809 /* If required, save the page number. */ 3810 for(i=0; i<2; i++){ 3811 if( pMW->aSave[i].bStore ){ 3812 pMW->aSave[i].iPgno = lsmFsPageNumber(pMW->pPage); 3813 pMW->aSave[i].bStore = 0; 3814 } 3815 } 3816 3817 /* Release the completed output page. */ 3818 lsmFsPageRelease(pMW->pPage); 3819 pMW->pPage = 0; 3820 return rc; 3821 } 3822 3823 /* 3824 ** Advance to the next page of an output run being populated by merge-worker 3825 ** pMW. The footer of the new page is initialized to indicate that it contains 3826 ** zero records. The flags field is cleared. The page footer pointer field 3827 ** is set to iFPtr. 3828 ** 3829 ** If successful, LSM_OK is returned. Otherwise, an error code. 
3830 */ 3831 static int mergeWorkerNextPage( 3832 MergeWorker *pMW, /* Merge worker object to append page to */ 3833 Pgno iFPtr /* Pointer value for footer of new page */ 3834 ){ 3835 int rc = LSM_OK; /* Return code */ 3836 Page *pNext = 0; /* New page appended to run */ 3837 lsm_db *pDb = pMW->pDb; /* Database handle */ 3838 3839 rc = lsmFsSortedAppend(pDb->pFS, pDb->pWorker, pMW->pLevel, 0, &pNext); 3840 assert( rc || pMW->pLevel->lhs.iFirst>0 || pMW->pDb->compress.xCompress ); 3841 3842 if( rc==LSM_OK ){ 3843 u8 *aData; /* Data buffer belonging to page pNext */ 3844 int nData; /* Size of aData[] in bytes */ 3845 3846 rc = mergeWorkerPersistAndRelease(pMW); 3847 3848 pMW->pPage = pNext; 3849 pMW->pLevel->pMerge->iOutputOff = 0; 3850 aData = fsPageData(pNext, &nData); 3851 lsmPutU16(&aData[SEGMENT_NRECORD_OFFSET(nData)], 0); 3852 lsmPutU16(&aData[SEGMENT_FLAGS_OFFSET(nData)], 0); 3853 lsmPutU64(&aData[SEGMENT_POINTER_OFFSET(nData)], iFPtr); 3854 pMW->nWork++; 3855 } 3856 3857 return rc; 3858 } 3859 3860 /* 3861 ** Write a blob of data into an output segment being populated by a 3862 ** merge-worker object. If argument bSep is true, write into the separators 3863 ** array. Otherwise, the main array. 3864 ** 3865 ** This function is used to write the blobs of data for keys and values. 
3866 */ 3867 static int mergeWorkerData( 3868 MergeWorker *pMW, /* Merge worker object */ 3869 int bSep, /* True to write to separators run */ 3870 int iFPtr, /* Footer ptr for new pages */ 3871 u8 *aWrite, /* Write data from this buffer */ 3872 int nWrite /* Size of aWrite[] in bytes */ 3873 ){ 3874 int rc = LSM_OK; /* Return code */ 3875 int nRem = nWrite; /* Number of bytes still to write */ 3876 3877 while( rc==LSM_OK && nRem>0 ){ 3878 Merge *pMerge = pMW->pLevel->pMerge; 3879 int nCopy; /* Number of bytes to copy */ 3880 u8 *aData; /* Pointer to buffer of current output page */ 3881 int nData; /* Size of aData[] in bytes */ 3882 int nRec; /* Number of records on current output page */ 3883 int iOff; /* Offset in aData[] to write to */ 3884 3885 assert( lsmFsPageWritable(pMW->pPage) ); 3886 3887 aData = fsPageData(pMW->pPage, &nData); 3888 nRec = pageGetNRec(aData, nData); 3889 iOff = pMerge->iOutputOff; 3890 nCopy = LSM_MIN(nRem, SEGMENT_EOF(nData, nRec) - iOff); 3891 3892 memcpy(&aData[iOff], &aWrite[nWrite-nRem], nCopy); 3893 nRem -= nCopy; 3894 3895 if( nRem>0 ){ 3896 rc = mergeWorkerNextPage(pMW, iFPtr); 3897 }else{ 3898 pMerge->iOutputOff = iOff + nCopy; 3899 } 3900 } 3901 3902 return rc; 3903 } 3904 3905 3906 /* 3907 ** The MergeWorker passed as the only argument is working to merge two or 3908 ** more existing segments together (not to flush an in-memory tree). It 3909 ** has not yet written the first key to the first page of the output. 
**
** This function appends the first output page. The footer pointer for
** that page is taken from the next level (if the cursor has a b-tree
** cursor), or else from the footer of the first page of the last segment
** in the cursor's aPtr[] array, or else is zero. The same value is stored
** in *pCsr->pPrevMergePtr, if that pointer is set.
*/
static int mergeWorkerFirstPage(MergeWorker *pMW){
  int rc = LSM_OK;                /* Return code */
  Page *pPg = 0;                  /* First page of run pSeg */
  int iFPtr = 0;                  /* Pointer value read from footer of pPg */
  MultiCursor *pCsr = pMW->pCsr;

  assert( pMW->pPage==0 );

  if( pCsr->pBtCsr ){
    rc = LSM_OK;
    iFPtr = (int)pMW->pLevel->pNext->lhs.iFirst;
  }else if( pCsr->nPtr>0 ){
    Segment *pSeg;
    pSeg = pCsr->aPtr[pCsr->nPtr-1].pSeg;
    rc = lsmFsDbPageGet(pMW->pDb->pFS, pSeg, pSeg->iFirst, &pPg);
    if( rc==LSM_OK ){
      u8 *aData;                    /* Buffer for page pPg */
      int nData;                    /* Size of aData[] in bytes */
      aData = fsPageData(pPg, &nData);
      iFPtr = (int)pageGetPtr(aData, nData);
      lsmFsPageRelease(pPg);
    }
  }

  if( rc==LSM_OK ){
    rc = mergeWorkerNextPage(pMW, iFPtr);
    if( pCsr->pPrevMergePtr ) *pCsr->pPrevMergePtr = iFPtr;
    /* Request that the number of this first page be saved in aSave[0]
    ** when it is persisted. */
    pMW->aSave[0].bStore = 1;
  }

  return rc;
}

/*
** Write a single record to the output segment of merge-worker pMW.
**
** The record header (type byte and varints) is always written to a single
** page. The key and value blobs that follow may continue onto subsequent
** pages. If the record is the first on its page, and the page is not the
** first page of the run, either a copy of the key is pushed into the
** b-tree hierarchy or, while Merge.nSkip is non-zero, the page is flagged
** PGFTR_SKIP_THIS_FLAG instead.
**
** Returns LSM_OK if successful, or an LSM error code otherwise.
*/
static int mergeWorkerWrite(
  MergeWorker *pMW,               /* Merge worker object to write into */
  int eType,                      /* One of SORTED_SEPARATOR, WRITE or DELETE */
  void *pKey, int nKey,           /* Key value */
  void *pVal, int nVal,           /* Value value */
  int iPtr                        /* Absolute value of page pointer, or 0 */
){
  int rc = LSM_OK;                /* Return code */
  Merge *pMerge;                  /* Persistent part of level merge state */
  int nHdr;                       /* Space required for this record header */
  Page *pPg;                      /* Page to write to */
  u8 *aData;                      /* Data buffer for page pWriter->pPage */
  int nData = 0;                  /* Size of buffer aData[] in bytes */
  int nRec = 0;                   /* Number of records on page pPg */
  int iFPtr = 0;                  /* Value of pointer in footer of pPg */
  int iRPtr = 0;                  /* Value of pointer written into record */
  int iOff = 0;                   /* Current write offset within page pPg */
  Segment *pSeg;                  /* Segment being written */
  int flags = 0;                  /* If != 0, flags value for page footer */
  int bFirst = 0;                 /* True for first key of output run */

  pMerge = pMW->pLevel->pMerge;
  pSeg = &pMW->pLevel->lhs;

  /* If nothing has been written to the output segment yet, append its
  ** first page now. */
  if( pSeg->iFirst==0 && pMW->pPage==0 ){
    rc = mergeWorkerFirstPage(pMW);
    bFirst = 1;
  }
  pPg = pMW->pPage;
  if( pPg ){
    aData = fsPageData(pPg, &nData);
    nRec = pageGetNRec(aData, nData);
    iFPtr = (int)pageGetPtr(aData, nData);
    /* The pointer stored in the record is relative to the footer pointer. */
    iRPtr = iPtr - iFPtr;
  }

  /* Figure out how much space is required by the new record. The space
  ** required is divided into two sections: the header and the body. The
  ** header consists of the initial varint fields. The body are the blobs
  ** of data that correspond to the key and value data. The entire header
  ** must be stored on the page. The body may overflow onto the next and
  ** subsequent pages.
  **
  ** The header space is:
  **
  **     1) record type - 1 byte.
  **     2) Page-pointer-offset - 1 varint
  **     3) Key size - 1 varint
  **     4) Value size - 1 varint (only if LSM_INSERT flag is set)
  */
  if( rc==LSM_OK ){
    nHdr = 1 + lsmVarintLen32(iRPtr) + lsmVarintLen32(nKey);
    if( rtIsWrite(eType) ) nHdr += lsmVarintLen32(nVal);

    /* If the entire header will not fit on page pPg, or if page pPg is
    ** marked read-only, advance to the next page of the output run. */
    iOff = pMerge->iOutputOff;
    if( iOff<0 || pPg==0 || iOff+nHdr > SEGMENT_EOF(nData, nRec+1) ){
      /* NOTE(review): pPrevMergePtr is dereferenced without the NULL
      ** check used by mergeWorkerFirstPage() - confirm it is always set
      ** by the time this branch can run. */
      iFPtr = (int)*pMW->pCsr->pPrevMergePtr;
      iRPtr = iPtr - iFPtr;
      iOff = 0;
      nRec = 0;
      rc = mergeWorkerNextPage(pMW, iFPtr);
      pPg = pMW->pPage;
    }
  }

  /* If this record header will be the first on the page, and the page is
  ** not the very first in the entire run, add a copy of the key to the
  ** b-tree hierarchy.
  */
  if( rc==LSM_OK && nRec==0 && bFirst==0 ){
    assert( pMerge->nSkip>=0 );

    if( pMerge->nSkip==0 ){
      rc = mergeWorkerPushHierarchy(pMW, rtTopic(eType), pKey, nKey);
      assert( pMW->aSave[0].bStore==0 );
      pMW->aSave[0].bStore = 1;
      pMerge->nSkip = keyszToSkip(pMW->pDb->pFS, nKey);
    }else{
      pMerge->nSkip--;
      flags = PGFTR_SKIP_THIS_FLAG;
    }

    if( pMerge->nSkip ) flags |= PGFTR_SKIP_NEXT_FLAG;
  }

  /* Update the output segment */
  if( rc==LSM_OK ){
    aData = fsPageData(pPg, &nData);

    /* Update the page footer. */
    lsmPutU16(&aData[SEGMENT_NRECORD_OFFSET(nData)], (u16)(nRec+1));
    lsmPutU16(&aData[SEGMENT_CELLPTR_OFFSET(nData, nRec)], (u16)iOff);
    if( flags ) lsmPutU16(&aData[SEGMENT_FLAGS_OFFSET(nData)], (u16)flags);

    /* Write the entry header into the current page. */
    aData[iOff++] = (u8)eType;                                           /* 1 */
    iOff += lsmVarintPut32(&aData[iOff], iRPtr);                         /* 2 */
    iOff += lsmVarintPut32(&aData[iOff], nKey);                          /* 3 */
    if( rtIsWrite(eType) ) iOff += lsmVarintPut32(&aData[iOff], nVal);   /* 4 */
    pMerge->iOutputOff = iOff;

    /* Write the key and data into the segment. Any pages appended while
    ** doing so get (iFPtr+iRPtr) as their footer pointer. */
    assert( iFPtr==pageGetPtr(aData, nData) );
    rc = mergeWorkerData(pMW, 0, iFPtr+iRPtr, pKey, nKey);
    if( rc==LSM_OK && rtIsWrite(eType) ){
      if( rc==LSM_OK ){
        rc = mergeWorkerData(pMW, 0, iFPtr+iRPtr, pVal, nVal);
      }
    }
  }

  return rc;
}


/*
** Free all resources allocated by mergeWorkerInit().
**
** *pRc is both an input and an output. If it is LSM_OK on entry, the
** output page and the b-tree hierarchy are persisted before being
** released. In all cases the cursor is closed, all page references and
** the aGobble[] array are released, and the final error code is written
** back to *pRc.
*/
static void mergeWorkerShutdown(MergeWorker *pMW, int *pRc){
  int i;                          /* Iterator variable */
  int rc = *pRc;
  MultiCursor *pCsr = pMW->pCsr;

  /* Unless the merge has finished, save the cursor position in the
  ** Merge.aInput[] array. See function mergeWorkerInit() for the
  ** code to restore a cursor position based on aInput[].  */
  if( rc==LSM_OK && pCsr && lsmMCursorValid(pCsr) ){
    Merge *pMerge = pMW->pLevel->pMerge;
    int bBtree = (pCsr->pBtCsr!=0);
    int iPtr;

    /* pMerge->nInput==0 indicates that this is a FlushTree() operation. */
    assert( pMerge->nInput==0 || pMW->pLevel->nRight>0 );
    assert( pMerge->nInput==0 || pMerge->nInput==(pCsr->nPtr+bBtree) );

    /* Save the position of each segment cursor. A segment cursor with no
    ** current page is recorded as page 0, cell 0. */
    for(i=0; i<(pMerge->nInput-bBtree); i++){
      SegmentPtr *pPtr = &pCsr->aPtr[i];
      if( pPtr->pPg ){
        pMerge->aInput[i].iPg = lsmFsPageNumber(pPtr->pPg);
        pMerge->aInput[i].iCell = pPtr->iCell;
      }else{
        pMerge->aInput[i].iPg = 0;
        pMerge->aInput[i].iCell = 0;
      }
    }
    /* The b-tree cursor, if any, occupies the final aInput[] slot. */
    if( bBtree && pMerge->nInput ){
      assert( i==pCsr->nPtr );
      btreeCursorPosition(pCsr->pBtCsr, &pMerge->aInput[i]);
    }

    /* Store the location of the split-key */
    iPtr = pCsr->aTree[1] - CURSOR_DATA_SEGMENT;
    if( iPtr<pCsr->nPtr ){
      pMerge->splitkey = pMerge->aInput[iPtr];
    }else{
      btreeCursorSplitkey(pCsr->pBtCsr, &pMerge->splitkey);
    }

    /* A negative output offset makes the next mergeWorkerWrite() call
    ** start on a new page. */
    pMerge->iOutputOff = -1;
  }

  lsmMCursorClose(pCsr, 0);

  /* Persist and release the output page. */
  if( rc==LSM_OK ) rc = mergeWorkerPersistAndRelease(pMW);
  if( rc==LSM_OK ) rc = mergeWorkerBtreeIndirect(pMW);
  if( rc==LSM_OK ) rc = mergeWorkerFinishHierarchy(pMW);
  if( rc==LSM_OK ) rc = mergeWorkerAddPadding(pMW);
  lsmFsFlushWaiting(pMW->pDb->pFS, &rc);
  mergeWorkerReleaseAll(pMW);

  lsmFree(pMW->pDb->pEnv, pMW->aGobble);
  pMW->aGobble = 0;
  pMW->pCsr = 0;

  *pRc = rc;
}

/*
** The cursor passed as the first argument is being used as the input for
** a merge operation. When this function is called, *piFlags contains the
** database entry flags for the current entry. The entry about to be written
** to the output.
**
** Note that this function only has to work for cursors configured to
** iterate forwards (not backwards).
**
** On return *piFlags holds the flags to write with the output entry. A
** value of zero means no entry needs to be written (see mergeWorkerStep()).
** If the insert being output comes from a sub-cursor other than the
** current one, *piVal is set to the index of that sub-cursor.
*/
static void mergeRangeDeletes(MultiCursor *pCsr, int *piVal, int *piFlags){
  int f = *piFlags;
  int iKey = pCsr->aTree[1];
  int i;

  assert( pCsr->flags & CURSOR_NEXT_OK );
  if( pCsr->flags & CURSOR_IGNORE_DELETE ){
    /* The ignore-delete flag is set when the output of the merge will form
    ** the oldest level in the database. In this case there is no point in
    ** retaining any range-delete flags.  */
    assert( (f & LSM_POINT_DELETE)==0 );
    f &= ~(LSM_START_DELETE|LSM_END_DELETE);
  }else{
    /* Examine the current key of every other sub-cursor. */
    for(i=0; i<(CURSOR_DATA_SEGMENT + pCsr->nPtr); i++){
      if( i!=iKey ){
        int eType;
        void *pKey;
        int nKey;
        int res;
        multiCursorGetKey(pCsr, i, &eType, &pKey, &nKey);

        if( pKey ){
          res = sortedKeyCompare(pCsr->pDb->xCmp,
              rtTopic(pCsr->eType), pCsr->key.pData, pCsr->key.nData,
              rtTopic(eType), pKey, nKey
          );
          assert( res<=0 );
          if( res==0 ){
            /* Sub-cursor i is positioned on the same key. If the current
            ** entry carries neither an insert nor a point-delete, adopt
            ** the one from sub-cursor i. Range-delete flags are always
            ** combined. */
            if( (f & (LSM_INSERT|LSM_POINT_DELETE))==0 ){
              if( eType & LSM_INSERT ){
                f |= LSM_INSERT;
                *piVal = i;
              }
              else if( eType & LSM_POINT_DELETE ){
                f |= LSM_POINT_DELETE;
              }
            }
            f |= (eType & (LSM_END_DELETE|LSM_START_DELETE));
          }

          /* A later sub-cursor is positioned on a larger key that has
          ** the END_DELETE flag set. Mark an insert or point-delete with
          ** both range-delete flags; otherwise clear the flags entirely. */
          if( i>iKey && (eType & LSM_END_DELETE) && res<0 ){
            if( f & (LSM_INSERT|LSM_POINT_DELETE) ){
              f |= (LSM_END_DELETE|LSM_START_DELETE);
            }else{
              f = 0;
            }
            break;
          }
        }
      }
    }

    assert( (f & LSM_INSERT)==0 || (f & LSM_POINT_DELETE)==0 );
    /* A point-delete carrying both range-delete flags is cleared. */
    if( (f & LSM_START_DELETE)
     && (f & LSM_END_DELETE)
     && (f & LSM_POINT_DELETE )
    ){
      f = 0;
    }
  }

  *piFlags = f;
}

/*
** Perform one step of a merge: read the entry the input cursor currently
** points to, write it to the output segment if required, then advance the
** cursor to the next entry.
**
** Returns LSM_OK if successful, or an LSM error code otherwise.
*/
static int mergeWorkerStep(MergeWorker *pMW){
  lsm_db *pDb = pMW->pDb;       /* Database handle */
  MultiCursor *pCsr;            /* Cursor to read input data from */
  int rc = LSM_OK;              /* Return code */
  int eType;                    /* SORTED_SEPARATOR, WRITE or DELETE */
  void *pKey; int nKey;         /* Key */
  Pgno iPtr;
  int iVal;

  pCsr = pMW->pCsr;

  /* Pull the next record out of the source cursor. */
  lsmMCursorKey(pCsr, &pKey, &nKey);
  eType = pCsr->eType;

  /* Figure out if the output record may have a different pointer value
  ** than the previous. This is the case if the current key is identical to
  ** a key that appears in the lowest level run being merged. If so, set
  ** iPtr to the absolute pointer value. If not, leave iPtr set to zero,
  ** indicating that the output pointer value should be a copy of the pointer
  ** value written with the previous key.  */
  iPtr = (pCsr->pPrevMergePtr ? *pCsr->pPrevMergePtr : 0);
  if( pCsr->pBtCsr ){
    BtreeCursor *pBtCsr = pCsr->pBtCsr;
    if( pBtCsr->pKey ){
      int res = rtTopic(pBtCsr->eType) - rtTopic(eType);
      if( res==0 ) res = pDb->xCmp(pBtCsr->pKey, pBtCsr->nKey, pKey, nKey);
      if( 0==res ) iPtr = pBtCsr->iPtr;
      assert( res>=0 );
    }
  }else if( pCsr->nPtr ){
    SegmentPtr *pPtr = &pCsr->aPtr[pCsr->nPtr-1];
    if( pPtr->pPg
     && 0==pDb->xCmp(pPtr->pKey, pPtr->nKey, pKey, nKey)
    ){
      iPtr = pPtr->iPtr+pPtr->iPgPtr;
    }
  }

  /* iVal identifies the sub-cursor to read the value from. It may be
  ** changed by mergeRangeDeletes(). */
  iVal = pCsr->aTree[1];
  mergeRangeDeletes(pCsr, &iVal, &eType);

  if( eType!=0 ){
    if( pMW->aGobble ){
      /* If the current entry comes from a segment, record the number of
      ** the page it is on, unless that page has PGFTR_SKIP_THIS_FLAG set. */
      int iGobble = pCsr->aTree[1] - CURSOR_DATA_SEGMENT;
      if( iGobble<pCsr->nPtr && iGobble>=0 ){
        SegmentPtr *pGobble = &pCsr->aPtr[iGobble];
        if( (pGobble->flags & PGFTR_SKIP_THIS_FLAG)==0 ){
          pMW->aGobble[iGobble] = lsmFsPageNumber(pGobble->pPg);
        }
      }
    }

    /* If this is a separator key and we know that the output pointer has not
    ** changed, there is no point in writing an output record. Otherwise,
    ** proceed. */
    if( rc==LSM_OK && (rtIsSeparator(eType)==0 || iPtr!=0) ){
      /* Write the record into the main run. */
      void *pVal; int nVal;
      rc = multiCursorGetVal(pCsr, iVal, &pVal, &nVal);
      if( pVal && rc==LSM_OK ){
        assert( nVal>=0 );
        rc = sortedBlobSet(pDb->pEnv, &pCsr->val, pVal, nVal);
        pVal = pCsr->val.pData;
      }
      if( rc==LSM_OK ){
        rc = mergeWorkerWrite(pMW, eType, pKey, nKey, pVal, nVal, (int)iPtr);
      }
    }
  }

  /* Advance the cursor to the next input record (assuming one exists). */
  assert( lsmMCursorValid(pMW->pCsr) );
  if( rc==LSM_OK ) rc = lsmMCursorNext(pMW->pCsr);

  return rc;
}

/*
** Return true if the merge-worker has no input cursor, or if its input
** cursor no longer points to a valid entry.
*/
static int mergeWorkerDone(MergeWorker *pMW){
  return pMW->pCsr==0 || !lsmMCursorValid(pMW->pCsr);
}

/*
** Free Level object p along with its split-key, Merge object and
** right-hand-side segment array. A NULL argument is a no-op.
*/
static void sortedFreeLevel(lsm_env *pEnv, Level *p){
  if( p ){
    lsmFree(pEnv, p->pSplitKey);
    lsmFree(pEnv, p->pMerge);
    lsmFree(pEnv, p->aRhs);
    lsmFree(pEnv, p);
  }
}

/*
** Invoke the work-hook callback of database handle pDb, if one is
** configured.
*/
static void sortedInvokeWorkHook(lsm_db *pDb){
  if( pDb->xWork ){
    pDb->xWork(pDb, pDb->pWorkCtx);
  }
}

/*
** Create a new top-level segment from the contents of the in-memory tree
** identified by eTree and the free-list. If eTree is TREE_NONE, the new
** level is flagged LEVEL_FREELIST_ONLY.
**
** If pnWrite is not NULL, *pnWrite is set to the number of database pages
** written. The same number is added to pDb->pWorker->nWrite.
**
** If an error occurs, or if nothing was written, the new level is
** discarded and the previous top level is restored.
*/
static int sortedNewToplevel(
  lsm_db *pDb,                    /* Connection handle */
  int eTree,                      /* One of the TREE_XXX constants */
  int *pnWrite                    /* OUT: Number of database pages written */
){
  int rc = LSM_OK;                /* Return Code */
  MultiCursor *pCsr = 0;
  Level *pNext = 0;               /* The current top level */
  Level *pNew;                    /* The new level itself */
  Segment *pLinked = 0;           /* Delete separators from this segment */
  Level *pDel = 0;                /* Delete this entire level */
  int nWrite = 0;                 /* Number of database pages written */
  Freelist freelist;

  if( eTree!=TREE_NONE ){
    rc = lsmShmCacheChunks(pDb, pDb->treehdr.nChunk);
  }

  /* Install the stack-allocated free-list on the handle for the duration
  ** of this call. It is uninstalled again before returning. */
  assert( pDb->bUseFreelist==0 );
  pDb->pFreelist = &freelist;
  pDb->bUseFreelist = 1;
  memset(&freelist, 0, sizeof(freelist));

  /* Allocate the new level structure to write to. */
  pNext = lsmDbSnapshotLevel(pDb->pWorker);
  pNew = (Level *)lsmMallocZeroRc(pDb->pEnv, sizeof(Level), &rc);
  if( pNew ){
    pNew->pNext = pNext;
    lsmDbSnapshotSetLevel(pDb->pWorker, pNew);
  }

  /* Create a cursor to gather the data required by the new segment. The new
  ** segment contains everything in the tree and pointers to the next segment
  ** in the database (if any).  */
  pCsr = multiCursorNew(pDb, &rc);
  if( pCsr ){
    pCsr->pDb = pDb;
    rc = multiCursorVisitFreelist(pCsr);
    if( rc==LSM_OK ){
      rc = multiCursorAddTree(pCsr, pDb->pWorker, eTree);
    }
    if( rc==LSM_OK && pNext && pNext->pMerge==0 ){
      if( (pNext->flags & LEVEL_FREELIST_ONLY) ){
        /* The current top level is a free-list-only level. Read it as an
        ** input, and delete it once the new level is complete. */
        pDel = pNext;
        pCsr->aPtr = lsmMallocZeroRc(pDb->pEnv, sizeof(SegmentPtr), &rc);
        multiCursorAddOne(pCsr, pNext, &rc);
      }else if( eTree!=TREE_NONE && pNext->lhs.iRoot ){
        /* Read the b-tree of the current top level. Its iRoot is cleared
        ** once the new level is complete. */
        pLinked = &pNext->lhs;
        rc = btreeCursorNew(pDb, pLinked, &pCsr->pBtCsr);
      }
    }

    /* If this will be the only segment in the database, discard any delete
    ** markers present in the in-memory tree. */
    if( pNext==0 ){
      multiCursorIgnoreDelete(pCsr);
    }
  }

  if( rc!=LSM_OK ){
    lsmMCursorClose(pCsr, 0);
  }else{
    Pgno iLeftPtr = 0;
    Merge merge;                  /* Merge object used to create new level */
    MergeWorker mergeworker;      /* MergeWorker object for the same purpose */

    memset(&merge, 0, sizeof(Merge));
    memset(&mergeworker, 0, sizeof(MergeWorker));

    /* pNew->pMerge points at a stack object only while the level is being
    ** written. It is reset to NULL below. */
    pNew->pMerge = &merge;
    pNew->flags |= LEVEL_INCOMPLETE;
    mergeworker.pDb = pDb;
    mergeworker.pLevel = pNew;
    mergeworker.pCsr = pCsr;
    pCsr->pPrevMergePtr = &iLeftPtr;

    /* Mark the separators array for the new level as a "phantom". */
    mergeworker.bFlush = 1;

    /* Do the work to create the new merged segment on disk */
    if( rc==LSM_OK ) rc = lsmMCursorFirst(pCsr);
    while( rc==LSM_OK && mergeWorkerDone(&mergeworker)==0 ){
      rc = mergeWorkerStep(&mergeworker);
    }
    mergeWorkerShutdown(&mergeworker, &rc);
    assert( rc!=LSM_OK || mergeworker.nWork==0 || pNew->lhs.iFirst );
    if( rc==LSM_OK && pNew->lhs.iFirst ){
      rc = lsmFsSortedFinish(pDb->pFS, &pNew->lhs);
    }
    nWrite = mergeworker.nWork;
    pNew->flags &= ~LEVEL_INCOMPLETE;
    if( eTree==TREE_NONE ){
      pNew->flags |= LEVEL_FREELIST_ONLY;
    }
    pNew->pMerge = 0;
  }

  if( rc!=LSM_OK || pNew->lhs.iFirst==0 ){
    /* An error occurred or nothing was written. Restore the previous top
    ** level and discard the new one. */
    assert( rc!=LSM_OK || pDb->pWorker->freelist.nEntry==0 );
    lsmDbSnapshotSetLevel(pDb->pWorker, pNext);
    sortedFreeLevel(pDb->pEnv, pNew);
  }else{
    if( pLinked ){
      pLinked->iRoot = 0;
    }else if( pDel ){
      assert( pNew->pNext==pDel );
      pNew->pNext = pDel->pNext;
      lsmFsSortedDelete(pDb->pFS, pDb->pWorker, 1, &pDel->lhs);
      sortedFreeLevel(pDb->pEnv, pDel);
    }

#if LSM_LOG_STRUCTURE
    lsmSortedDumpStructure(pDb, pDb->pWorker, LSM_LOG_DATA, 0, "new-toplevel");
#endif

    if( freelist.nEntry ){
      /* Replace the worker snapshot's free-list with the local one. The
      ** aEntry[] array now belongs to the snapshot, so clear the local
      ** pointer to stop it being freed below. */
      Freelist *p = &pDb->pWorker->freelist;
      lsmFree(pDb->pEnv, p->aEntry);
      memcpy(p, &freelist, sizeof(freelist));
      freelist.aEntry = 0;
    }else{
      pDb->pWorker->freelist.nEntry = 0;
    }

    assertBtreeOk(pDb, &pNew->lhs);
    sortedInvokeWorkHook(pDb);
  }

  if( pnWrite ) *pnWrite = nWrite;
  pDb->pWorker->nWrite += nWrite;
  pDb->pFreelist = 0;
  pDb->bUseFreelist = 0;
  lsmFree(pDb->pEnv, freelist.aEntry);
  return rc;
}

/*
** The nMerge levels in the LSM beginning with pLevel consist of a
** left-hand-side segment only.
Replace these levels with a single new 4433 ** level consisting of a new empty segment on the left-hand-side and the 4434 ** nMerge segments from the replaced levels on the right-hand-side. 4435 ** 4436 ** Also, allocate and populate a Merge object and set Level.pMerge to 4437 ** point to it. 4438 */ 4439 static int sortedMergeSetup( 4440 lsm_db *pDb, /* Database handle */ 4441 Level *pLevel, /* First level to merge */ 4442 int nMerge, /* Merge this many levels together */ 4443 Level **ppNew /* New, merged, level */ 4444 ){ 4445 int rc = LSM_OK; /* Return Code */ 4446 Level *pNew; /* New Level object */ 4447 int bUseNext = 0; /* True to link in next separators */ 4448 Merge *pMerge; /* New Merge object */ 4449 int nByte; /* Bytes of space allocated at pMerge */ 4450 4451 #ifdef LSM_DEBUG 4452 int iLevel; 4453 Level *pX = pLevel; 4454 for(iLevel=0; iLevel<nMerge; iLevel++){ 4455 assert( pX->nRight==0 ); 4456 pX = pX->pNext; 4457 } 4458 #endif 4459 4460 /* Allocate the new Level object */ 4461 pNew = (Level *)lsmMallocZeroRc(pDb->pEnv, sizeof(Level), &rc); 4462 if( pNew ){ 4463 pNew->aRhs = (Segment *)lsmMallocZeroRc(pDb->pEnv, 4464 nMerge * sizeof(Segment), &rc); 4465 } 4466 4467 /* Populate the new Level object */ 4468 if( rc==LSM_OK ){ 4469 Level *pNext = 0; /* Level following pNew */ 4470 int i; 4471 int bFreeOnly = 1; 4472 Level *pTopLevel; 4473 Level *p = pLevel; 4474 Level **pp; 4475 pNew->nRight = nMerge; 4476 pNew->iAge = pLevel->iAge+1; 4477 for(i=0; i<nMerge; i++){ 4478 assert( p->nRight==0 ); 4479 pNext = p->pNext; 4480 pNew->aRhs[i] = p->lhs; 4481 if( (p->flags & LEVEL_FREELIST_ONLY)==0 ) bFreeOnly = 0; 4482 sortedFreeLevel(pDb->pEnv, p); 4483 p = pNext; 4484 } 4485 4486 if( bFreeOnly ) pNew->flags |= LEVEL_FREELIST_ONLY; 4487 4488 /* Replace the old levels with the new. 
*/ 4489 pTopLevel = lsmDbSnapshotLevel(pDb->pWorker); 4490 pNew->pNext = p; 4491 for(pp=&pTopLevel; *pp!=pLevel; pp=&((*pp)->pNext)); 4492 *pp = pNew; 4493 lsmDbSnapshotSetLevel(pDb->pWorker, pTopLevel); 4494 4495 /* Determine whether or not the next separators will be linked in */ 4496 if( pNext && pNext->pMerge==0 && pNext->lhs.iRoot && pNext 4497 && (bFreeOnly==0 || (pNext->flags & LEVEL_FREELIST_ONLY)) 4498 ){ 4499 bUseNext = 1; 4500 } 4501 } 4502 4503 /* Allocate the merge object */ 4504 nByte = sizeof(Merge) + sizeof(MergeInput) * (nMerge + bUseNext); 4505 pMerge = (Merge *)lsmMallocZeroRc(pDb->pEnv, nByte, &rc); 4506 if( pMerge ){ 4507 pMerge->aInput = (MergeInput *)&pMerge[1]; 4508 pMerge->nInput = nMerge + bUseNext; 4509 pNew->pMerge = pMerge; 4510 } 4511 4512 *ppNew = pNew; 4513 return rc; 4514 } 4515 4516 static int mergeWorkerInit( 4517 lsm_db *pDb, /* Db connection to do merge work */ 4518 Level *pLevel, /* Level to work on merging */ 4519 MergeWorker *pMW /* Object to initialize */ 4520 ){ 4521 int rc = LSM_OK; /* Return code */ 4522 Merge *pMerge = pLevel->pMerge; /* Persistent part of merge state */ 4523 MultiCursor *pCsr = 0; /* Cursor opened for pMW */ 4524 Level *pNext = pLevel->pNext; /* Next level in LSM */ 4525 4526 assert( pDb->pWorker ); 4527 assert( pLevel->pMerge ); 4528 assert( pLevel->nRight>0 ); 4529 4530 memset(pMW, 0, sizeof(MergeWorker)); 4531 pMW->pDb = pDb; 4532 pMW->pLevel = pLevel; 4533 pMW->aGobble = lsmMallocZeroRc(pDb->pEnv, sizeof(Pgno) * pLevel->nRight, &rc); 4534 4535 /* Create a multi-cursor to read the data to write to the new 4536 ** segment. The new segment contains: 4537 ** 4538 ** 1. Records from LHS of each of the nMerge levels being merged. 4539 ** 2. Separators from either the last level being merged, or the 4540 ** separators attached to the LHS of the following level, or neither. 4541 ** 4542 ** If the new level is the lowest (oldest) in the db, discard any 4543 ** delete keys. Key annihilation. 
4544 */ 4545 pCsr = multiCursorNew(pDb, &rc); 4546 if( pCsr ){ 4547 pCsr->flags |= CURSOR_NEXT_OK; 4548 rc = multiCursorAddRhs(pCsr, pLevel); 4549 } 4550 if( rc==LSM_OK && pMerge->nInput > pLevel->nRight ){ 4551 rc = btreeCursorNew(pDb, &pNext->lhs, &pCsr->pBtCsr); 4552 }else if( pNext ){ 4553 multiCursorReadSeparators(pCsr); 4554 }else{ 4555 multiCursorIgnoreDelete(pCsr); 4556 } 4557 4558 assert( rc!=LSM_OK || pMerge->nInput==(pCsr->nPtr+(pCsr->pBtCsr!=0)) ); 4559 pMW->pCsr = pCsr; 4560 4561 /* Load the b-tree hierarchy into memory. */ 4562 if( rc==LSM_OK ) rc = mergeWorkerLoadHierarchy(pMW); 4563 if( rc==LSM_OK && pMW->hier.nHier==0 ){ 4564 pMW->aSave[0].iPgno = pLevel->lhs.iFirst; 4565 } 4566 4567 /* Position the cursor. */ 4568 if( rc==LSM_OK ){ 4569 pCsr->pPrevMergePtr = &pMerge->iCurrentPtr; 4570 if( pLevel->lhs.iFirst==0 ){ 4571 /* The output array is still empty. So position the cursor at the very 4572 ** start of the input. */ 4573 rc = multiCursorEnd(pCsr, 0); 4574 }else{ 4575 /* The output array is non-empty. Position the cursor based on the 4576 ** page/cell data saved in the Merge.aInput[] array. 
*/ 4577 int i; 4578 for(i=0; rc==LSM_OK && i<pCsr->nPtr; i++){ 4579 MergeInput *pInput = &pMerge->aInput[i]; 4580 if( pInput->iPg ){ 4581 SegmentPtr *pPtr; 4582 assert( pCsr->aPtr[i].pPg==0 ); 4583 pPtr = &pCsr->aPtr[i]; 4584 rc = segmentPtrLoadPage(pDb->pFS, pPtr, (int)pInput->iPg); 4585 if( rc==LSM_OK && pPtr->nCell>0 ){ 4586 rc = segmentPtrLoadCell(pPtr, pInput->iCell); 4587 } 4588 } 4589 } 4590 4591 if( rc==LSM_OK && pCsr->pBtCsr ){ 4592 int (*xCmp)(void *, int, void *, int) = pCsr->pDb->xCmp; 4593 assert( i==pCsr->nPtr ); 4594 rc = btreeCursorRestore(pCsr->pBtCsr, xCmp, &pMerge->aInput[i]); 4595 } 4596 4597 if( rc==LSM_OK ){ 4598 rc = multiCursorSetupTree(pCsr, 0); 4599 } 4600 } 4601 pCsr->flags |= CURSOR_NEXT_OK; 4602 } 4603 4604 return rc; 4605 } 4606 4607 static int sortedBtreeGobble( 4608 lsm_db *pDb, /* Worker connection */ 4609 MultiCursor *pCsr, /* Multi-cursor being used for a merge */ 4610 int iGobble /* pCsr->aPtr[] entry to operate on */ 4611 ){ 4612 int rc = LSM_OK; 4613 if( rtTopic(pCsr->eType)==0 ){ 4614 Segment *pSeg = pCsr->aPtr[iGobble].pSeg; 4615 Pgno *aPg; 4616 int nPg; 4617 4618 /* Seek from the root of the b-tree to the segment leaf that may contain 4619 ** a key equal to the one multi-cursor currently points to. Record the 4620 ** page number of each b-tree page and the leaf. The segment may be 4621 ** gobbled up to (but not including) the first of these page numbers. 4622 */ 4623 assert( pSeg->iRoot>0 ); 4624 aPg = lsmMallocZeroRc(pDb->pEnv, sizeof(Pgno)*32, &rc); 4625 if( rc==LSM_OK ){ 4626 rc = seekInBtree(pCsr, pSeg, 4627 rtTopic(pCsr->eType), pCsr->key.pData, pCsr->key.nData, aPg, 0 4628 ); 4629 } 4630 4631 if( rc==LSM_OK ){ 4632 for(nPg=0; aPg[nPg]; nPg++); 4633 lsmFsGobble(pDb, pSeg, aPg, nPg); 4634 } 4635 4636 lsmFree(pDb->pEnv, aPg); 4637 } 4638 return rc; 4639 } 4640 4641 /* 4642 ** Argument p points to a level of age N. Return the number of levels in 4643 ** the linked list starting at p that have age=N (always at least 1). 
4644 */ 4645 static int sortedCountLevels(Level *p){ 4646 int iAge = p->iAge; 4647 int nRet = 0; 4648 do { 4649 nRet++; 4650 p = p->pNext; 4651 }while( p && p->iAge==iAge ); 4652 return nRet; 4653 } 4654 4655 static int sortedSelectLevel(lsm_db *pDb, int nMerge, Level **ppOut){ 4656 Level *pTopLevel = lsmDbSnapshotLevel(pDb->pWorker); 4657 int rc = LSM_OK; 4658 Level *pLevel = 0; /* Output value */ 4659 Level *pBest = 0; /* Best level to work on found so far */ 4660 int nBest; /* Number of segments merged at pBest */ 4661 Level *pThis = 0; /* First in run of levels with age=iAge */ 4662 int nThis = 0; /* Number of levels starting at pThis */ 4663 4664 assert( nMerge>=1 ); 4665 nBest = LSM_MAX(1, nMerge-1); 4666 4667 /* Find the longest contiguous run of levels not currently undergoing a 4668 ** merge with the same age in the structure. Or the level being merged 4669 ** with the largest number of right-hand segments. Work on it. */ 4670 for(pLevel=pTopLevel; pLevel; pLevel=pLevel->pNext){ 4671 if( pLevel->nRight==0 && pThis && pLevel->iAge==pThis->iAge ){ 4672 nThis++; 4673 }else{ 4674 if( nThis>nBest ){ 4675 if( (pLevel->iAge!=pThis->iAge+1) 4676 || (pLevel->nRight==0 && sortedCountLevels(pLevel)<=pDb->nMerge) 4677 ){ 4678 pBest = pThis; 4679 nBest = nThis; 4680 } 4681 } 4682 if( pLevel->nRight ){ 4683 if( pLevel->nRight>nBest ){ 4684 nBest = pLevel->nRight; 4685 pBest = pLevel; 4686 } 4687 nThis = 0; 4688 pThis = 0; 4689 }else{ 4690 pThis = pLevel; 4691 nThis = 1; 4692 } 4693 } 4694 } 4695 if( nThis>nBest ){ 4696 assert( pThis ); 4697 pBest = pThis; 4698 nBest = nThis; 4699 } 4700 4701 if( pBest==0 && nMerge==1 ){ 4702 int nFree = 0; 4703 int nUsr = 0; 4704 for(pLevel=pTopLevel; pLevel; pLevel=pLevel->pNext){ 4705 assert( !pLevel->nRight ); 4706 if( pLevel->flags & LEVEL_FREELIST_ONLY ){ 4707 nFree++; 4708 }else{ 4709 nUsr++; 4710 } 4711 } 4712 if( nUsr>1 ){ 4713 pBest = pTopLevel; 4714 nBest = nFree + nUsr; 4715 } 4716 } 4717 4718 if( pBest ){ 4719 if( 
pBest->nRight==0 ){ 4720 rc = sortedMergeSetup(pDb, pBest, nBest, ppOut); 4721 }else{ 4722 *ppOut = pBest; 4723 } 4724 } 4725 4726 return rc; 4727 } 4728 4729 static int sortedDbIsFull(lsm_db *pDb){ 4730 Level *pTop = lsmDbSnapshotLevel(pDb->pWorker); 4731 4732 if( lsmDatabaseFull(pDb) ) return 1; 4733 if( pTop && pTop->iAge==0 4734 && (pTop->nRight || sortedCountLevels(pTop)>=pDb->nMerge) 4735 ){ 4736 return 1; 4737 } 4738 return 0; 4739 } 4740 4741 typedef struct MoveBlockCtx MoveBlockCtx; 4742 struct MoveBlockCtx { 4743 int iSeen; /* Previous free block on list */ 4744 int iFrom; /* Total number of blocks in file */ 4745 }; 4746 4747 static int moveBlockCb(void *pCtx, int iBlk, i64 iSnapshot){ 4748 MoveBlockCtx *p = (MoveBlockCtx *)pCtx; 4749 assert( p->iFrom==0 ); 4750 if( iBlk==(p->iSeen-1) ){ 4751 p->iSeen = iBlk; 4752 return 0; 4753 } 4754 p->iFrom = p->iSeen-1; 4755 return 1; 4756 } 4757 4758 /* 4759 ** This function is called to further compact a database for which all 4760 ** of the content has already been merged into a single segment. If 4761 ** possible, it moves the contents of a single block from the end of the 4762 ** file to a free-block that lies closer to the start of the file (allowing 4763 ** the file to be eventually truncated). 4764 */ 4765 static int sortedMoveBlock(lsm_db *pDb, int *pnWrite){ 4766 Snapshot *p = pDb->pWorker; 4767 Level *pLvl = lsmDbSnapshotLevel(p); 4768 int iFrom; /* Block to move */ 4769 int iTo; /* Destination to move block to */ 4770 int rc; /* Return code */ 4771 4772 MoveBlockCtx sCtx; 4773 4774 assert( pLvl->pNext==0 && pLvl->nRight==0 ); 4775 assert( p->redirect.n<=LSM_MAX_BLOCK_REDIRECTS ); 4776 4777 *pnWrite = 0; 4778 4779 /* Check that the redirect array is not already full. If it is, return 4780 ** without moving any database content. */ 4781 if( p->redirect.n>=LSM_MAX_BLOCK_REDIRECTS ) return LSM_OK; 4782 4783 /* Find the last block of content in the database file. 
Do this by 4784 ** traversing the free-list in reverse (descending block number) order. 4785 ** The first block not on the free list is the one that will be moved. 4786 ** Since the db consists of a single segment, there is no ambiguity as 4787 ** to which segment the block belongs to. */ 4788 sCtx.iSeen = p->nBlock+1; 4789 sCtx.iFrom = 0; 4790 rc = lsmWalkFreelist(pDb, 1, moveBlockCb, &sCtx); 4791 if( rc!=LSM_OK || sCtx.iFrom==0 ) return rc; 4792 iFrom = sCtx.iFrom; 4793 4794 /* Find the first free block in the database, ignoring block 1. Block 4795 ** 1 is tricky as it is smaller than the other blocks. */ 4796 rc = lsmBlockAllocate(pDb, iFrom, &iTo); 4797 if( rc!=LSM_OK || iTo==0 ) return rc; 4798 assert( iTo!=1 && iTo<iFrom ); 4799 4800 rc = lsmFsMoveBlock(pDb->pFS, &pLvl->lhs, iTo, iFrom); 4801 if( rc==LSM_OK ){ 4802 if( p->redirect.a==0 ){ 4803 int nByte = sizeof(struct RedirectEntry) * LSM_MAX_BLOCK_REDIRECTS; 4804 p->redirect.a = lsmMallocZeroRc(pDb->pEnv, nByte, &rc); 4805 } 4806 if( rc==LSM_OK ){ 4807 4808 /* Check if the block just moved was already redirected. */ 4809 int i; 4810 for(i=0; i<p->redirect.n; i++){ 4811 if( p->redirect.a[i].iTo==iFrom ) break; 4812 } 4813 4814 if( i==p->redirect.n ){ 4815 /* Block iFrom was not already redirected. Add a new array entry. */ 4816 memmove(&p->redirect.a[1], &p->redirect.a[0], 4817 sizeof(struct RedirectEntry) * p->redirect.n 4818 ); 4819 p->redirect.a[0].iFrom = iFrom; 4820 p->redirect.a[0].iTo = iTo; 4821 p->redirect.n++; 4822 }else{ 4823 /* Block iFrom was already redirected. Overwrite existing entry. 
*/ 4824 p->redirect.a[i].iTo = iTo; 4825 } 4826 4827 rc = lsmBlockFree(pDb, iFrom); 4828 4829 *pnWrite = lsmFsBlockSize(pDb->pFS) / lsmFsPageSize(pDb->pFS); 4830 pLvl->lhs.pRedirect = &p->redirect; 4831 } 4832 } 4833 4834 #if LSM_LOG_STRUCTURE 4835 if( rc==LSM_OK ){ 4836 char aBuf[64]; 4837 sprintf(aBuf, "move-block %d/%d", p->redirect.n-1, LSM_MAX_BLOCK_REDIRECTS); 4838 lsmSortedDumpStructure(pDb, pDb->pWorker, LSM_LOG_DATA, 0, aBuf); 4839 } 4840 #endif 4841 return rc; 4842 } 4843 4844 /* 4845 */ 4846 static int mergeInsertFreelistSegments( 4847 lsm_db *pDb, 4848 int nFree, 4849 MergeWorker *pMW 4850 ){ 4851 int rc = LSM_OK; 4852 if( nFree>0 ){ 4853 MultiCursor *pCsr = pMW->pCsr; 4854 Level *pLvl = pMW->pLevel; 4855 SegmentPtr *aNew1; 4856 Segment *aNew2; 4857 4858 Level *pIter; 4859 Level *pNext; 4860 int i = 0; 4861 4862 aNew1 = (SegmentPtr *)lsmMallocZeroRc( 4863 pDb->pEnv, sizeof(SegmentPtr) * (pCsr->nPtr+nFree), &rc 4864 ); 4865 if( rc ) return rc; 4866 memcpy(&aNew1[nFree], pCsr->aPtr, sizeof(SegmentPtr)*pCsr->nPtr); 4867 pCsr->nPtr += nFree; 4868 lsmFree(pDb->pEnv, pCsr->aTree); 4869 lsmFree(pDb->pEnv, pCsr->aPtr); 4870 pCsr->aTree = 0; 4871 pCsr->aPtr = aNew1; 4872 4873 aNew2 = (Segment *)lsmMallocZeroRc( 4874 pDb->pEnv, sizeof(Segment) * (pLvl->nRight+nFree), &rc 4875 ); 4876 if( rc ) return rc; 4877 memcpy(&aNew2[nFree], pLvl->aRhs, sizeof(Segment)*pLvl->nRight); 4878 pLvl->nRight += nFree; 4879 lsmFree(pDb->pEnv, pLvl->aRhs); 4880 pLvl->aRhs = aNew2; 4881 4882 for(pIter=pDb->pWorker->pLevel; rc==LSM_OK && pIter!=pLvl; pIter=pNext){ 4883 Segment *pSeg = &pLvl->aRhs[i]; 4884 memcpy(pSeg, &pIter->lhs, sizeof(Segment)); 4885 4886 pCsr->aPtr[i].pSeg = pSeg; 4887 pCsr->aPtr[i].pLevel = pLvl; 4888 rc = segmentPtrEnd(pCsr, &pCsr->aPtr[i], 0); 4889 4890 pDb->pWorker->pLevel = pNext = pIter->pNext; 4891 sortedFreeLevel(pDb->pEnv, pIter); 4892 i++; 4893 } 4894 assert( i==nFree ); 4895 assert( rc!=LSM_OK || pDb->pWorker->pLevel==pLvl ); 4896 4897 for(i=nFree; 
i<pCsr->nPtr; i++){ 4898 pCsr->aPtr[i].pSeg = &pLvl->aRhs[i]; 4899 } 4900 4901 lsmFree(pDb->pEnv, pMW->aGobble); 4902 pMW->aGobble = 0; 4903 } 4904 return rc; 4905 } 4906 4907 static int sortedWork( 4908 lsm_db *pDb, /* Database handle. Must be worker. */ 4909 int nWork, /* Number of pages of work to do */ 4910 int nMerge, /* Try to merge this many levels at once */ 4911 int bFlush, /* Set if call is to make room for a flush */ 4912 int *pnWrite /* OUT: Actual number of pages written */ 4913 ){ 4914 int rc = LSM_OK; /* Return Code */ 4915 int nRemaining = nWork; /* Units of work to do before returning */ 4916 Snapshot *pWorker = pDb->pWorker; 4917 4918 assert( pWorker ); 4919 if( lsmDbSnapshotLevel(pWorker)==0 ) return LSM_OK; 4920 4921 while( nRemaining>0 ){ 4922 Level *pLevel = 0; 4923 4924 /* Find a level to work on. */ 4925 rc = sortedSelectLevel(pDb, nMerge, &pLevel); 4926 assert( rc==LSM_OK || pLevel==0 ); 4927 4928 if( pLevel==0 ){ 4929 int nDone = 0; 4930 Level *pTopLevel = lsmDbSnapshotLevel(pDb->pWorker); 4931 if( bFlush==0 && nMerge==1 && pTopLevel && pTopLevel->pNext==0 ){ 4932 rc = sortedMoveBlock(pDb, &nDone); 4933 } 4934 nRemaining -= nDone; 4935 4936 /* Could not find any work to do. Finished. */ 4937 if( nDone==0 ) break; 4938 }else{ 4939 int bSave = 0; 4940 Freelist freelist = {0, 0, 0}; 4941 MergeWorker mergeworker; /* State used to work on the level merge */ 4942 4943 assert( pDb->bIncrMerge==0 ); 4944 assert( pDb->pFreelist==0 && pDb->bUseFreelist==0 ); 4945 4946 pDb->bIncrMerge = 1; 4947 rc = mergeWorkerInit(pDb, pLevel, &mergeworker); 4948 assert( mergeworker.nWork==0 ); 4949 4950 while( rc==LSM_OK 4951 && 0==mergeWorkerDone(&mergeworker) 4952 && (mergeworker.nWork<nRemaining || pDb->bUseFreelist) 4953 ){ 4954 int eType = rtTopic(mergeworker.pCsr->eType); 4955 rc = mergeWorkerStep(&mergeworker); 4956 4957 /* If the cursor now points at the first entry past the end of the 4958 ** user data (i.e. 
either to EOF or to the first free-list entry 4959 ** that will be added to the run), then check if it is possible to 4960 ** merge in any free-list entries that are either in-memory or in 4961 ** free-list-only blocks. */ 4962 if( rc==LSM_OK && nMerge==1 && eType==0 4963 && (rtTopic(mergeworker.pCsr->eType) || mergeWorkerDone(&mergeworker)) 4964 ){ 4965 int nFree = 0; /* Number of free-list-only levels to merge */ 4966 Level *pLvl; 4967 assert( pDb->pFreelist==0 && pDb->bUseFreelist==0 ); 4968 4969 /* Now check if all levels containing data newer than this one 4970 ** are single-segment free-list only levels. If so, they will be 4971 ** merged in now. */ 4972 for(pLvl=pDb->pWorker->pLevel; 4973 pLvl!=mergeworker.pLevel && (pLvl->flags & LEVEL_FREELIST_ONLY); 4974 pLvl=pLvl->pNext 4975 ){ 4976 assert( pLvl->nRight==0 ); 4977 nFree++; 4978 } 4979 if( pLvl==mergeworker.pLevel ){ 4980 4981 rc = mergeInsertFreelistSegments(pDb, nFree, &mergeworker); 4982 if( rc==LSM_OK ){ 4983 rc = multiCursorVisitFreelist(mergeworker.pCsr); 4984 } 4985 if( rc==LSM_OK ){ 4986 rc = multiCursorSetupTree(mergeworker.pCsr, 0); 4987 pDb->pFreelist = &freelist; 4988 pDb->bUseFreelist = 1; 4989 } 4990 } 4991 } 4992 } 4993 nRemaining -= LSM_MAX(mergeworker.nWork, 1); 4994 4995 if( rc==LSM_OK ){ 4996 /* Check if the merge operation is completely finished. If not, 4997 ** gobble up (declare eligible for recycling) any pages from rhs 4998 ** segments for which the content has been completely merged into 4999 ** the lhs of the level. 
*/ 5000 if( mergeWorkerDone(&mergeworker)==0 ){ 5001 int i; 5002 for(i=0; i<pLevel->nRight; i++){ 5003 SegmentPtr *pGobble = &mergeworker.pCsr->aPtr[i]; 5004 if( pGobble->pSeg->iRoot ){ 5005 rc = sortedBtreeGobble(pDb, mergeworker.pCsr, i); 5006 }else if( mergeworker.aGobble[i] ){ 5007 lsmFsGobble(pDb, pGobble->pSeg, &mergeworker.aGobble[i], 1); 5008 } 5009 } 5010 }else{ 5011 int i; 5012 int bEmpty; 5013 mergeWorkerShutdown(&mergeworker, &rc); 5014 bEmpty = (pLevel->lhs.iFirst==0); 5015 5016 if( bEmpty==0 && rc==LSM_OK ){ 5017 rc = lsmFsSortedFinish(pDb->pFS, &pLevel->lhs); 5018 } 5019 5020 if( pDb->bUseFreelist ){ 5021 Freelist *p = &pDb->pWorker->freelist; 5022 lsmFree(pDb->pEnv, p->aEntry); 5023 memcpy(p, &freelist, sizeof(freelist)); 5024 pDb->bUseFreelist = 0; 5025 pDb->pFreelist = 0; 5026 bSave = 1; 5027 } 5028 5029 for(i=0; i<pLevel->nRight; i++){ 5030 lsmFsSortedDelete(pDb->pFS, pWorker, 1, &pLevel->aRhs[i]); 5031 } 5032 5033 if( bEmpty ){ 5034 /* If the new level is completely empty, remove it from the 5035 ** database snapshot. This can only happen if all input keys were 5036 ** annihilated. Since keys are only annihilated if the new level 5037 ** is the last in the linked list (contains the most ancient of 5038 ** database content), this guarantees that pLevel->pNext==0. */ 5039 Level *pTop; /* Top level of worker snapshot */ 5040 Level **pp; /* Read/write iterator for Level.pNext list */ 5041 5042 assert( pLevel->pNext==0 ); 5043 5044 /* Remove the level from the worker snapshot. */ 5045 pTop = lsmDbSnapshotLevel(pWorker); 5046 for(pp=&pTop; *pp!=pLevel; pp=&((*pp)->pNext)); 5047 *pp = pLevel->pNext; 5048 lsmDbSnapshotSetLevel(pWorker, pTop); 5049 5050 /* Free the Level structure. */ 5051 sortedFreeLevel(pDb->pEnv, pLevel); 5052 }else{ 5053 5054 /* Free the separators of the next level, if required. 
*/ 5055 if( pLevel->pMerge->nInput > pLevel->nRight ){ 5056 assert( pLevel->pNext->lhs.iRoot ); 5057 pLevel->pNext->lhs.iRoot = 0; 5058 } 5059 5060 /* Zero the right-hand-side of pLevel */ 5061 lsmFree(pDb->pEnv, pLevel->aRhs); 5062 pLevel->nRight = 0; 5063 pLevel->aRhs = 0; 5064 5065 /* Free the Merge object */ 5066 lsmFree(pDb->pEnv, pLevel->pMerge); 5067 pLevel->pMerge = 0; 5068 } 5069 5070 if( bSave && rc==LSM_OK ){ 5071 pDb->bIncrMerge = 0; 5072 rc = lsmSaveWorker(pDb, 0); 5073 } 5074 } 5075 } 5076 5077 /* Clean up the MergeWorker object initialized above. If no error 5078 ** has occurred, invoke the work-hook to inform the application that 5079 ** the database structure has changed. */ 5080 mergeWorkerShutdown(&mergeworker, &rc); 5081 pDb->bIncrMerge = 0; 5082 if( rc==LSM_OK ) sortedInvokeWorkHook(pDb); 5083 5084 #if LSM_LOG_STRUCTURE 5085 lsmSortedDumpStructure(pDb, pDb->pWorker, LSM_LOG_DATA, 0, "work"); 5086 #endif 5087 assertBtreeOk(pDb, &pLevel->lhs); 5088 assertRunInOrder(pDb, &pLevel->lhs); 5089 5090 /* If bFlush is true and the database is no longer considered "full", 5091 ** break out of the loop even if nRemaining is still greater than 5092 ** zero. The caller has an in-memory tree to flush to disk. */ 5093 if( bFlush && sortedDbIsFull(pDb)==0 ) break; 5094 } 5095 } 5096 5097 if( pnWrite ) *pnWrite = (nWork - nRemaining); 5098 pWorker->nWrite += (nWork - nRemaining); 5099 5100 #ifdef LSM_LOG_WORK 5101 lsmLogMessage(pDb, rc, "sortedWork(): %d pages", (nWork-nRemaining)); 5102 #endif 5103 return rc; 5104 } 5105 5106 /* 5107 ** The database connection passed as the first argument must be a worker 5108 ** connection. This function checks if there exists an "old" in-memory tree 5109 ** ready to be flushed to disk. If so, true is returned. Otherwise false. 5110 ** 5111 ** If an error occurs, *pRc is set to an LSM error code before returning. 5112 ** It is assumed that *pRc is set to LSM_OK when this function is called. 
5113 */ 5114 static int sortedTreeHasOld(lsm_db *pDb, int *pRc){ 5115 int rc = LSM_OK; 5116 int bRet = 0; 5117 5118 assert( pDb->pWorker ); 5119 if( *pRc==LSM_OK ){ 5120 if( rc==LSM_OK 5121 && pDb->treehdr.iOldShmid 5122 && pDb->treehdr.iOldLog!=pDb->pWorker->iLogOff 5123 ){ 5124 bRet = 1; 5125 }else{ 5126 bRet = 0; 5127 } 5128 *pRc = rc; 5129 } 5130 assert( *pRc==LSM_OK || bRet==0 ); 5131 return bRet; 5132 } 5133 5134 /* 5135 ** Create a new free-list only top-level segment. Return LSM_OK if successful 5136 ** or an LSM error code if some error occurs. 5137 */ 5138 static int sortedNewFreelistOnly(lsm_db *pDb){ 5139 return sortedNewToplevel(pDb, TREE_NONE, 0); 5140 } 5141 5142 int lsmSaveWorker(lsm_db *pDb, int bFlush){ 5143 Snapshot *p = pDb->pWorker; 5144 if( p->freelist.nEntry>pDb->nMaxFreelist ){ 5145 int rc = sortedNewFreelistOnly(pDb); 5146 if( rc!=LSM_OK ) return rc; 5147 } 5148 return lsmCheckpointSaveWorker(pDb, bFlush); 5149 } 5150 5151 static int doLsmSingleWork( 5152 lsm_db *pDb, 5153 int bShutdown, 5154 int nMerge, /* Minimum segments to merge together */ 5155 int nPage, /* Number of pages to write to disk */ 5156 int *pnWrite, /* OUT: Pages actually written to disk */ 5157 int *pbCkpt /* OUT: True if an auto-checkpoint is req. */ 5158 ){ 5159 Snapshot *pWorker; /* Worker snapshot */ 5160 int rc = LSM_OK; /* Return code */ 5161 int bDirty = 0; 5162 int nMax = nPage; /* Maximum pages to write to disk */ 5163 int nRem = nPage; 5164 int bCkpt = 0; 5165 5166 assert( nPage>0 ); 5167 5168 /* Open the worker 'transaction'. It will be closed before this function 5169 ** returns. */ 5170 assert( pDb->pWorker==0 ); 5171 rc = lsmBeginWork(pDb); 5172 if( rc!=LSM_OK ) return rc; 5173 pWorker = pDb->pWorker; 5174 5175 /* If this connection is doing auto-checkpoints, set nMax (and nRem) so 5176 ** that this call stops writing when the auto-checkpoint is due. The 5177 ** caller will do the checkpoint, then possibly call this function again. 
*/ 5178 if( bShutdown==0 && pDb->nAutockpt ){ 5179 u32 nSync; 5180 u32 nUnsync; 5181 int nPgsz; 5182 5183 lsmCheckpointSynced(pDb, 0, 0, &nSync); 5184 nUnsync = lsmCheckpointNWrite(pDb->pShmhdr->aSnap1, 0); 5185 nPgsz = lsmCheckpointPgsz(pDb->pShmhdr->aSnap1); 5186 5187 nMax = (int)LSM_MIN(nMax, (pDb->nAutockpt/nPgsz) - (int)(nUnsync-nSync)); 5188 if( nMax<nRem ){ 5189 bCkpt = 1; 5190 nRem = LSM_MAX(nMax, 0); 5191 } 5192 } 5193 5194 /* If there exists in-memory data ready to be flushed to disk, attempt 5195 ** to flush it now. */ 5196 if( pDb->nTransOpen==0 ){ 5197 rc = lsmTreeLoadHeader(pDb, 0); 5198 } 5199 if( sortedTreeHasOld(pDb, &rc) ){ 5200 /* sortedDbIsFull() returns non-zero if either (a) there are too many 5201 ** levels in total in the db, or (b) there are too many levels with the 5202 ** the same age in the db. Either way, call sortedWork() to merge 5203 ** existing segments together until this condition is cleared. */ 5204 if( sortedDbIsFull(pDb) ){ 5205 int nPg = 0; 5206 rc = sortedWork(pDb, nRem, nMerge, 1, &nPg); 5207 nRem -= nPg; 5208 assert( rc!=LSM_OK || nRem<=0 || !sortedDbIsFull(pDb) ); 5209 bDirty = 1; 5210 } 5211 5212 if( rc==LSM_OK && nRem>0 ){ 5213 int nPg = 0; 5214 rc = sortedNewToplevel(pDb, TREE_OLD, &nPg); 5215 nRem -= nPg; 5216 if( rc==LSM_OK ){ 5217 if( pDb->nTransOpen>0 ){ 5218 lsmTreeDiscardOld(pDb); 5219 } 5220 rc = lsmSaveWorker(pDb, 1); 5221 bDirty = 0; 5222 } 5223 } 5224 } 5225 5226 /* If nPage is still greater than zero, do some merging. */ 5227 if( rc==LSM_OK && nRem>0 && bShutdown==0 ){ 5228 int nPg = 0; 5229 rc = sortedWork(pDb, nRem, nMerge, 0, &nPg); 5230 nRem -= nPg; 5231 if( nPg ) bDirty = 1; 5232 } 5233 5234 /* If the in-memory part of the free-list is too large, write a new 5235 ** top-level containing just the in-memory free-list entries to disk. 
*/ 5236 if( rc==LSM_OK && pDb->pWorker->freelist.nEntry > pDb->nMaxFreelist ){ 5237 int nPg = 0; 5238 while( rc==LSM_OK && lsmDatabaseFull(pDb) ){ 5239 rc = sortedWork(pDb, 16, nMerge, 1, &nPg); 5240 nRem -= nPg; 5241 } 5242 if( rc==LSM_OK ){ 5243 rc = sortedNewFreelistOnly(pDb); 5244 } 5245 nRem -= nPg; 5246 if( nPg ) bDirty = 1; 5247 } 5248 5249 if( rc==LSM_OK ){ 5250 *pnWrite = (nMax - nRem); 5251 *pbCkpt = (bCkpt && nRem<=0); 5252 if( nMerge==1 && pDb->nAutockpt>0 && *pnWrite>0 5253 && pWorker->pLevel 5254 && pWorker->pLevel->nRight==0 5255 && pWorker->pLevel->pNext==0 5256 ){ 5257 *pbCkpt = 1; 5258 } 5259 } 5260 5261 if( rc==LSM_OK && bDirty ){ 5262 lsmFinishWork(pDb, 0, &rc); 5263 }else{ 5264 int rcdummy = LSM_BUSY; 5265 lsmFinishWork(pDb, 0, &rcdummy); 5266 *pnWrite = 0; 5267 } 5268 assert( pDb->pWorker==0 ); 5269 return rc; 5270 } 5271 5272 static int doLsmWork(lsm_db *pDb, int nMerge, int nPage, int *pnWrite){ 5273 int rc = LSM_OK; /* Return code */ 5274 int nWrite = 0; /* Number of pages written */ 5275 5276 assert( nMerge>=1 ); 5277 5278 if( nPage!=0 ){ 5279 int bCkpt = 0; 5280 do { 5281 int nThis = 0; 5282 int nReq = (nPage>=0) ? (nPage-nWrite) : ((int)0x7FFFFFFF); 5283 5284 bCkpt = 0; 5285 rc = doLsmSingleWork(pDb, 0, nMerge, nReq, &nThis, &bCkpt); 5286 nWrite += nThis; 5287 if( rc==LSM_OK && bCkpt ){ 5288 rc = lsm_checkpoint(pDb, 0); 5289 } 5290 }while( rc==LSM_OK && bCkpt && (nWrite<nPage || nPage<0) ); 5291 } 5292 5293 if( pnWrite ){ 5294 if( rc==LSM_OK ){ 5295 *pnWrite = nWrite; 5296 }else{ 5297 *pnWrite = 0; 5298 } 5299 } 5300 return rc; 5301 } 5302 5303 /* 5304 ** Perform work to merge database segments together. 
5305 */ 5306 int lsm_work(lsm_db *pDb, int nMerge, int nKB, int *pnWrite){ 5307 int rc; /* Return code */ 5308 int nPgsz; /* Nominal page size in bytes */ 5309 int nPage; /* Equivalent of nKB in pages */ 5310 int nWrite = 0; /* Number of pages written */ 5311 5312 /* This function may not be called if pDb has an open read or write 5313 ** transaction. Return LSM_MISUSE if an application attempts this. */ 5314 if( pDb->nTransOpen || pDb->pCsr ) return LSM_MISUSE_BKPT; 5315 if( nMerge<=0 ) nMerge = pDb->nMerge; 5316 5317 lsmFsPurgeCache(pDb->pFS); 5318 5319 /* Convert from KB to pages */ 5320 nPgsz = lsmFsPageSize(pDb->pFS); 5321 if( nKB>=0 ){ 5322 nPage = ((i64)nKB * 1024 + nPgsz - 1) / nPgsz; 5323 }else{ 5324 nPage = -1; 5325 } 5326 5327 rc = doLsmWork(pDb, nMerge, nPage, &nWrite); 5328 5329 if( pnWrite ){ 5330 /* Convert back from pages to KB */ 5331 *pnWrite = (int)(((i64)nWrite * 1024 + nPgsz - 1) / nPgsz); 5332 } 5333 return rc; 5334 } 5335 5336 int lsm_flush(lsm_db *db){ 5337 int rc; 5338 5339 if( db->nTransOpen>0 || db->pCsr ){ 5340 rc = LSM_MISUSE_BKPT; 5341 }else{ 5342 rc = lsmBeginWriteTrans(db); 5343 if( rc==LSM_OK ){ 5344 lsmFlushTreeToDisk(db); 5345 lsmTreeDiscardOld(db); 5346 lsmTreeMakeOld(db); 5347 lsmTreeDiscardOld(db); 5348 } 5349 5350 if( rc==LSM_OK ){ 5351 rc = lsmFinishWriteTrans(db, 1); 5352 }else{ 5353 lsmFinishWriteTrans(db, 0); 5354 } 5355 lsmFinishReadTrans(db); 5356 } 5357 5358 return rc; 5359 } 5360 5361 /* 5362 ** This function is called in auto-work mode to perform merging work on 5363 ** the data structure. It performs enough merging work to prevent the 5364 ** height of the tree from growing indefinitely assuming that roughly 5365 ** nUnit database pages worth of data have been written to the database 5366 ** (i.e. the in-memory tree) since the last call. 
5367 */ 5368 int lsmSortedAutoWork( 5369 lsm_db *pDb, /* Database handle */ 5370 int nUnit /* Pages of data written to in-memory tree */ 5371 ){ 5372 int rc = LSM_OK; /* Return code */ 5373 int nDepth = 0; /* Current height of tree (longest path) */ 5374 Level *pLevel; /* Used to iterate through levels */ 5375 int bRestore = 0; 5376 5377 assert( pDb->pWorker==0 ); 5378 assert( pDb->nTransOpen>0 ); 5379 5380 /* Determine how many units of work to do before returning. One unit of 5381 ** work is achieved by writing one page (~4KB) of merged data. */ 5382 for(pLevel=lsmDbSnapshotLevel(pDb->pClient); pLevel; pLevel=pLevel->pNext){ 5383 /* nDepth += LSM_MAX(1, pLevel->nRight); */ 5384 nDepth += 1; 5385 } 5386 if( lsmTreeHasOld(pDb) ){ 5387 nDepth += 1; 5388 bRestore = 1; 5389 rc = lsmSaveCursors(pDb); 5390 if( rc!=LSM_OK ) return rc; 5391 } 5392 5393 if( nDepth>0 ){ 5394 int nRemaining; /* Units of work to do before returning */ 5395 5396 nRemaining = nUnit * nDepth; 5397 #ifdef LSM_LOG_WORK 5398 lsmLogMessage(pDb, rc, "lsmSortedAutoWork(): %d*%d = %d pages", 5399 nUnit, nDepth, nRemaining); 5400 #endif 5401 assert( nRemaining>=0 ); 5402 rc = doLsmWork(pDb, pDb->nMerge, nRemaining, 0); 5403 if( rc==LSM_BUSY ) rc = LSM_OK; 5404 5405 if( bRestore && pDb->pCsr ){ 5406 lsmMCursorFreeCache(pDb); 5407 lsmFreeSnapshot(pDb->pEnv, pDb->pClient); 5408 pDb->pClient = 0; 5409 if( rc==LSM_OK ){ 5410 rc = lsmCheckpointLoad(pDb, 0); 5411 } 5412 if( rc==LSM_OK ){ 5413 rc = lsmCheckpointDeserialize(pDb, 0, pDb->aSnapshot, &pDb->pClient); 5414 } 5415 if( rc==LSM_OK ){ 5416 rc = lsmRestoreCursors(pDb); 5417 } 5418 } 5419 } 5420 5421 return rc; 5422 } 5423 5424 /* 5425 ** This function is only called during system shutdown. The contents of 5426 ** any in-memory trees present (old or current) are written out to disk. 
5427 */ 5428 int lsmFlushTreeToDisk(lsm_db *pDb){ 5429 int rc; 5430 5431 rc = lsmBeginWork(pDb); 5432 while( rc==LSM_OK && sortedDbIsFull(pDb) ){ 5433 rc = sortedWork(pDb, 256, pDb->nMerge, 1, 0); 5434 } 5435 5436 if( rc==LSM_OK ){ 5437 rc = sortedNewToplevel(pDb, TREE_BOTH, 0); 5438 } 5439 5440 lsmFinishWork(pDb, 1, &rc); 5441 return rc; 5442 } 5443 5444 /* 5445 ** Return a string representation of the segment passed as the only argument. 5446 ** Space for the returned string is allocated using lsmMalloc(), and should 5447 ** be freed by the caller using lsmFree(). 5448 */ 5449 static char *segToString(lsm_env *pEnv, Segment *pSeg, int nMin){ 5450 int nSize = pSeg->nSize; 5451 Pgno iRoot = pSeg->iRoot; 5452 Pgno iFirst = pSeg->iFirst; 5453 Pgno iLast = pSeg->iLastPg; 5454 char *z; 5455 5456 char *z1; 5457 char *z2; 5458 int nPad; 5459 5460 z1 = lsmMallocPrintf(pEnv, "%d.%d", iFirst, iLast); 5461 if( iRoot ){ 5462 z2 = lsmMallocPrintf(pEnv, "root=%d", iRoot); 5463 }else{ 5464 z2 = lsmMallocPrintf(pEnv, "size=%d", nSize); 5465 } 5466 5467 nPad = nMin - 2 - strlen(z1) - 1 - strlen(z2); 5468 nPad = LSM_MAX(0, nPad); 5469 5470 if( iRoot ){ 5471 z = lsmMallocPrintf(pEnv, "/%s %*s%s\\", z1, nPad, "", z2); 5472 }else{ 5473 z = lsmMallocPrintf(pEnv, "|%s %*s%s|", z1, nPad, "", z2); 5474 } 5475 lsmFree(pEnv, z1); 5476 lsmFree(pEnv, z2); 5477 5478 return z; 5479 } 5480 5481 static int fileToString( 5482 lsm_db *pDb, /* For xMalloc() */ 5483 char *aBuf, 5484 int nBuf, 5485 int nMin, 5486 Segment *pSeg 5487 ){ 5488 int i = 0; 5489 if( pSeg ){ 5490 char *zSeg; 5491 5492 zSeg = segToString(pDb->pEnv, pSeg, nMin); 5493 snprintf(&aBuf[i], nBuf-i, "%s", zSeg); 5494 i += strlen(&aBuf[i]); 5495 lsmFree(pDb->pEnv, zSeg); 5496 5497 #ifdef LSM_LOG_FREELIST 5498 lsmInfoArrayStructure(pDb, 1, pSeg->iFirst, &zSeg); 5499 snprintf(&aBuf[i], nBuf-1, " (%s)", zSeg); 5500 i += strlen(&aBuf[i]); 5501 lsmFree(pDb->pEnv, zSeg); 5502 #endif 5503 aBuf[nBuf] = 0; 5504 }else{ 5505 aBuf[0] = '\0'; 5506 
} 5507 5508 return i; 5509 } 5510 5511 void sortedDumpPage(lsm_db *pDb, Segment *pRun, Page *pPg, int bVals){ 5512 Blob blob = {0, 0, 0}; /* Blob used for keys */ 5513 LsmString s; 5514 int i; 5515 5516 int nRec; 5517 int iPtr; 5518 int flags; 5519 u8 *aData; 5520 int nData; 5521 5522 aData = fsPageData(pPg, &nData); 5523 5524 nRec = pageGetNRec(aData, nData); 5525 iPtr = (int)pageGetPtr(aData, nData); 5526 flags = pageGetFlags(aData, nData); 5527 5528 lsmStringInit(&s, pDb->pEnv); 5529 lsmStringAppendf(&s,"nCell=%d iPtr=%d flags=%d {", nRec, iPtr, flags); 5530 if( flags&SEGMENT_BTREE_FLAG ) iPtr = 0; 5531 5532 for(i=0; i<nRec; i++){ 5533 Page *pRef = 0; /* Pointer to page iRef */ 5534 int iChar; 5535 u8 *aKey; int nKey = 0; /* Key */ 5536 u8 *aVal = 0; int nVal = 0; /* Value */ 5537 int iTopic; 5538 u8 *aCell; 5539 int iPgPtr; 5540 int eType; 5541 5542 aCell = pageGetCell(aData, nData, i); 5543 eType = *aCell++; 5544 assert( (flags & SEGMENT_BTREE_FLAG) || eType!=0 ); 5545 aCell += lsmVarintGet32(aCell, &iPgPtr); 5546 5547 if( eType==0 ){ 5548 Pgno iRef; /* Page number of referenced page */ 5549 aCell += lsmVarintGet64(aCell, &iRef); 5550 lsmFsDbPageGet(pDb->pFS, pRun, iRef, &pRef); 5551 aKey = pageGetKey(pRun, pRef, 0, &iTopic, &nKey, &blob); 5552 }else{ 5553 aCell += lsmVarintGet32(aCell, &nKey); 5554 if( rtIsWrite(eType) ) aCell += lsmVarintGet32(aCell, &nVal); 5555 sortedReadData(0, pPg, (aCell-aData), nKey+nVal, (void **)&aKey, &blob); 5556 aVal = &aKey[nKey]; 5557 iTopic = eType; 5558 } 5559 5560 lsmStringAppendf(&s, "%s%2X:", (i==0?"":" "), iTopic); 5561 for(iChar=0; iChar<nKey; iChar++){ 5562 lsmStringAppendf(&s, "%c", isalnum(aKey[iChar]) ? aKey[iChar] : '.'); 5563 } 5564 if( nVal>0 && bVals ){ 5565 lsmStringAppendf(&s, "##"); 5566 for(iChar=0; iChar<nVal; iChar++){ 5567 lsmStringAppendf(&s, "%c", isalnum(aVal[iChar]) ? 
aVal[iChar] : '.'); 5568 } 5569 } 5570 5571 lsmStringAppendf(&s, " %d", iPgPtr+iPtr); 5572 lsmFsPageRelease(pRef); 5573 } 5574 lsmStringAppend(&s, "}", 1); 5575 5576 lsmLogMessage(pDb, LSM_OK, " Page %d: %s", lsmFsPageNumber(pPg), s.z); 5577 lsmStringClear(&s); 5578 5579 sortedBlobFree(&blob); 5580 } 5581 5582 static void infoCellDump( 5583 lsm_db *pDb, /* Database handle */ 5584 Segment *pSeg, /* Segment page belongs to */ 5585 int bIndirect, /* True to follow indirect refs */ 5586 Page *pPg, 5587 int iCell, 5588 int *peType, 5589 int *piPgPtr, 5590 u8 **paKey, int *pnKey, 5591 u8 **paVal, int *pnVal, 5592 Blob *pBlob 5593 ){ 5594 u8 *aData; int nData; /* Page data */ 5595 u8 *aKey; int nKey = 0; /* Key */ 5596 u8 *aVal = 0; int nVal = 0; /* Value */ 5597 int eType; 5598 int iPgPtr; 5599 Page *pRef = 0; /* Pointer to page iRef */ 5600 u8 *aCell; 5601 5602 aData = fsPageData(pPg, &nData); 5603 5604 aCell = pageGetCell(aData, nData, iCell); 5605 eType = *aCell++; 5606 aCell += lsmVarintGet32(aCell, &iPgPtr); 5607 5608 if( eType==0 ){ 5609 int dummy; 5610 Pgno iRef; /* Page number of referenced page */ 5611 aCell += lsmVarintGet64(aCell, &iRef); 5612 if( bIndirect ){ 5613 lsmFsDbPageGet(pDb->pFS, pSeg, iRef, &pRef); 5614 pageGetKeyCopy(pDb->pEnv, pSeg, pRef, 0, &dummy, pBlob); 5615 aKey = (u8 *)pBlob->pData; 5616 nKey = pBlob->nData; 5617 lsmFsPageRelease(pRef); 5618 }else{ 5619 aKey = (u8 *)"<indirect>"; 5620 nKey = 11; 5621 } 5622 }else{ 5623 aCell += lsmVarintGet32(aCell, &nKey); 5624 if( rtIsWrite(eType) ) aCell += lsmVarintGet32(aCell, &nVal); 5625 sortedReadData(pSeg, pPg, (aCell-aData), nKey+nVal, (void **)&aKey, pBlob); 5626 aVal = &aKey[nKey]; 5627 } 5628 5629 if( peType ) *peType = eType; 5630 if( piPgPtr ) *piPgPtr = iPgPtr; 5631 if( paKey ) *paKey = aKey; 5632 if( paVal ) *paVal = aVal; 5633 if( pnKey ) *pnKey = nKey; 5634 if( pnVal ) *pnVal = nVal; 5635 } 5636 5637 static int infoAppendBlob(LsmString *pStr, int bHex, u8 *z, int n){ 5638 int iChar; 5639 
for(iChar=0; iChar<n; iChar++){ 5640 if( bHex ){ 5641 lsmStringAppendf(pStr, "%02X", z[iChar]); 5642 }else{ 5643 lsmStringAppendf(pStr, "%c", isalnum(z[iChar]) ?z[iChar] : '.'); 5644 } 5645 } 5646 return LSM_OK; 5647 } 5648 5649 #define INFO_PAGE_DUMP_DATA 0x01 5650 #define INFO_PAGE_DUMP_VALUES 0x02 5651 #define INFO_PAGE_DUMP_HEX 0x04 5652 #define INFO_PAGE_DUMP_INDIRECT 0x08 5653 5654 static int infoPageDump( 5655 lsm_db *pDb, /* Database handle */ 5656 Pgno iPg, /* Page number of page to dump */ 5657 int flags, 5658 char **pzOut /* OUT: lsmMalloc'd string */ 5659 ){ 5660 int rc = LSM_OK; /* Return code */ 5661 Page *pPg = 0; /* Handle for page iPg */ 5662 int i, j; /* Loop counters */ 5663 const int perLine = 16; /* Bytes per line in the raw hex dump */ 5664 Segment *pSeg = 0; 5665 Snapshot *pSnap; 5666 5667 int bValues = (flags & INFO_PAGE_DUMP_VALUES); 5668 int bHex = (flags & INFO_PAGE_DUMP_HEX); 5669 int bData = (flags & INFO_PAGE_DUMP_DATA); 5670 int bIndirect = (flags & INFO_PAGE_DUMP_INDIRECT); 5671 5672 *pzOut = 0; 5673 if( iPg==0 ) return LSM_ERROR; 5674 5675 assert( pDb->pClient || pDb->pWorker ); 5676 pSnap = pDb->pClient; 5677 if( pSnap==0 ) pSnap = pDb->pWorker; 5678 if( pSnap->redirect.n>0 ){ 5679 Level *pLvl; 5680 int bUse = 0; 5681 for(pLvl=pSnap->pLevel; pLvl->pNext; pLvl=pLvl->pNext); 5682 pSeg = (pLvl->nRight==0 ? &pLvl->lhs : &pLvl->aRhs[pLvl->nRight-1]); 5683 rc = lsmFsSegmentContainsPg(pDb->pFS, pSeg, iPg, &bUse); 5684 if( bUse==0 ){ 5685 pSeg = 0; 5686 } 5687 } 5688 5689 /* iPg is a real page number (not subject to redirection). So it is safe 5690 ** to pass a NULL in place of the segment pointer as the second argument 5691 ** to lsmFsDbPageGet() here. 
*/ 5692 if( rc==LSM_OK ){ 5693 rc = lsmFsDbPageGet(pDb->pFS, 0, iPg, &pPg); 5694 } 5695 5696 if( rc==LSM_OK ){ 5697 Blob blob = {0, 0, 0, 0}; 5698 int nKeyWidth = 0; 5699 LsmString str; 5700 int nRec; 5701 int iPtr; 5702 int flags2; 5703 int iCell; 5704 u8 *aData; int nData; /* Page data and size thereof */ 5705 5706 aData = fsPageData(pPg, &nData); 5707 nRec = pageGetNRec(aData, nData); 5708 iPtr = (int)pageGetPtr(aData, nData); 5709 flags2 = pageGetFlags(aData, nData); 5710 5711 lsmStringInit(&str, pDb->pEnv); 5712 lsmStringAppendf(&str, "Page : %lld (%d bytes)\n", iPg, nData); 5713 lsmStringAppendf(&str, "nRec : %d\n", nRec); 5714 lsmStringAppendf(&str, "iPtr : %d\n", iPtr); 5715 lsmStringAppendf(&str, "flags: %04x\n", flags2); 5716 lsmStringAppendf(&str, "\n"); 5717 5718 for(iCell=0; iCell<nRec; iCell++){ 5719 int nKey; 5720 infoCellDump( 5721 pDb, pSeg, bIndirect, pPg, iCell, 0, 0, 0, &nKey, 0, 0, &blob 5722 ); 5723 if( nKey>nKeyWidth ) nKeyWidth = nKey; 5724 } 5725 if( bHex ) nKeyWidth = nKeyWidth * 2; 5726 5727 for(iCell=0; iCell<nRec; iCell++){ 5728 u8 *aKey; int nKey = 0; /* Key */ 5729 u8 *aVal; int nVal = 0; /* Value */ 5730 int iPgPtr; 5731 int eType; 5732 Pgno iAbsPtr; 5733 char zFlags[8]; 5734 5735 infoCellDump(pDb, pSeg, bIndirect, pPg, iCell, &eType, &iPgPtr, 5736 &aKey, &nKey, &aVal, &nVal, &blob 5737 ); 5738 iAbsPtr = iPgPtr + ((flags2 & SEGMENT_BTREE_FLAG) ? 0 : iPtr); 5739 5740 lsmFlagsToString(eType, zFlags); 5741 lsmStringAppendf(&str, "%s %d (%s) ", 5742 zFlags, iAbsPtr, (rtTopic(eType) ? 
"sys" : "usr") 5743 ); 5744 infoAppendBlob(&str, bHex, aKey, nKey); 5745 if( nVal>0 && bValues ){ 5746 lsmStringAppendf(&str, "%*s", nKeyWidth - (nKey*(1+bHex)), ""); 5747 lsmStringAppendf(&str, " "); 5748 infoAppendBlob(&str, bHex, aVal, nVal); 5749 } 5750 if( rtTopic(eType) ){ 5751 int iBlk = (int)~lsmGetU32(aKey); 5752 lsmStringAppendf(&str, " (block=%d", iBlk); 5753 if( nVal>0 ){ 5754 i64 iSnap = lsmGetU64(aVal); 5755 lsmStringAppendf(&str, " snapshot=%lld", iSnap); 5756 } 5757 lsmStringAppendf(&str, ")"); 5758 } 5759 lsmStringAppendf(&str, "\n"); 5760 } 5761 5762 if( bData ){ 5763 lsmStringAppendf(&str, "\n-------------------" 5764 "-------------------------------------------------------------\n"); 5765 lsmStringAppendf(&str, "Page %d\n", 5766 iPg, (iPg-1)*nData, iPg*nData - 1); 5767 for(i=0; i<nData; i += perLine){ 5768 lsmStringAppendf(&str, "%04x: ", i); 5769 for(j=0; j<perLine; j++){ 5770 if( i+j>nData ){ 5771 lsmStringAppendf(&str, " "); 5772 }else{ 5773 lsmStringAppendf(&str, "%02x ", aData[i+j]); 5774 } 5775 } 5776 lsmStringAppendf(&str, " "); 5777 for(j=0; j<perLine; j++){ 5778 if( i+j>nData ){ 5779 lsmStringAppendf(&str, " "); 5780 }else{ 5781 lsmStringAppendf(&str,"%c", isprint(aData[i+j]) ? aData[i+j] : '.'); 5782 } 5783 } 5784 lsmStringAppendf(&str,"\n"); 5785 } 5786 } 5787 5788 *pzOut = str.z; 5789 sortedBlobFree(&blob); 5790 lsmFsPageRelease(pPg); 5791 } 5792 5793 return rc; 5794 } 5795 5796 int lsmInfoPageDump( 5797 lsm_db *pDb, /* Database handle */ 5798 Pgno iPg, /* Page number of page to dump */ 5799 int bHex, /* True to output key/value in hex form */ 5800 char **pzOut /* OUT: lsmMalloc'd string */ 5801 ){ 5802 int flags = INFO_PAGE_DUMP_DATA | INFO_PAGE_DUMP_VALUES; 5803 if( bHex ) flags |= INFO_PAGE_DUMP_HEX; 5804 return infoPageDump(pDb, iPg, flags, pzOut); 5805 } 5806 5807 void sortedDumpSegment(lsm_db *pDb, Segment *pRun, int bVals){ 5808 assert( pDb->xLog ); 5809 if( pRun && pRun->iFirst ){ 5810 int flags = (bVals ? 
INFO_PAGE_DUMP_VALUES : 0); 5811 char *zSeg; 5812 Page *pPg; 5813 5814 zSeg = segToString(pDb->pEnv, pRun, 0); 5815 lsmLogMessage(pDb, LSM_OK, "Segment: %s", zSeg); 5816 lsmFree(pDb->pEnv, zSeg); 5817 5818 lsmFsDbPageGet(pDb->pFS, pRun, pRun->iFirst, &pPg); 5819 while( pPg ){ 5820 Page *pNext; 5821 char *z = 0; 5822 infoPageDump(pDb, lsmFsPageNumber(pPg), flags, &z); 5823 lsmLogMessage(pDb, LSM_OK, "%s", z); 5824 lsmFree(pDb->pEnv, z); 5825 #if 0 5826 sortedDumpPage(pDb, pRun, pPg, bVals); 5827 #endif 5828 lsmFsDbPageNext(pRun, pPg, 1, &pNext); 5829 lsmFsPageRelease(pPg); 5830 pPg = pNext; 5831 } 5832 } 5833 } 5834 5835 /* 5836 ** Invoke the log callback zero or more times with messages that describe 5837 ** the current database structure. 5838 */ 5839 void lsmSortedDumpStructure( 5840 lsm_db *pDb, /* Database handle (used for xLog callback) */ 5841 Snapshot *pSnap, /* Snapshot to dump */ 5842 int bKeys, /* Output the keys from each segment */ 5843 int bVals, /* Output the values from each segment */ 5844 const char *zWhy /* Caption to print near top of dump */ 5845 ){ 5846 Snapshot *pDump = pSnap; 5847 Level *pTopLevel; 5848 char *zFree = 0; 5849 5850 assert( pSnap ); 5851 pTopLevel = lsmDbSnapshotLevel(pDump); 5852 if( pDb->xLog && pTopLevel ){ 5853 static int nCall = 0; 5854 Level *pLevel; 5855 int iLevel = 0; 5856 5857 nCall++; 5858 lsmLogMessage(pDb, LSM_OK, "Database structure %d (%s)", nCall, zWhy); 5859 5860 #if 0 5861 if( nCall==1031 || nCall==1032 ) bKeys=1; 5862 #endif 5863 5864 for(pLevel=pTopLevel; pLevel; pLevel=pLevel->pNext){ 5865 char zLeft[1024]; 5866 char zRight[1024]; 5867 int i = 0; 5868 5869 Segment *aLeft[24]; 5870 Segment *aRight[24]; 5871 5872 int nLeft = 0; 5873 int nRight = 0; 5874 5875 Segment *pSeg = &pLevel->lhs; 5876 aLeft[nLeft++] = pSeg; 5877 5878 for(i=0; i<pLevel->nRight; i++){ 5879 aRight[nRight++] = &pLevel->aRhs[i]; 5880 } 5881 5882 #ifdef LSM_LOG_FREELIST 5883 if( nRight ){ 5884 memmove(&aRight[1], aRight, 
sizeof(aRight[0])*nRight); 5885 aRight[0] = 0; 5886 nRight++; 5887 } 5888 #endif 5889 5890 for(i=0; i<nLeft || i<nRight; i++){ 5891 int iPad = 0; 5892 char zLevel[32]; 5893 zLeft[0] = '\0'; 5894 zRight[0] = '\0'; 5895 5896 if( i<nLeft ){ 5897 fileToString(pDb, zLeft, sizeof(zLeft), 24, aLeft[i]); 5898 } 5899 if( i<nRight ){ 5900 fileToString(pDb, zRight, sizeof(zRight), 24, aRight[i]); 5901 } 5902 5903 if( i==0 ){ 5904 snprintf(zLevel, sizeof(zLevel), "L%d: (age=%d) (flags=%.4x)", 5905 iLevel, (int)pLevel->iAge, (int)pLevel->flags 5906 ); 5907 }else{ 5908 zLevel[0] = '\0'; 5909 } 5910 5911 if( nRight==0 ){ 5912 iPad = 10; 5913 } 5914 5915 lsmLogMessage(pDb, LSM_OK, "% 25s % *s% -35s %s", 5916 zLevel, iPad, "", zLeft, zRight 5917 ); 5918 } 5919 5920 iLevel++; 5921 } 5922 5923 if( bKeys ){ 5924 for(pLevel=pTopLevel; pLevel; pLevel=pLevel->pNext){ 5925 int i; 5926 sortedDumpSegment(pDb, &pLevel->lhs, bVals); 5927 for(i=0; i<pLevel->nRight; i++){ 5928 sortedDumpSegment(pDb, &pLevel->aRhs[i], bVals); 5929 } 5930 } 5931 } 5932 } 5933 5934 lsmInfoFreelist(pDb, &zFree); 5935 lsmLogMessage(pDb, LSM_OK, "Freelist: %s", zFree); 5936 lsmFree(pDb->pEnv, zFree); 5937 5938 assert( lsmFsIntegrityCheck(pDb) ); 5939 } 5940 5941 void lsmSortedFreeLevel(lsm_env *pEnv, Level *pLevel){ 5942 Level *pNext; 5943 Level *p; 5944 5945 for(p=pLevel; p; p=pNext){ 5946 pNext = p->pNext; 5947 sortedFreeLevel(pEnv, p); 5948 } 5949 } 5950 5951 void lsmSortedSaveTreeCursors(lsm_db *pDb){ 5952 MultiCursor *pCsr; 5953 for(pCsr=pDb->pCsr; pCsr; pCsr=pCsr->pNext){ 5954 lsmTreeCursorSave(pCsr->apTreeCsr[0]); 5955 lsmTreeCursorSave(pCsr->apTreeCsr[1]); 5956 } 5957 } 5958 5959 void lsmSortedExpandBtreePage(Page *pPg, int nOrig){ 5960 u8 *aData; 5961 int nData; 5962 int nEntry; 5963 int iHdr; 5964 5965 aData = lsmFsPageData(pPg, &nData); 5966 nEntry = pageGetNRec(aData, nOrig); 5967 iHdr = SEGMENT_EOF(nOrig, nEntry); 5968 memmove(&aData[iHdr + (nData-nOrig)], &aData[iHdr], nOrig-iHdr); 5969 } 5970 5971 
#ifdef LSM_DEBUG_EXPENSIVE
/*
** Debugging aid: walk every page of segment pSeg and assert() that the
** keys on non-b-tree pages appear in strictly ascending order, both within
** each page and across consecutive pages.
*/
static void assertRunInOrder(lsm_db *pDb, Segment *pSeg){
  Page *pPage = 0;
  Blob blob1 = {0, 0, 0, 0};
  Blob blob2 = {0, 0, 0, 0};

  lsmFsDbPageGet(pDb->pFS, pSeg, pSeg->iFirst, &pPage);
  while( pPage ){
    Page *pFollowing;
    u8 *aPageData;
    int nPageData;
    int pgFlags;

    aPageData = lsmFsPageData(pPage, &nPageData);
    pgFlags = pageGetFlags(aPageData, nPageData);
    if( (pgFlags & SEGMENT_BTREE_FLAG)==0 ){
      int nRec = pageGetNRec(aPageData, nPageData);
      int iRec;
      for(iRec=0; iRec<nRec; iRec++){
        int iTopic1, iTopic2;
        pageGetKeyCopy(pDb->pEnv, pSeg, pPage, iRec, &iTopic1, &blob1);

        /* blob2 is tested here before it is reloaded for this page. */
        if( iRec==0 && blob2.nData ){
          assert( sortedKeyCompare(
                pDb->xCmp, iTopic2, blob2.pData, blob2.nData,
                iTopic1, blob1.pData, blob1.nData
          )<0 );
        }

        /* Compare against the next key on the same page, if any. */
        if( iRec<(nRec-1) ){
          pageGetKeyCopy(pDb->pEnv, pSeg, pPage, iRec+1, &iTopic2, &blob2);
          assert( sortedKeyCompare(
                pDb->xCmp, iTopic1, blob1.pData, blob1.nData,
                iTopic2, blob2.pData, blob2.nData
          )<0 );
        }
      }
    }

    lsmFsDbPageNext(pSeg, pPage, 1, &pFollowing);
    lsmFsPageRelease(pPage);
    pPage = pFollowing;
  }

  sortedBlobFree(&blob1);
  sortedBlobFree(&blob2);
}
#endif

#ifdef LSM_DEBUG_EXPENSIVE
/*
** This function is only included in the build if LSM_DEBUG_EXPENSIVE is
** defined. Its only purpose is to evaluate various assert() statements to
** verify that the database is well formed in certain respects.
**
** More specifically, it checks that the array pOne contains the required
** pointers to pTwo. Array pTwo must be a main array. pOne may be either a
** separators array or another main array. If pOne does not contain the
** correct set of pointers, an assert() statement fails.
6027 */ 6028 static int assertPointersOk( 6029 lsm_db *pDb, /* Database handle */ 6030 Segment *pOne, /* Segment containing pointers */ 6031 Segment *pTwo, /* Segment containing pointer targets */ 6032 int bRhs /* True if pTwo may have been Gobble()d */ 6033 ){ 6034 int rc = LSM_OK; /* Error code */ 6035 SegmentPtr ptr1; /* Iterates through pOne */ 6036 SegmentPtr ptr2; /* Iterates through pTwo */ 6037 Pgno iPrev; 6038 6039 assert( pOne && pTwo ); 6040 6041 memset(&ptr1, 0, sizeof(ptr1)); 6042 memset(&ptr2, 0, sizeof(ptr1)); 6043 ptr1.pSeg = pOne; 6044 ptr2.pSeg = pTwo; 6045 segmentPtrEndPage(pDb->pFS, &ptr1, 0, &rc); 6046 segmentPtrEndPage(pDb->pFS, &ptr2, 0, &rc); 6047 6048 /* Check that the footer pointer of the first page of pOne points to 6049 ** the first page of pTwo. */ 6050 iPrev = pTwo->iFirst; 6051 if( ptr1.iPtr!=iPrev && !bRhs ){ 6052 assert( 0 ); 6053 } 6054 6055 if( rc==LSM_OK && ptr1.nCell>0 ){ 6056 rc = segmentPtrLoadCell(&ptr1, 0); 6057 } 6058 6059 while( rc==LSM_OK && ptr2.pPg ){ 6060 Pgno iThis; 6061 6062 /* Advance to the next page of segment pTwo that contains at least 6063 ** one cell. Break out of the loop if the iterator reaches EOF. */ 6064 do{ 6065 rc = segmentPtrNextPage(&ptr2, 1); 6066 assert( rc==LSM_OK ); 6067 }while( rc==LSM_OK && ptr2.pPg && ptr2.nCell==0 ); 6068 if( rc!=LSM_OK || ptr2.pPg==0 ) break; 6069 iThis = lsmFsPageNumber(ptr2.pPg); 6070 6071 if( (ptr2.flags & (PGFTR_SKIP_THIS_FLAG|SEGMENT_BTREE_FLAG))==0 ){ 6072 6073 /* Load the first cell in the array pTwo page. */ 6074 rc = segmentPtrLoadCell(&ptr2, 0); 6075 6076 /* Iterate forwards through pOne, searching for a key that matches the 6077 ** key ptr2.pKey/nKey. This key should have a pointer to the page that 6078 ** ptr2 currently points to. 
*/ 6079 while( rc==LSM_OK ){ 6080 int res = rtTopic(ptr1.eType) - rtTopic(ptr2.eType); 6081 if( res==0 ){ 6082 res = pDb->xCmp(ptr1.pKey, ptr1.nKey, ptr2.pKey, ptr2.nKey); 6083 } 6084 6085 if( res<0 ){ 6086 assert( bRhs || ptr1.iPtr+ptr1.iPgPtr==iPrev ); 6087 }else if( res>0 ){ 6088 assert( 0 ); 6089 }else{ 6090 assert( ptr1.iPtr+ptr1.iPgPtr==iThis ); 6091 iPrev = iThis; 6092 break; 6093 } 6094 6095 rc = segmentPtrAdvance(0, &ptr1, 0); 6096 if( ptr1.pPg==0 ){ 6097 assert( 0 ); 6098 } 6099 } 6100 } 6101 } 6102 6103 segmentPtrReset(&ptr1, 0); 6104 segmentPtrReset(&ptr2, 0); 6105 return LSM_OK; 6106 } 6107 6108 /* 6109 ** This function is only included in the build if LSM_DEBUG_EXPENSIVE is 6110 ** defined. Its only purpose is to evaluate various assert() statements to 6111 ** verify that the database is well formed in certain respects. 6112 ** 6113 ** More specifically, it checks that the b-tree embedded in array pRun 6114 ** contains the correct keys. If not, an assert() fails. 6115 */ 6116 static int assertBtreeOk( 6117 lsm_db *pDb, 6118 Segment *pSeg 6119 ){ 6120 int rc = LSM_OK; /* Return code */ 6121 if( pSeg->iRoot ){ 6122 Blob blob = {0, 0, 0}; /* Buffer used to cache overflow keys */ 6123 FileSystem *pFS = pDb->pFS; /* File system to read from */ 6124 Page *pPg = 0; /* Main run page */ 6125 BtreeCursor *pCsr = 0; /* Btree cursor */ 6126 6127 rc = btreeCursorNew(pDb, pSeg, &pCsr); 6128 if( rc==LSM_OK ){ 6129 rc = btreeCursorFirst(pCsr); 6130 } 6131 if( rc==LSM_OK ){ 6132 rc = lsmFsDbPageGet(pFS, pSeg, pSeg->iFirst, &pPg); 6133 } 6134 6135 while( rc==LSM_OK ){ 6136 Page *pNext; 6137 u8 *aData; 6138 int nData; 6139 int flags; 6140 6141 rc = lsmFsDbPageNext(pSeg, pPg, 1, &pNext); 6142 lsmFsPageRelease(pPg); 6143 pPg = pNext; 6144 if( pPg==0 ) break; 6145 aData = fsPageData(pPg, &nData); 6146 flags = pageGetFlags(aData, nData); 6147 if( rc==LSM_OK 6148 && 0==((SEGMENT_BTREE_FLAG|PGFTR_SKIP_THIS_FLAG) & flags) 6149 && 0!=pageGetNRec(aData, nData) 6150 ){ 6151 u8 
*pKey; 6152 int nKey; 6153 int iTopic; 6154 pKey = pageGetKey(pSeg, pPg, 0, &iTopic, &nKey, &blob); 6155 assert( nKey==pCsr->nKey && 0==memcmp(pKey, pCsr->pKey, nKey) ); 6156 assert( lsmFsPageNumber(pPg)==pCsr->iPtr ); 6157 rc = btreeCursorNext(pCsr); 6158 } 6159 } 6160 assert( rc!=LSM_OK || pCsr->pKey==0 ); 6161 6162 if( pPg ) lsmFsPageRelease(pPg); 6163 6164 btreeCursorFree(pCsr); 6165 sortedBlobFree(&blob); 6166 } 6167 6168 return rc; 6169 } 6170 #endif /* ifdef LSM_DEBUG_EXPENSIVE */