github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/sstable/reader.go (about) 1 // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package sstable 6 7 import ( 8 "bytes" 9 "encoding/binary" 10 "errors" 11 "fmt" 12 "io" 13 "sort" 14 "sync" 15 "unsafe" 16 17 "github.com/golang/snappy" 18 "github.com/petermattis/pebble/cache" 19 "github.com/petermattis/pebble/internal/base" 20 "github.com/petermattis/pebble/internal/crc" 21 "github.com/petermattis/pebble/internal/rangedel" 22 "github.com/petermattis/pebble/vfs" 23 ) 24 25 // BlockHandle is the file offset and length of a block. 26 type BlockHandle struct { 27 Offset, Length uint64 28 } 29 30 // decodeBlockHandle returns the block handle encoded at the start of src, as 31 // well as the number of bytes it occupies. It returns zero if given invalid 32 // input. 33 func decodeBlockHandle(src []byte) (BlockHandle, int) { 34 offset, n := binary.Uvarint(src) 35 length, m := binary.Uvarint(src[n:]) 36 if n == 0 || m == 0 { 37 return BlockHandle{}, 0 38 } 39 return BlockHandle{offset, length}, n + m 40 } 41 42 func encodeBlockHandle(dst []byte, b BlockHandle) int { 43 n := binary.PutUvarint(dst, b.Offset) 44 m := binary.PutUvarint(dst[n:], b.Length) 45 return n + m 46 } 47 48 // block is a []byte that holds a sequence of key/value pairs plus an index 49 // over those pairs. 50 type block []byte 51 52 // Iterator iterates over an entire table of data. 53 type Iterator interface { 54 base.InternalIterator 55 56 Init(r *Reader, lower, upper []byte) error 57 SetCloseHook(fn func(i Iterator) error) 58 } 59 60 // singleLevelIterator iterates over an entire table of data. To seek for a given 61 // key, it first looks in the index for the block that contains that key, and then 62 // looks inside that block. 
63 type singleLevelIterator struct { 64 cmp Compare 65 // Global lower/upper bound for the iterator. 66 lower []byte 67 upper []byte 68 // Per-block lower/upper bound. Nil if the bound does not apply to the block 69 // because we determined the block lies completely within the bound. 70 blockLower []byte 71 blockUpper []byte 72 reader *Reader 73 index blockIter 74 data blockIter 75 dataBH BlockHandle 76 err error 77 closeHook func(i Iterator) error 78 } 79 80 var singleLevelIterPool = sync.Pool{ 81 New: func() interface{} { 82 return &singleLevelIterator{} 83 }, 84 } 85 86 var twoLevelIterPool = sync.Pool{ 87 New: func() interface{} { 88 return &twoLevelIterator{} 89 }, 90 } 91 92 // Init initializes a singleLevelIterator for reading from the table. It is 93 // synonmous with Reader.NewIter, but allows for reusing of the iterator 94 // between different Readers. 95 func (i *singleLevelIterator) Init(r *Reader, lower, upper []byte) error { 96 *i = singleLevelIterator{ 97 lower: lower, 98 upper: upper, 99 reader: r, 100 err: r.err, 101 } 102 if i.err == nil { 103 var index block 104 index, i.err = r.readIndex() 105 if i.err != nil { 106 return i.err 107 } 108 i.cmp = r.Compare 109 i.err = i.index.init(i.cmp, index, r.Properties.GlobalSeqNum) 110 } 111 return i.err 112 } 113 114 func (i *singleLevelIterator) initBounds() { 115 if i.lower == nil && i.upper == nil { 116 return 117 } 118 119 // Trim the iteration bounds for the current block. We don't have to check 120 // the bounds on each iteration if the block is entirely contained within the 121 // iteration bounds. 122 i.blockLower = i.lower 123 if i.blockLower != nil { 124 key, _ := i.data.First() 125 if key != nil && i.cmp(i.blockLower, key.UserKey) < 0 { 126 // The lower-bound is less than the first key in the block. No need 127 // to check the lower-bound again for this block. 
128 i.blockLower = nil 129 } 130 } 131 i.blockUpper = i.upper 132 if i.blockUpper != nil && i.cmp(i.blockUpper, i.index.Key().UserKey) > 0 { 133 // The upper-bound is greater than the index key which itself is greater 134 // than or equal to every key in the block. No need to check the 135 // upper-bound again for this block. 136 i.blockUpper = nil 137 } 138 } 139 140 // loadBlock loads the block at the current index position and leaves i.data 141 // unpositioned. If unsuccessful, it sets i.err to any error encountered, which 142 // may be nil if we have simply exhausted the entire table. 143 func (i *singleLevelIterator) loadBlock() bool { 144 if !i.index.Valid() { 145 i.err = i.index.err 146 // TODO(peter): Need to test that seeking to a key outside of the sstable 147 // invalidates the iterator. 148 i.data.offset = 0 149 i.data.restarts = 0 150 return false 151 } 152 // Load the next block. 153 v := i.index.Value() 154 var n int 155 i.dataBH, n = decodeBlockHandle(v) 156 if n == 0 || n != len(v) { 157 i.err = errors.New("pebble/table: corrupt index entry") 158 return false 159 } 160 block, err := i.reader.readBlock(i.dataBH, nil /* transform */) 161 if err != nil { 162 i.err = err 163 return false 164 } 165 i.data.setCacheHandle(block) 166 i.err = i.data.init(i.cmp, block.Get(), i.reader.Properties.GlobalSeqNum) 167 if i.err != nil { 168 return false 169 } 170 i.initBounds() 171 return true 172 } 173 174 // seekBlock loads the block at the current index position and positions i.data 175 // at the first key in that block which is >= the given key. If unsuccessful, 176 // it sets i.err to any error encountered, which may be nil if we have simply 177 // exhausted the entire table. 178 func (i *singleLevelIterator) seekBlock(key []byte) bool { 179 if !i.index.Valid() { 180 i.err = i.index.err 181 return false 182 } 183 // Load the next block. 
184 v := i.index.Value() 185 h, n := decodeBlockHandle(v) 186 if n == 0 || n != len(v) { 187 i.err = errors.New("pebble/table: corrupt index entry") 188 return false 189 } 190 block, err := i.reader.readBlock(h, nil /* transform */) 191 if err != nil { 192 i.err = err 193 return false 194 } 195 i.data.setCacheHandle(block) 196 i.err = i.data.init(i.cmp, block.Get(), i.reader.Properties.GlobalSeqNum) 197 if i.err != nil { 198 return false 199 } 200 // Look for the key inside that block. 201 i.initBounds() 202 i.data.SeekGE(key) 203 return true 204 } 205 206 // SeekGE implements internalIterator.SeekGE, as documented in the pebble 207 // package. Note that SeekGE only checks the upper bound. It is up to the 208 // caller to ensure that key is greater than or equal to the lower bound. 209 func (i *singleLevelIterator) SeekGE(key []byte) (*InternalKey, []byte) { 210 if i.err != nil { 211 return nil, nil 212 } 213 214 if ikey, _ := i.index.SeekGE(key); ikey == nil { 215 return nil, nil 216 } 217 if !i.loadBlock() { 218 return nil, nil 219 } 220 ikey, val := i.data.SeekGE(key) 221 if ikey == nil { 222 return nil, nil 223 } 224 if i.blockUpper != nil && i.cmp(ikey.UserKey, i.blockUpper) >= 0 { 225 i.data.invalidateUpper() // force i.data.Valid() to return false 226 return nil, nil 227 } 228 return ikey, val 229 } 230 231 // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the 232 // pebble package. Note that SeekPrefixGE only checks the upper bound. It is up 233 // to the caller to ensure that key is greater than or equal to the lower bound. 234 func (i *singleLevelIterator) SeekPrefixGE(prefix, key []byte) (*InternalKey, []byte) { 235 if i.err != nil { 236 return nil, nil 237 } 238 239 // Check prefix bloom filter. 
240 if i.reader.tableFilter != nil { 241 data, err := i.reader.readFilter() 242 if err != nil { 243 return nil, nil 244 } 245 if !i.reader.tableFilter.mayContain(data, prefix) { 246 i.data.invalidateUpper() // force i.data.Valid() to return false 247 return nil, nil 248 } 249 } 250 251 if ikey, _ := i.index.SeekGE(key); ikey == nil { 252 return nil, nil 253 } 254 if !i.loadBlock() { 255 return nil, nil 256 } 257 ikey, val := i.data.SeekGE(key) 258 if ikey == nil { 259 return nil, nil 260 } 261 if i.blockUpper != nil && i.cmp(ikey.UserKey, i.blockUpper) >= 0 { 262 i.data.invalidateUpper() // force i.data.Valid() to return false 263 return nil, nil 264 } 265 return ikey, val 266 } 267 268 // SeekLT implements internalIterator.SeekLT, as documented in the pebble 269 // package. Note that SeekLT only checks the lower bound. It is up to the 270 // caller to ensure that key is less than the upper bound. 271 func (i *singleLevelIterator) SeekLT(key []byte) (*InternalKey, []byte) { 272 if i.err != nil { 273 return nil, nil 274 } 275 276 if ikey, _ := i.index.SeekGE(key); ikey == nil { 277 i.index.Last() 278 } 279 if !i.loadBlock() { 280 return nil, nil 281 } 282 ikey, val := i.data.SeekLT(key) 283 if ikey == nil { 284 // The index contains separator keys which may lie between 285 // user-keys. Consider the user-keys: 286 // 287 // complete 288 // ---- new block --- 289 // complexion 290 // 291 // If these two keys end one block and start the next, the index key may 292 // be chosen as "compleu". The SeekGE in the index block will then point 293 // us to the block containing "complexion". If this happens, we want the 294 // last key from the previous data block. 
295 if ikey, _ = i.index.Prev(); ikey == nil { 296 return nil, nil 297 } 298 if !i.loadBlock() { 299 return nil, nil 300 } 301 if ikey, val = i.data.Last(); ikey == nil { 302 return nil, nil 303 } 304 } 305 if i.blockLower != nil && i.cmp(ikey.UserKey, i.blockLower) < 0 { 306 i.data.invalidateLower() // force i.data.Valid() to return false 307 return nil, nil 308 } 309 return ikey, val 310 } 311 312 // First implements internalIterator.First, as documented in the pebble 313 // package. Note that First only checks the upper bound. It is up to the caller 314 // to ensure that key is greater than or equal to the lower bound (e.g. via a 315 // call to SeekGE(lower)). 316 func (i *singleLevelIterator) First() (*InternalKey, []byte) { 317 if i.err != nil { 318 return nil, nil 319 } 320 321 if ikey, _ := i.index.First(); ikey == nil { 322 return nil, nil 323 } 324 if !i.loadBlock() { 325 return nil, nil 326 } 327 ikey, val := i.data.First() 328 if ikey == nil { 329 return nil, nil 330 } 331 if i.blockUpper != nil && i.cmp(ikey.UserKey, i.blockUpper) >= 0 { 332 i.data.invalidateUpper() // force i.data.Valid() to return false 333 return nil, nil 334 } 335 return ikey, val 336 } 337 338 // Last implements internalIterator.Last, as documented in the pebble 339 // package. Note that Last only checks the lower bound. It is up to the caller 340 // to ensure that key is less than the upper bound (e.g. 
via a call to 341 // SeekLT(upper)) 342 func (i *singleLevelIterator) Last() (*InternalKey, []byte) { 343 if i.err != nil { 344 return nil, nil 345 } 346 347 if ikey, _ := i.index.Last(); ikey == nil { 348 return nil, nil 349 } 350 if !i.loadBlock() { 351 return nil, nil 352 } 353 if ikey, _ := i.data.Last(); ikey == nil { 354 return nil, nil 355 } 356 if i.blockLower != nil && i.cmp(i.data.ikey.UserKey, i.blockLower) < 0 { 357 i.data.invalidateLower() 358 return nil, nil 359 } 360 return &i.data.ikey, i.data.val 361 } 362 363 // Next implements internalIterator.Next, as documented in the pebble 364 // package. 365 // Note: compactionIterator.Next mirrors the implementation of Iterator.Next 366 // due to performance. Keep the two in sync. 367 func (i *singleLevelIterator) Next() (*InternalKey, []byte) { 368 if i.err != nil { 369 return nil, nil 370 } 371 if key, val := i.data.Next(); key != nil { 372 if i.blockUpper != nil && i.cmp(key.UserKey, i.blockUpper) >= 0 { 373 i.data.invalidateUpper() 374 return nil, nil 375 } 376 return key, val 377 } 378 for { 379 if i.data.err != nil { 380 i.err = i.data.err 381 break 382 } 383 if key, _ := i.index.Next(); key == nil { 384 break 385 } 386 if i.loadBlock() { 387 key, val := i.data.First() 388 if key == nil { 389 return nil, nil 390 } 391 if i.blockUpper != nil && i.cmp(key.UserKey, i.blockUpper) >= 0 { 392 i.data.invalidateUpper() 393 return nil, nil 394 } 395 return key, val 396 } 397 } 398 return nil, nil 399 } 400 401 // Prev implements internalIterator.Prev, as documented in the pebble 402 // package. 
403 func (i *singleLevelIterator) Prev() (*InternalKey, []byte) { 404 if i.err != nil { 405 return nil, nil 406 } 407 if key, val := i.data.Prev(); key != nil { 408 if i.blockLower != nil && i.cmp(key.UserKey, i.blockLower) < 0 { 409 i.data.invalidateLower() 410 return nil, nil 411 } 412 return key, val 413 } 414 for { 415 if i.data.err != nil { 416 i.err = i.data.err 417 break 418 } 419 if key, _ := i.index.Prev(); key == nil { 420 break 421 } 422 if i.loadBlock() { 423 key, val := i.data.Last() 424 if key == nil { 425 return nil, nil 426 } 427 if i.blockLower != nil && i.cmp(key.UserKey, i.blockLower) < 0 { 428 i.data.invalidateLower() 429 return nil, nil 430 } 431 return key, val 432 } 433 } 434 return nil, nil 435 } 436 437 // Key implements internalIterator.Key, as documented in the pebble package. 438 func (i *singleLevelIterator) Key() *InternalKey { 439 return i.data.Key() 440 } 441 442 // Value implements internalIterator.Value, as documented in the pebble 443 // package. 444 func (i *singleLevelIterator) Value() []byte { 445 return i.data.Value() 446 } 447 448 // Valid implements internalIterator.Valid, as documented in the pebble 449 // package. 450 func (i *singleLevelIterator) Valid() bool { 451 return i.data.Valid() 452 } 453 454 // Error implements internalIterator.Error, as documented in the pebble 455 // package. 456 func (i *singleLevelIterator) Error() error { 457 if err := i.data.Error(); err != nil { 458 return err 459 } 460 return i.err 461 } 462 463 // SetCloseHook sets a function that will be called when the iterator is 464 // closed. 465 func (i *singleLevelIterator) SetCloseHook(fn func(i Iterator) error) { 466 i.closeHook = fn 467 } 468 469 // Close implements internalIterator.Close, as documented in the pebble 470 // package. 
471 func (i *singleLevelIterator) Close() error { 472 if i.closeHook != nil { 473 if err := i.closeHook(i); err != nil { 474 return err 475 } 476 } 477 if err := i.data.Close(); err != nil { 478 return err 479 } 480 err := i.err 481 *i = singleLevelIterator{} 482 singleLevelIterPool.Put(i) 483 return err 484 } 485 486 // SetBounds implements internalIterator.SetBounds, as documented in the pebble 487 // package. 488 func (i *singleLevelIterator) SetBounds(lower, upper []byte) { 489 i.lower = lower 490 i.upper = upper 491 } 492 493 // compactionIterator is similar to Iterator but it increments the number of 494 // bytes that have been iterated through. 495 type compactionIterator struct { 496 *singleLevelIterator 497 bytesIterated *uint64 498 prevOffset uint64 499 } 500 501 func (i *compactionIterator) SeekGE(key []byte) (*InternalKey, []byte) { 502 panic("pebble: SeekGE unimplemented") 503 } 504 505 func (i *compactionIterator) SeekPrefixGE(prefix, key []byte) (*InternalKey, []byte) { 506 panic("pebble: SeekPrefixGE unimplemented") 507 } 508 509 func (i *compactionIterator) SeekLT(key []byte) (*InternalKey, []byte) { 510 panic("pebble: SeekLT unimplemented") 511 } 512 513 func (i *compactionIterator) First() (*InternalKey, []byte) { 514 key, val := i.singleLevelIterator.First() 515 if key == nil { 516 // An empty sstable will still encode the block trailer and restart points, so bytes 517 // iterated must be incremented. 518 519 // We must use i.dataBH.Length instead of (4*(i.data.numRestarts+1)) to calculate the 520 // number of bytes for the restart points, since i.dataBH.Length accounts for 521 // compression. When uncompressed, i.dataBH.Length == (4*(i.data.numRestarts+1)) 522 *i.bytesIterated += blockTrailerLen + i.dataBH.Length 523 return nil, nil 524 } 525 // If the sstable only has 1 entry, we are at the last entry in the block and we must 526 // increment bytes iterated by the size of the block trailer and restart points. 
527 if i.data.nextOffset+(4*(i.data.numRestarts+1)) == int32(len(i.data.data)) { 528 i.prevOffset = blockTrailerLen + i.dataBH.Length 529 } else { 530 // i.dataBH.Length/len(i.data.data) is the compression ratio. If uncompressed, this is 1. 531 // i.data.nextOffset is the uncompressed size of the first record. 532 i.prevOffset = (uint64(i.data.nextOffset) * i.dataBH.Length) / uint64(len(i.data.data)) 533 } 534 *i.bytesIterated += i.prevOffset 535 return key, val 536 } 537 538 func (i *compactionIterator) Last() (*InternalKey, []byte) { 539 panic("pebble: Last unimplemented") 540 } 541 542 // Note: compactionIterator.Next mirrors the implementation of Iterator.Next 543 // due to performance. Keep the two in sync. 544 func (i *compactionIterator) Next() (*InternalKey, []byte) { 545 if i.err != nil { 546 return nil, nil 547 } 548 key, val := i.data.Next() 549 if key == nil { 550 for { 551 if i.data.err != nil { 552 i.err = i.data.err 553 return nil, nil 554 } 555 if key, _ := i.index.Next(); key == nil { 556 return nil, nil 557 } 558 if i.loadBlock() { 559 key, val = i.data.First() 560 if key == nil { 561 return nil, nil 562 } 563 break 564 } 565 } 566 } 567 568 // i.dataBH.Length/len(i.data.data) is the compression ratio. If uncompressed, this is 1. 569 // i.data.nextOffset is the uncompressed position of the current record in the block. 570 // i.dataBH.Offset is the offset of the block in the sstable before decompression. 571 recordOffset := (uint64(i.data.nextOffset) * i.dataBH.Length) / uint64(len(i.data.data)) 572 curOffset := i.dataBH.Offset + recordOffset 573 // Last entry in the block must increment bytes iterated by the size of the block trailer 574 // and restart points. 
575 if i.data.nextOffset+(4*(i.data.numRestarts+1)) == int32(len(i.data.data)) { 576 curOffset = i.dataBH.Offset + i.dataBH.Length + blockTrailerLen 577 } 578 *i.bytesIterated += uint64(curOffset - i.prevOffset) 579 i.prevOffset = curOffset 580 return key, val 581 } 582 583 func (i *compactionIterator) Prev() (*InternalKey, []byte) { 584 panic("pebble: Prev unimplemented") 585 } 586 587 type twoLevelIterator struct { 588 singleLevelIterator 589 topLevelIndex blockIter 590 } 591 592 // loadIndex loads the index block at the current top level index position and 593 // leaves i.index unpositioned. If unsuccessful, it gets i.err to any error 594 // encountered, which may be nil if we have simply exhausted the entire table. 595 // This is used for two level indexes. 596 func (i *twoLevelIterator) loadIndex() bool { 597 if !i.topLevelIndex.Valid() { 598 i.err = i.topLevelIndex.err 599 i.index.offset = 0 600 i.index.restarts = 0 601 return false 602 } 603 h, n := decodeBlockHandle(i.topLevelIndex.Value()) 604 if n == 0 || n != len(i.topLevelIndex.Value()) { 605 i.err = errors.New("pebble/table: corrupt top level index entry") 606 return false 607 } 608 indexBlock, err := i.reader.readBlock(h, nil /* transform */) 609 if err != nil { 610 i.err = err 611 return false 612 } 613 i.index.setCacheHandle(indexBlock) 614 i.err = i.index.init(i.cmp, indexBlock.Get(), i.reader.Properties.GlobalSeqNum) 615 if i.err != nil { 616 return false 617 } 618 return true 619 } 620 621 func (i *twoLevelIterator) Init(r *Reader, lower, upper []byte) error { 622 *i = twoLevelIterator{ 623 singleLevelIterator: singleLevelIterator{ 624 lower: lower, 625 upper: upper, 626 reader: r, 627 err: r.err, 628 }, 629 } 630 if i.err == nil { 631 topLevelIndex, err := r.readIndex() 632 if i.err != nil { 633 i.err = err 634 return i.err 635 } 636 i.cmp = r.Compare 637 i.err = i.topLevelIndex.init(i.cmp, topLevelIndex, r.Properties.GlobalSeqNum) 638 } 639 return i.err 640 } 641 642 // SeekGE implements 
internalIterator.SeekGE, as documented in the pebble 643 // package. Note that SeekGE only checks the upper bound. It is up to the 644 // caller to ensure that key is greater than or equal to the lower bound. 645 func (i *twoLevelIterator) SeekGE(key []byte) (*InternalKey, []byte) { 646 if i.err != nil { 647 return nil, nil 648 } 649 650 if ikey, _ := i.topLevelIndex.SeekGE(key); ikey == nil { 651 return nil, nil 652 } 653 654 if !i.loadIndex() { 655 return nil, nil 656 } 657 658 return i.singleLevelIterator.SeekGE(key) 659 } 660 661 // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the 662 // pebble package. Note that SeekPrefixGE only checks the upper bound. It is up 663 // to the caller to ensure that key is greater than or equal to the lower bound. 664 func (i *twoLevelIterator) SeekPrefixGE(prefix, key []byte) (*InternalKey, []byte) { 665 if i.err != nil { 666 return nil, nil 667 } 668 669 if ikey, _ := i.topLevelIndex.SeekGE(key); ikey == nil { 670 return nil, nil 671 } 672 673 if !i.loadIndex() { 674 return nil, nil 675 } 676 677 return i.singleLevelIterator.SeekPrefixGE(prefix, key) 678 } 679 680 // SeekLT implements internalIterator.SeekLT, as documented in the pebble 681 // package. Note that SeekLT only checks the lower bound. It is up to the 682 // caller to ensure that key is less than the upper bound. 
683 func (i *twoLevelIterator) SeekLT(key []byte) (*InternalKey, []byte) { 684 if i.err != nil { 685 return nil, nil 686 } 687 688 if ikey, _ := i.topLevelIndex.SeekGE(key); ikey == nil { 689 if ikey, _ := i.topLevelIndex.Last(); ikey == nil { 690 return nil, nil 691 } 692 693 if !i.loadIndex() { 694 return nil, nil 695 } 696 697 return i.singleLevelIterator.Last() 698 } 699 700 if !i.loadIndex() { 701 return nil, nil 702 } 703 704 ikey, val := i.singleLevelIterator.SeekLT(key) 705 if ikey == nil { 706 if ikey, val = i.topLevelIndex.Prev(); ikey == nil { 707 return nil, nil 708 } 709 if !i.loadIndex() { 710 return nil, nil 711 } 712 if ikey, val = i.singleLevelIterator.Last(); ikey == nil { 713 return nil, nil 714 } 715 } 716 717 return ikey, val 718 } 719 720 // First implements internalIterator.First, as documented in the pebble 721 // package. Note that First only checks the upper bound. It is up to the caller 722 // to ensure that key is greater than or equal to the lower bound (e.g. via a 723 // call to SeekGE(lower)). 724 func (i *twoLevelIterator) First() (*InternalKey, []byte) { 725 if i.err != nil { 726 return nil, nil 727 } 728 729 if ikey, _ := i.topLevelIndex.First(); ikey == nil { 730 return nil, nil 731 } 732 733 if !i.loadIndex() { 734 return nil, nil 735 } 736 737 return i.singleLevelIterator.First() 738 } 739 740 // Last implements internalIterator.Last, as documented in the pebble 741 // package. Note that Last only checks the lower bound. It is up to the caller 742 // to ensure that key is less than the upper bound (e.g. via a call to 743 // SeekLT(upper)) 744 func (i *twoLevelIterator) Last() (*InternalKey, []byte) { 745 if i.err != nil { 746 return nil, nil 747 } 748 749 if ikey, _ := i.topLevelIndex.Last(); ikey == nil { 750 return nil, nil 751 } 752 753 if !i.loadIndex() { 754 return nil, nil 755 } 756 757 return i.singleLevelIterator.Last() 758 } 759 760 // Next implements internalIterator.Next, as documented in the pebble 761 // package. 
762 // Note: twoLevelCompactionIterator.Next mirrors the implementation of 763 // twoLevelIterator.Next due to performance. Keep the two in sync. 764 func (i *twoLevelIterator) Next() (*InternalKey, []byte) { 765 if i.err != nil { 766 return nil, nil 767 } 768 if key, val := i.singleLevelIterator.Next(); key != nil { 769 return key, val 770 } 771 for { 772 if i.index.err != nil { 773 i.err = i.index.err 774 break 775 } 776 if ikey, _ := i.topLevelIndex.Next(); ikey == nil { 777 return nil, nil 778 } 779 if !i.loadIndex() { 780 return nil, nil 781 } 782 return i.singleLevelIterator.First() 783 } 784 return nil, nil 785 } 786 787 // Prev implements internalIterator.Prev, as documented in the pebble 788 // package. 789 func (i *twoLevelIterator) Prev() (*InternalKey, []byte) { 790 if i.err != nil { 791 return nil, nil 792 } 793 if key, val := i.singleLevelIterator.Prev(); key != nil { 794 return key, val 795 } 796 for { 797 if i.index.err != nil { 798 i.err = i.index.err 799 break 800 } 801 if ikey, _ := i.topLevelIndex.Prev(); ikey == nil { 802 return nil, nil 803 } 804 if !i.loadIndex() { 805 return nil, nil 806 } 807 return i.singleLevelIterator.Last() 808 } 809 return nil, nil 810 } 811 812 // Close implements internalIterator.Close, as documented in the pebble 813 // package. 814 func (i *twoLevelIterator) Close() error { 815 if i.closeHook != nil { 816 if err := i.closeHook(i); err != nil { 817 return err 818 } 819 } 820 if err := i.data.Close(); err != nil { 821 return err 822 } 823 err := i.err 824 *i = twoLevelIterator{} 825 twoLevelIterPool.Put(i) 826 return err 827 } 828 829 // Note: twoLevelCompactionIterator and compactionIterator are very similar but 830 // were separated due to performance. 
831 type twoLevelCompactionIterator struct { 832 *twoLevelIterator 833 bytesIterated *uint64 834 prevOffset uint64 835 } 836 837 func (i *twoLevelCompactionIterator) SeekGE(key []byte) (*InternalKey, []byte) { 838 panic("pebble: SeekGE unimplemented") 839 } 840 841 func (i *twoLevelCompactionIterator) SeekPrefixGE(prefix, key []byte) (*InternalKey, []byte) { 842 panic("pebble: SeekPrefixGE unimplemented") 843 } 844 845 func (i *twoLevelCompactionIterator) SeekLT(key []byte) (*InternalKey, []byte) { 846 panic("pebble: SeekLT unimplemented") 847 } 848 849 func (i *twoLevelCompactionIterator) First() (*InternalKey, []byte) { 850 key, val := i.twoLevelIterator.First() 851 if key == nil { 852 // An empty sstable will still encode the block trailer and restart points, so bytes 853 // iterated must be incremented. 854 855 // We must use i.dataBH.Length instead of (4*(i.data.numRestarts+1)) to calculate the 856 // number of bytes for the restart points, since i.dataBH.Length accounts for 857 // compression. When uncompressed, i.dataBH.Length == (4*(i.data.numRestarts+1)) 858 *i.bytesIterated += blockTrailerLen + i.dataBH.Length 859 return nil, nil 860 } 861 // If the sstable only has 1 entry, we are at the last entry in the block and we must 862 // increment bytes iterated by the size of the block trailer and restart points. 863 if i.data.nextOffset+(4*(i.data.numRestarts+1)) == int32(len(i.data.data)) { 864 i.prevOffset = blockTrailerLen + i.dataBH.Length 865 } else { 866 // i.dataBH.Length/len(i.data.data) is the compression ratio. If uncompressed, this is 1. 867 // i.data.nextOffset is the uncompressed size of the first record. 
868 i.prevOffset = (uint64(i.data.nextOffset) * i.dataBH.Length) / uint64(len(i.data.data)) 869 } 870 *i.bytesIterated += i.prevOffset 871 return key, val 872 } 873 874 func (i *twoLevelCompactionIterator) Last() (*InternalKey, []byte) { 875 panic("pebble: Last unimplemented") 876 } 877 878 // Note: twoLevelCompactionIterator.Next mirrors the implementation of 879 // twoLevelIterator.Next due to performance. Keep the two in sync. 880 func (i *twoLevelCompactionIterator) Next() (*InternalKey, []byte) { 881 if i.err != nil { 882 return nil, nil 883 } 884 key, val := i.singleLevelIterator.Next() 885 if key == nil { 886 for { 887 if i.index.err != nil { 888 i.err = i.index.err 889 return nil, nil 890 } 891 if key, _ := i.topLevelIndex.Next(); key == nil { 892 return nil, nil 893 } 894 if i.loadIndex() { 895 key, val = i.singleLevelIterator.First() 896 if key == nil { 897 return nil, nil 898 } 899 break 900 } 901 } 902 } 903 904 // i.dataBH.Length/len(i.data.data) is the compression ratio. If uncompressed, this is 1. 905 // i.data.nextOffset is the uncompressed position of the current record in the block. 906 // i.dataBH.Offset is the offset of the block in the sstable before decompression. 907 recordOffset := (uint64(i.data.nextOffset) * i.dataBH.Length) / uint64(len(i.data.data)) 908 curOffset := i.dataBH.Offset + recordOffset 909 // Last entry in the block must increment bytes iterated by the size of the block trailer 910 // and restart points. 
911 if i.data.nextOffset+(4*(i.data.numRestarts+1)) == int32(len(i.data.data)) { 912 curOffset = i.dataBH.Offset + i.dataBH.Length + blockTrailerLen 913 } 914 *i.bytesIterated += uint64(curOffset - i.prevOffset) 915 i.prevOffset = curOffset 916 return key, val 917 } 918 919 func (i *twoLevelCompactionIterator) Prev() (*InternalKey, []byte) { 920 panic("pebble: Prev unimplemented") 921 } 922 923 type weakCachedBlock struct { 924 bh BlockHandle 925 mu sync.RWMutex 926 handle cache.WeakHandle 927 } 928 929 type blockTransform func([]byte) ([]byte, error) 930 931 // OpenOptions provide an interface to do work on Reader while it is being 932 // opened. 933 type OpenOption interface { 934 // Apply is called on the reader its opened. 935 Apply(*Reader) 936 } 937 938 // Comparers is a map from comparer name to comparer. It is used for debugging 939 // tools which may be used on multiple databases configured with different 940 // comparers. Comparers implements the OpenOption interface and can be passed 941 // as a parameter to NewReader. 942 type Comparers map[string]*Comparer 943 944 // Apply applies the comparers option to the reader. 945 func (c Comparers) Apply(r *Reader) { 946 if r.Compare != nil { 947 return 948 } 949 if comparer, ok := c[r.Properties.ComparerName]; ok { 950 r.Compare = comparer.Compare 951 r.split = comparer.Split 952 } 953 } 954 955 // Mergers is a map from merger name to merger. It is used for debugging tools 956 // which may be used on multiple databases configured with different 957 // mergers. Mergers implements the OpenOption interface and can be passed as 958 // a parameter to NewReader. 959 type Mergers map[string]*Merger 960 961 // Apply applies the mergers option to the reader. 962 func (m Mergers) Apply(r *Reader) { 963 if r.mergerOK { 964 return 965 } 966 _, r.mergerOK = m[r.Properties.MergerName] 967 } 968 969 // Reader is a table reader. 
970 type Reader struct { 971 file vfs.File 972 dbNum uint64 973 fileNum uint64 974 err error 975 index weakCachedBlock 976 filter weakCachedBlock 977 rangeDel weakCachedBlock 978 rangeDelTransform blockTransform 979 propertiesBH BlockHandle 980 metaIndexBH BlockHandle 981 footerBH BlockHandle 982 opts *Options 983 cache *cache.Cache 984 Compare Compare 985 split Split 986 mergerOK bool 987 tableFilter *tableFilterReader 988 Properties Properties 989 } 990 991 // Close implements DB.Close, as documented in the pebble package. 992 func (r *Reader) Close() error { 993 if r.err != nil { 994 if r.file != nil { 995 r.file.Close() 996 r.file = nil 997 } 998 return r.err 999 } 1000 if r.file != nil { 1001 r.err = r.file.Close() 1002 r.file = nil 1003 if r.err != nil { 1004 return r.err 1005 } 1006 } 1007 // Make any future calls to Get, NewIter or Close return an error. 1008 r.err = errors.New("pebble/table: reader is closed") 1009 return nil 1010 } 1011 1012 // get is a testing helper that simulates a read and helps verify bloom filters 1013 // until they are available through iterators. 1014 func (r *Reader) get(key []byte) (value []byte, err error) { 1015 if r.err != nil { 1016 return nil, r.err 1017 } 1018 1019 if r.tableFilter != nil { 1020 data, err := r.readFilter() 1021 if err != nil { 1022 return nil, err 1023 } 1024 var lookupKey []byte 1025 if r.split != nil { 1026 lookupKey = key[:r.split(key)] 1027 } else { 1028 lookupKey = key 1029 } 1030 if !r.tableFilter.mayContain(data, lookupKey) { 1031 return nil, base.ErrNotFound 1032 } 1033 } 1034 1035 i := r.NewIter(nil /* lower */, nil /* upper */) 1036 i.SeekGE(key) 1037 1038 if !i.Valid() || r.Compare(key, i.Key().UserKey) != 0 { 1039 err := i.Close() 1040 if err == nil { 1041 err = base.ErrNotFound 1042 } 1043 return nil, err 1044 } 1045 return i.Value(), i.Close() 1046 } 1047 1048 // NewIter returns an iterator for the contents of the table. 
1049 func (r *Reader) NewIter(lower, upper []byte) Iterator { 1050 // NB: pebble.tableCache wraps the returned iterator with one which performs 1051 // reference counting on the Reader, preventing the Reader from being closed 1052 // until the final iterator closes. 1053 var i Iterator 1054 if r.Properties.IndexType == twoLevelIndex { 1055 i = twoLevelIterPool.Get().(*twoLevelIterator) 1056 } else { 1057 i = singleLevelIterPool.Get().(*singleLevelIterator) 1058 } 1059 _ = i.Init(r, lower, upper) 1060 return i 1061 } 1062 1063 // NewCompactionIter returns an iterator similar to NewIter but it also increments 1064 // the number of bytes iterated. 1065 func (r *Reader) NewCompactionIter(bytesIterated *uint64) Iterator { 1066 if r.Properties.IndexType == twoLevelIndex { 1067 i := twoLevelIterPool.Get().(*twoLevelIterator) 1068 _ = i.Init(r, nil /* lower */, nil /* upper */) 1069 return &twoLevelCompactionIterator{ 1070 twoLevelIterator: i, 1071 bytesIterated: bytesIterated, 1072 } 1073 } else { 1074 i := singleLevelIterPool.Get().(*singleLevelIterator) 1075 _ = i.Init(r, nil /* lower */, nil /* upper */) 1076 return &compactionIterator{ 1077 singleLevelIterator: i, 1078 bytesIterated: bytesIterated, 1079 } 1080 } 1081 } 1082 1083 // NewRangeDelIter returns an internal iterator for the contents of the 1084 // range-del block for the table. Returns nil if the table does not contain any 1085 // range deletions. 
func (r *Reader) NewRangeDelIter() *blockIter {
	if r.rangeDel.bh.Length == 0 {
		// The metaindex recorded no range-del block for this table.
		return nil
	}
	b, err := r.readRangeDel()
	if err != nil {
		// TODO(peter): propagate the error
		panic(err)
	}
	i := &blockIter{}
	if err := i.init(r.Compare, b, r.Properties.GlobalSeqNum); err != nil {
		// TODO(peter): propagate the error
		panic(err)
	}
	return i
}

// readIndex returns the (possibly weak-cached) top-level index block.
func (r *Reader) readIndex() (block, error) {
	return r.readWeakCachedBlock(&r.index, nil /* transform */)
}

// readFilter returns the (possibly weak-cached) filter block.
func (r *Reader) readFilter() (block, error) {
	return r.readWeakCachedBlock(&r.filter, nil /* transform */)
}

// readRangeDel returns the (possibly weak-cached) range-del block, applying
// the v1->v2 transform when r.rangeDelTransform is set.
func (r *Reader) readRangeDel() (block, error) {
	return r.readWeakCachedBlock(&r.rangeDel, r.rangeDelTransform)
}

// readWeakCachedBlock returns the block referenced by w: first via the weak
// cache handle (fast path), falling back to readBlock on a miss. w.mu guards
// the weak handle, so concurrent readers are safe; a racing slow-path read is
// harmless because the block cache deduplicates the insert.
func (r *Reader) readWeakCachedBlock(
	w *weakCachedBlock, transform blockTransform,
) (block, error) {
	// Fast-path for retrieving the block from a weak cache handle.
	w.mu.RLock()
	var b []byte
	if w.handle != nil {
		b = w.handle.Get()
	}
	w.mu.RUnlock()
	if b != nil {
		return b, nil
	}

	// Slow-path: read the index block from disk. This checks the cache again,
	// but that is ok because somebody else might have inserted it for us.
	h, err := r.readBlock(w.bh, transform)
	if err != nil {
		return nil, err
	}
	b = h.Get()
	if wh := h.Weak(); wh != nil {
		// Stash a weak handle so later reads can skip the cache lookup while
		// the block stays resident.
		w.mu.Lock()
		w.handle = wh
		w.mu.Unlock()
	}
	return b, err
}

// readBlock reads and decompresses a block from disk into memory.
1145 func (r *Reader) readBlock( 1146 bh BlockHandle, transform blockTransform, 1147 ) (cache.Handle, error) { 1148 if h := r.cache.Get(r.dbNum, r.fileNum, bh.Offset); h.Get() != nil { 1149 return h, nil 1150 } 1151 1152 b := r.cache.Alloc(int(bh.Length + blockTrailerLen)) 1153 if _, err := r.file.ReadAt(b, int64(bh.Offset)); err != nil { 1154 return cache.Handle{}, err 1155 } 1156 1157 checksum0 := binary.LittleEndian.Uint32(b[bh.Length+1:]) 1158 checksum1 := crc.New(b[:bh.Length+1]).Value() 1159 if checksum0 != checksum1 { 1160 return cache.Handle{}, errors.New("pebble/table: invalid table (checksum mismatch)") 1161 } 1162 1163 typ := b[bh.Length] 1164 b = b[:bh.Length] 1165 1166 switch typ { 1167 case noCompressionBlockType: 1168 break 1169 case snappyCompressionBlockType: 1170 decodedLen, err := snappy.DecodedLen(b) 1171 if err != nil { 1172 return cache.Handle{}, err 1173 } 1174 decoded := r.cache.Alloc(decodedLen) 1175 decoded, err = snappy.Decode(decoded, b) 1176 if err != nil { 1177 return cache.Handle{}, err 1178 } 1179 r.cache.Free(b) 1180 b = decoded 1181 default: 1182 return cache.Handle{}, fmt.Errorf("pebble/table: unknown block compression: %d", typ) 1183 } 1184 1185 if transform != nil { 1186 // Transforming blocks is rare, so we don't bother to use cache.Alloc. 1187 var err error 1188 b, err = transform(b) 1189 if err != nil { 1190 return cache.Handle{}, err 1191 } 1192 } 1193 1194 h := r.cache.Set(r.dbNum, r.fileNum, bh.Offset, b) 1195 return h, nil 1196 } 1197 1198 func (r *Reader) transformRangeDelV1(b []byte) ([]byte, error) { 1199 // Convert v1 (RocksDB format) range-del blocks to v2 blocks on the fly. The 1200 // v1 format range-del blocks have unfragmented and unsorted range 1201 // tombstones. We need properly fragmented and sorted range tombstones in 1202 // order to serve from them directly. 
1203 iter := &blockIter{} 1204 if err := iter.init(r.Compare, b, r.Properties.GlobalSeqNum); err != nil { 1205 return nil, err 1206 } 1207 var tombstones []rangedel.Tombstone 1208 for key, value := iter.First(); key != nil; key, value = iter.Next() { 1209 t := rangedel.Tombstone{ 1210 Start: *key, 1211 End: value, 1212 } 1213 tombstones = append(tombstones, t) 1214 } 1215 rangedel.Sort(r.Compare, tombstones) 1216 1217 // Fragment the tombstones, outputting them directly to a block writer. 1218 rangeDelBlock := blockWriter{ 1219 restartInterval: 1, 1220 } 1221 frag := rangedel.Fragmenter{ 1222 Cmp: r.Compare, 1223 Emit: func(fragmented []rangedel.Tombstone) { 1224 for i := range fragmented { 1225 t := &fragmented[i] 1226 rangeDelBlock.add(t.Start, t.End) 1227 } 1228 }, 1229 } 1230 for i := range tombstones { 1231 t := &tombstones[i] 1232 frag.Add(t.Start, t.End) 1233 } 1234 frag.Finish() 1235 1236 // Return the contents of the constructed v2 format range-del block. 1237 return rangeDelBlock.finish(), nil 1238 } 1239 1240 func (r *Reader) readMetaindex(metaindexBH BlockHandle, o *Options) error { 1241 b, err := r.readBlock(metaindexBH, nil /* transform */) 1242 if err != nil { 1243 return err 1244 } 1245 i, err := newRawBlockIter(bytes.Compare, b.Get()) 1246 b.Release() 1247 if err != nil { 1248 return err 1249 } 1250 1251 meta := map[string]BlockHandle{} 1252 for valid := i.First(); valid; valid = i.Next() { 1253 bh, n := decodeBlockHandle(i.Value()) 1254 if n == 0 { 1255 return errors.New("pebble/table: invalid table (bad filter block handle)") 1256 } 1257 meta[string(i.Key().UserKey)] = bh 1258 } 1259 if err := i.Close(); err != nil { 1260 return err 1261 } 1262 1263 if bh, ok := meta[metaPropertiesName]; ok { 1264 b, err = r.readBlock(bh, nil /* transform */) 1265 if err != nil { 1266 return err 1267 } 1268 data := b.Get() 1269 r.propertiesBH = bh 1270 err := r.Properties.load(data, bh.Offset) 1271 b.Release() 1272 if err != nil { 1273 return err 1274 } 1275 } 
1276 1277 if bh, ok := meta[metaRangeDelV2Name]; ok { 1278 r.rangeDel.bh = bh 1279 } else if bh, ok := meta[metaRangeDelName]; ok { 1280 r.rangeDel.bh = bh 1281 r.rangeDelTransform = r.transformRangeDelV1 1282 } 1283 1284 for name, fp := range r.opts.Filters { 1285 types := []struct { 1286 ftype FilterType 1287 prefix string 1288 }{ 1289 {TableFilter, "fullfilter."}, 1290 } 1291 var done bool 1292 for _, t := range types { 1293 if bh, ok := meta[t.prefix+name]; ok { 1294 r.filter.bh = bh 1295 1296 switch t.ftype { 1297 case TableFilter: 1298 r.tableFilter = newTableFilterReader(fp) 1299 default: 1300 return fmt.Errorf("unknown filter type: %v", t.ftype) 1301 } 1302 1303 done = true 1304 break 1305 } 1306 } 1307 if done { 1308 break 1309 } 1310 } 1311 return nil 1312 } 1313 1314 // Layout returns the layout (block organization) for an sstable. 1315 func (r *Reader) Layout() (*Layout, error) { 1316 if r.err != nil { 1317 return nil, r.err 1318 } 1319 1320 l := &Layout{ 1321 Data: make([]BlockHandle, 0, r.Properties.NumDataBlocks), 1322 Filter: r.filter.bh, 1323 RangeDel: r.rangeDel.bh, 1324 Properties: r.propertiesBH, 1325 MetaIndex: r.metaIndexBH, 1326 Footer: r.footerBH, 1327 } 1328 1329 index, err := r.readIndex() 1330 if err != nil { 1331 return nil, err 1332 } 1333 1334 if r.Properties.IndexPartitions == 0 { 1335 l.Index = append(l.Index, r.index.bh) 1336 iter, _ := newBlockIter(r.Compare, index) 1337 for key, value := iter.First(); key != nil; key, value = iter.Next() { 1338 dataBH, n := decodeBlockHandle(value) 1339 if n == 0 || n != len(value) { 1340 return nil, errors.New("pebble/table: corrupt index entry") 1341 } 1342 l.Data = append(l.Data, dataBH) 1343 } 1344 } else { 1345 l.TopIndex = r.index.bh 1346 topIter, _ := newBlockIter(r.Compare, index) 1347 for key, value := topIter.First(); key != nil; key, value = topIter.Next() { 1348 indexBH, n := decodeBlockHandle(value) 1349 if n == 0 || n != len(value) { 1350 return nil, errors.New("pebble/table: corrupt 
index entry") 1351 } 1352 l.Index = append(l.Index, indexBH) 1353 1354 subIndex, err := r.readBlock(indexBH, nil /* transform */) 1355 if err != nil { 1356 return nil, err 1357 } 1358 iter, _ := newBlockIter(r.Compare, subIndex.Get()) 1359 for key, value := iter.First(); key != nil; key, value = iter.Next() { 1360 dataBH, n := decodeBlockHandle(value) 1361 if n == 0 || n != len(value) { 1362 return nil, errors.New("pebble/table: corrupt index entry") 1363 } 1364 l.Data = append(l.Data, dataBH) 1365 } 1366 subIndex.Release() 1367 } 1368 } 1369 1370 return l, nil 1371 } 1372 1373 // NewReader returns a new table reader for the file. Closing the reader will 1374 // close the file. 1375 func NewReader( 1376 f vfs.File, dbNum, fileNum uint64, o *Options, extraOpts ...OpenOption, 1377 ) (*Reader, error) { 1378 o = o.EnsureDefaults() 1379 1380 r := &Reader{ 1381 file: f, 1382 dbNum: dbNum, 1383 fileNum: fileNum, 1384 opts: o, 1385 cache: o.Cache, 1386 } 1387 if f == nil { 1388 r.err = errors.New("pebble/table: nil file") 1389 return r, r.err 1390 } 1391 footer, err := readFooter(f) 1392 if err != nil { 1393 r.err = err 1394 return r, r.err 1395 } 1396 // Read the metaindex. 
1397 if err := r.readMetaindex(footer.metaindexBH, o); err != nil { 1398 r.err = err 1399 return r, r.err 1400 } 1401 r.index.bh = footer.indexBH 1402 r.metaIndexBH = footer.metaindexBH 1403 r.footerBH = footer.footerBH 1404 1405 if r.Properties.ComparerName == "" || o.Comparer.Name == r.Properties.ComparerName { 1406 r.Compare = o.Comparer.Compare 1407 r.split = o.Comparer.Split 1408 } 1409 1410 if o.Merger != nil && o.Merger.Name == r.Properties.MergerName { 1411 r.mergerOK = true 1412 } 1413 1414 for _, opt := range extraOpts { 1415 opt.Apply(r) 1416 } 1417 1418 if r.Compare == nil { 1419 r.err = fmt.Errorf("pebble/table: %d: unknown comparer %s", 1420 fileNum, r.Properties.ComparerName) 1421 } 1422 if !r.mergerOK { 1423 if name := r.Properties.MergerName; name != "" && name != "nullptr" { 1424 r.err = fmt.Errorf("pebble/table: %d: unknown merger %s", 1425 fileNum, r.Properties.MergerName) 1426 } 1427 } 1428 return r, r.err 1429 } 1430 1431 // Layout describes the block organization of an sstable. 1432 type Layout struct { 1433 Data []BlockHandle 1434 Index []BlockHandle 1435 TopIndex BlockHandle 1436 Filter BlockHandle 1437 RangeDel BlockHandle 1438 Properties BlockHandle 1439 MetaIndex BlockHandle 1440 Footer BlockHandle 1441 } 1442 1443 // Describe returns a description of the layout. If the verbose parameter is 1444 // true, details of the structure of each block are returned as well. 
1445 func (l *Layout) Describe( 1446 w io.Writer, 1447 verbose bool, 1448 r *Reader, 1449 fmtRecord func(key *base.InternalKey, value []byte), 1450 ) { 1451 type block struct { 1452 BlockHandle 1453 name string 1454 } 1455 var blocks []block 1456 1457 for i := range l.Data { 1458 blocks = append(blocks, block{l.Data[i], "data"}) 1459 } 1460 for i := range l.Index { 1461 blocks = append(blocks, block{l.Index[i], "index"}) 1462 } 1463 if l.TopIndex.Length != 0 { 1464 blocks = append(blocks, block{l.TopIndex, "top-index"}) 1465 } 1466 if l.Filter.Length != 0 { 1467 blocks = append(blocks, block{l.Filter, "filter"}) 1468 } 1469 if l.RangeDel.Length != 0 { 1470 blocks = append(blocks, block{l.RangeDel, "range-del"}) 1471 } 1472 if l.Properties.Length != 0 { 1473 blocks = append(blocks, block{l.Properties, "properties"}) 1474 } 1475 if l.MetaIndex.Length != 0 { 1476 blocks = append(blocks, block{l.MetaIndex, "meta-index"}) 1477 } 1478 if l.Footer.Length != 0 { 1479 if l.Footer.Length == levelDBFooterLen { 1480 blocks = append(blocks, block{l.Footer, "leveldb-footer"}) 1481 } else { 1482 blocks = append(blocks, block{l.Footer, "footer"}) 1483 } 1484 } 1485 1486 sort.Slice(blocks, func(i, j int) bool { 1487 return blocks[i].Offset < blocks[j].Offset 1488 }) 1489 1490 for i := range blocks { 1491 b := &blocks[i] 1492 fmt.Fprintf(w, "%10d %s (%d)\n", b.Offset, b.name, b.Length) 1493 1494 if !verbose { 1495 continue 1496 } 1497 if b.name == "footer" || b.name == "leveldb-footer" || b.name == "filter" { 1498 continue 1499 } 1500 1501 h, err := r.readBlock(b.BlockHandle, nil /* transform */) 1502 if err != nil { 1503 fmt.Fprintf(w, " [err: %s]\n", err) 1504 continue 1505 } 1506 1507 getRestart := func(data []byte, restarts, i int32) int32 { 1508 return int32(binary.LittleEndian.Uint32(data[restarts+4*i:])) 1509 } 1510 1511 formatIsRestart := func(data []byte, restarts, numRestarts, offset int32) { 1512 i := sort.Search(int(numRestarts), func(i int) bool { 1513 return 
getRestart(data, restarts, int32(i)) >= offset 1514 }) 1515 if i < int(numRestarts) && getRestart(data, restarts, int32(i)) == offset { 1516 fmt.Fprintf(w, " [restart]\n") 1517 } else { 1518 fmt.Fprintf(w, "\n") 1519 } 1520 } 1521 1522 formatRestarts := func(data []byte, restarts, numRestarts int32) { 1523 for i := int32(0); i < numRestarts; i++ { 1524 offset := getRestart(data, restarts, i) 1525 fmt.Fprintf(w, "%10d [restart %d]\n", 1526 b.Offset+uint64(restarts+4*i), b.Offset+uint64(offset)) 1527 } 1528 } 1529 1530 var lastKey InternalKey 1531 switch b.name { 1532 case "data", "range-del": 1533 iter, _ := newBlockIter(r.Compare, h.Get()) 1534 for key, value := iter.First(); key != nil; key, value = iter.Next() { 1535 ptr := unsafe.Pointer(uintptr(iter.ptr) + uintptr(iter.offset)) 1536 shared, ptr := decodeVarint(ptr) 1537 unshared, ptr := decodeVarint(ptr) 1538 value2, _ := decodeVarint(ptr) 1539 1540 total := iter.nextOffset - iter.offset 1541 // The format of the numbers in the record line is: 1542 // 1543 // (<total> = <length> [<shared>] + <unshared> + <value>) 1544 // 1545 // <total> is the total number of bytes for the record. 1546 // <length> is the size of the 3 varint encoded integers for <shared>, 1547 // <unshared>, and <value>. 1548 // <shared> is the number of key bytes shared with the previous key. 1549 // <unshared> is the number of unshared key bytes. 1550 // <value> is the number of value bytes. 1551 fmt.Fprintf(w, "%10d record (%d = %d [%d] + %d + %d)", 1552 b.Offset+uint64(iter.offset), total, 1553 total-int32(unshared+value2), shared, unshared, value2) 1554 formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset) 1555 if fmtRecord != nil { 1556 fmt.Fprintf(w, " ") 1557 fmtRecord(key, value) 1558 } 1559 1560 if base.InternalCompare(r.Compare, lastKey, *key) >= 0 { 1561 fmt.Fprintf(w, " WARNING: OUT OF ORDER KEYS!\n") 1562 } 1563 lastKey.Trailer = key.Trailer 1564 lastKey.UserKey = append(lastKey.UserKey[:0], key.UserKey...) 
1565 } 1566 formatRestarts(iter.data, iter.restarts, iter.numRestarts) 1567 case "index", "top-index": 1568 iter, _ := newBlockIter(r.Compare, h.Get()) 1569 for key, value := iter.First(); key != nil; key, value = iter.Next() { 1570 bh, n := decodeBlockHandle(value) 1571 if n == 0 || n != len(value) { 1572 fmt.Fprintf(w, "%10d [err: %s]\n", b.Offset+uint64(iter.offset), err) 1573 continue 1574 } 1575 fmt.Fprintf(w, "%10d block:%d/%d", 1576 b.Offset+uint64(iter.offset), bh.Offset, bh.Length) 1577 formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset) 1578 } 1579 formatRestarts(iter.data, iter.restarts, iter.numRestarts) 1580 case "properties": 1581 iter, _ := newRawBlockIter(r.Compare, h.Get()) 1582 for valid := iter.First(); valid; valid = iter.Next() { 1583 fmt.Fprintf(w, "%10d %s (%d)", 1584 b.Offset+uint64(iter.offset), iter.Key().UserKey, iter.nextOffset-iter.offset) 1585 formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset) 1586 } 1587 formatRestarts(iter.data, iter.restarts, iter.numRestarts) 1588 case "meta-index": 1589 iter, _ := newRawBlockIter(r.Compare, h.Get()) 1590 for valid := iter.First(); valid; valid = iter.Next() { 1591 value := iter.Value() 1592 bh, n := decodeBlockHandle(value) 1593 if n == 0 || n != len(value) { 1594 fmt.Fprintf(w, "%10d [err: %s]\n", b.Offset+uint64(iter.offset), err) 1595 continue 1596 } 1597 1598 fmt.Fprintf(w, "%10d %s block:%d/%d", 1599 b.Offset+uint64(iter.offset), iter.Key().UserKey, 1600 bh.Offset, bh.Length) 1601 formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset) 1602 } 1603 formatRestarts(iter.data, iter.restarts, iter.numRestarts) 1604 } 1605 1606 h.Release() 1607 } 1608 }