github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/sstable/reader.go

     1  // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package sstable
     6  
     7  import (
     8  	"bytes"
     9  	"encoding/binary"
    10  	"errors"
    11  	"fmt"
    12  	"io"
    13  	"sort"
    14  	"sync"
    15  	"unsafe"
    16  
    17  	"github.com/golang/snappy"
    18  	"github.com/petermattis/pebble/cache"
    19  	"github.com/petermattis/pebble/internal/base"
    20  	"github.com/petermattis/pebble/internal/crc"
    21  	"github.com/petermattis/pebble/internal/rangedel"
    22  	"github.com/petermattis/pebble/vfs"
    23  )
    24  
    25  // BlockHandle is the file offset and length of a block.
    26  type BlockHandle struct {
    27  	Offset, Length uint64
    28  }
    29  
    30  // decodeBlockHandle returns the block handle encoded at the start of src, as
    31  // well as the number of bytes it occupies. It returns zero if given invalid
    32  // input.
    33  func decodeBlockHandle(src []byte) (BlockHandle, int) {
    34  	offset, n := binary.Uvarint(src)
    35  	length, m := binary.Uvarint(src[n:])
    36  	if n == 0 || m == 0 {
    37  		return BlockHandle{}, 0
    38  	}
    39  	return BlockHandle{offset, length}, n + m
    40  }
    41  
    42  func encodeBlockHandle(dst []byte, b BlockHandle) int {
    43  	n := binary.PutUvarint(dst, b.Offset)
    44  	m := binary.PutUvarint(dst[n:], b.Length)
    45  	return n + m
    46  }
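
        // A minimal sketch (the function name is illustrative, not part of the
        // API) showing how a BlockHandle round-trips through its varint
        // encoding. Each field needs at most binary.MaxVarintLen64 bytes, so a
        // buffer of 2*binary.MaxVarintLen64 bytes is always large enough.
        func exampleBlockHandleRoundTrip() {
        	var buf [2 * binary.MaxVarintLen64]byte
        	bh := BlockHandle{Offset: 1234, Length: 4096}
        	n := encodeBlockHandle(buf[:], bh)
        	decoded, m := decodeBlockHandle(buf[:n])
        	// decoded == bh and m == n when the encoding is well-formed.
        	fmt.Printf("encoded %d bytes, decoded %+v (read %d bytes)\n", n, decoded, m)
        }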
    47  
    48  // block is a []byte that holds a sequence of key/value pairs plus an index
    49  // over those pairs.
    50  type block []byte
    51  
    52  // Iterator iterates over an entire table of data.
    53  type Iterator interface {
    54  	base.InternalIterator
    55  
    56  	Init(r *Reader, lower, upper []byte) error
    57  	SetCloseHook(fn func(i Iterator) error)
    58  }
    59  
    60  // singleLevelIterator iterates over an entire table of data. To seek for a given
    61  // key, it first looks in the index for the block that contains that key, and then
    62  // looks inside that block.
    63  type singleLevelIterator struct {
    64  	cmp Compare
    65  	// Global lower/upper bound for the iterator.
    66  	lower []byte
    67  	upper []byte
    68  	// Per-block lower/upper bound. Nil if the bound does not apply to the block
    69  	// because we determined the block lies completely within the bound.
    70  	blockLower []byte
    71  	blockUpper []byte
    72  	reader     *Reader
    73  	index      blockIter
    74  	data       blockIter
    75  	dataBH     BlockHandle
    76  	err        error
    77  	closeHook  func(i Iterator) error
    78  }
    79  
    80  var singleLevelIterPool = sync.Pool{
    81  	New: func() interface{} {
    82  		return &singleLevelIterator{}
    83  	},
    84  }
    85  
    86  var twoLevelIterPool = sync.Pool{
    87  	New: func() interface{} {
    88  		return &twoLevelIterator{}
    89  	},
    90  }
    91  
    92  // Init initializes a singleLevelIterator for reading from the table. It is
     93  // synonymous with Reader.NewIter, but allows for reusing the iterator
    94  // between different Readers.
    95  func (i *singleLevelIterator) Init(r *Reader, lower, upper []byte) error {
    96  	*i = singleLevelIterator{
    97  		lower:  lower,
    98  		upper:  upper,
    99  		reader: r,
   100  		err:    r.err,
   101  	}
   102  	if i.err == nil {
   103  		var index block
   104  		index, i.err = r.readIndex()
   105  		if i.err != nil {
   106  			return i.err
   107  		}
   108  		i.cmp = r.Compare
   109  		i.err = i.index.init(i.cmp, index, r.Properties.GlobalSeqNum)
   110  	}
   111  	return i.err
   112  }
   113  
   114  func (i *singleLevelIterator) initBounds() {
   115  	if i.lower == nil && i.upper == nil {
   116  		return
   117  	}
   118  
   119  	// Trim the iteration bounds for the current block. We don't have to check
   120  	// the bounds on each iteration if the block is entirely contained within the
   121  	// iteration bounds.
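         	// For example (illustrative), if the index separator key for this
         	// block is "f" and i.upper is "m", then every key in the block is
         	// < "m", so blockUpper is cleared and forward iteration within the
         	// block never needs to compare against the upper bound.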
   122  	i.blockLower = i.lower
   123  	if i.blockLower != nil {
   124  		key, _ := i.data.First()
   125  		if key != nil && i.cmp(i.blockLower, key.UserKey) < 0 {
   126  			// The lower-bound is less than the first key in the block. No need
   127  			// to check the lower-bound again for this block.
   128  			i.blockLower = nil
   129  		}
   130  	}
   131  	i.blockUpper = i.upper
   132  	if i.blockUpper != nil && i.cmp(i.blockUpper, i.index.Key().UserKey) > 0 {
   133  		// The upper-bound is greater than the index key which itself is greater
   134  		// than or equal to every key in the block. No need to check the
   135  		// upper-bound again for this block.
   136  		i.blockUpper = nil
   137  	}
   138  }
   139  
   140  // loadBlock loads the block at the current index position and leaves i.data
   141  // unpositioned. If unsuccessful, it sets i.err to any error encountered, which
   142  // may be nil if we have simply exhausted the entire table.
   143  func (i *singleLevelIterator) loadBlock() bool {
   144  	if !i.index.Valid() {
   145  		i.err = i.index.err
   146  		// TODO(peter): Need to test that seeking to a key outside of the sstable
   147  		// invalidates the iterator.
   148  		i.data.offset = 0
   149  		i.data.restarts = 0
   150  		return false
   151  	}
   152  	// Load the next block.
   153  	v := i.index.Value()
   154  	var n int
   155  	i.dataBH, n = decodeBlockHandle(v)
   156  	if n == 0 || n != len(v) {
   157  		i.err = errors.New("pebble/table: corrupt index entry")
   158  		return false
   159  	}
   160  	block, err := i.reader.readBlock(i.dataBH, nil /* transform */)
   161  	if err != nil {
   162  		i.err = err
   163  		return false
   164  	}
   165  	i.data.setCacheHandle(block)
   166  	i.err = i.data.init(i.cmp, block.Get(), i.reader.Properties.GlobalSeqNum)
   167  	if i.err != nil {
   168  		return false
   169  	}
   170  	i.initBounds()
   171  	return true
   172  }
   173  
   174  // seekBlock loads the block at the current index position and positions i.data
   175  // at the first key in that block which is >= the given key. If unsuccessful,
   176  // it sets i.err to any error encountered, which may be nil if we have simply
   177  // exhausted the entire table.
   178  func (i *singleLevelIterator) seekBlock(key []byte) bool {
   179  	if !i.index.Valid() {
   180  		i.err = i.index.err
   181  		return false
   182  	}
   183  	// Load the next block.
   184  	v := i.index.Value()
   185  	h, n := decodeBlockHandle(v)
   186  	if n == 0 || n != len(v) {
   187  		i.err = errors.New("pebble/table: corrupt index entry")
   188  		return false
   189  	}
   190  	block, err := i.reader.readBlock(h, nil /* transform */)
   191  	if err != nil {
   192  		i.err = err
   193  		return false
   194  	}
   195  	i.data.setCacheHandle(block)
   196  	i.err = i.data.init(i.cmp, block.Get(), i.reader.Properties.GlobalSeqNum)
   197  	if i.err != nil {
   198  		return false
   199  	}
   200  	// Look for the key inside that block.
   201  	i.initBounds()
   202  	i.data.SeekGE(key)
   203  	return true
   204  }
   205  
   206  // SeekGE implements internalIterator.SeekGE, as documented in the pebble
   207  // package. Note that SeekGE only checks the upper bound. It is up to the
   208  // caller to ensure that key is greater than or equal to the lower bound.
   209  func (i *singleLevelIterator) SeekGE(key []byte) (*InternalKey, []byte) {
   210  	if i.err != nil {
   211  		return nil, nil
   212  	}
   213  
   214  	if ikey, _ := i.index.SeekGE(key); ikey == nil {
   215  		return nil, nil
   216  	}
   217  	if !i.loadBlock() {
   218  		return nil, nil
   219  	}
   220  	ikey, val := i.data.SeekGE(key)
   221  	if ikey == nil {
   222  		return nil, nil
   223  	}
   224  	if i.blockUpper != nil && i.cmp(ikey.UserKey, i.blockUpper) >= 0 {
   225  		i.data.invalidateUpper() // force i.data.Valid() to return false
   226  		return nil, nil
   227  	}
   228  	return ikey, val
   229  }
   230  
   231  // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the
   232  // pebble package. Note that SeekPrefixGE only checks the upper bound. It is up
   233  // to the caller to ensure that key is greater than or equal to the lower bound.
   234  func (i *singleLevelIterator) SeekPrefixGE(prefix, key []byte) (*InternalKey, []byte) {
   235  	if i.err != nil {
   236  		return nil, nil
   237  	}
   238  
   239  	// Check prefix bloom filter.
   240  	if i.reader.tableFilter != nil {
   241  		data, err := i.reader.readFilter()
   242  		if err != nil {
   243  			return nil, nil
   244  		}
   245  		if !i.reader.tableFilter.mayContain(data, prefix) {
   246  			i.data.invalidateUpper() // force i.data.Valid() to return false
   247  			return nil, nil
   248  		}
   249  	}
   250  
   251  	if ikey, _ := i.index.SeekGE(key); ikey == nil {
   252  		return nil, nil
   253  	}
   254  	if !i.loadBlock() {
   255  		return nil, nil
   256  	}
   257  	ikey, val := i.data.SeekGE(key)
   258  	if ikey == nil {
   259  		return nil, nil
   260  	}
   261  	if i.blockUpper != nil && i.cmp(ikey.UserKey, i.blockUpper) >= 0 {
   262  		i.data.invalidateUpper() // force i.data.Valid() to return false
   263  		return nil, nil
   264  	}
   265  	return ikey, val
   266  }
   267  
   268  // SeekLT implements internalIterator.SeekLT, as documented in the pebble
   269  // package. Note that SeekLT only checks the lower bound. It is up to the
   270  // caller to ensure that key is less than the upper bound.
   271  func (i *singleLevelIterator) SeekLT(key []byte) (*InternalKey, []byte) {
   272  	if i.err != nil {
   273  		return nil, nil
   274  	}
   275  
   276  	if ikey, _ := i.index.SeekGE(key); ikey == nil {
   277  		i.index.Last()
   278  	}
   279  	if !i.loadBlock() {
   280  		return nil, nil
   281  	}
   282  	ikey, val := i.data.SeekLT(key)
   283  	if ikey == nil {
   284  		// The index contains separator keys which may lie between
   285  		// user-keys. Consider the user-keys:
   286  		//
   287  		//   complete
   288  		// ---- new block ---
   289  		//   complexion
   290  		//
   291  		// If these two keys end one block and start the next, the index key may
   292  		// be chosen as "compleu". The SeekGE in the index block will then point
   293  		// us to the block containing "complexion". If this happens, we want the
   294  		// last key from the previous data block.
   295  		if ikey, _ = i.index.Prev(); ikey == nil {
   296  			return nil, nil
   297  		}
   298  		if !i.loadBlock() {
   299  			return nil, nil
   300  		}
   301  		if ikey, val = i.data.Last(); ikey == nil {
   302  			return nil, nil
   303  		}
   304  	}
   305  	if i.blockLower != nil && i.cmp(ikey.UserKey, i.blockLower) < 0 {
   306  		i.data.invalidateLower() // force i.data.Valid() to return false
   307  		return nil, nil
   308  	}
   309  	return ikey, val
   310  }
   311  
   312  // First implements internalIterator.First, as documented in the pebble
   313  // package. Note that First only checks the upper bound. It is up to the caller
   314  // to ensure that key is greater than or equal to the lower bound (e.g. via a
   315  // call to SeekGE(lower)).
   316  func (i *singleLevelIterator) First() (*InternalKey, []byte) {
   317  	if i.err != nil {
   318  		return nil, nil
   319  	}
   320  
   321  	if ikey, _ := i.index.First(); ikey == nil {
   322  		return nil, nil
   323  	}
   324  	if !i.loadBlock() {
   325  		return nil, nil
   326  	}
   327  	ikey, val := i.data.First()
   328  	if ikey == nil {
   329  		return nil, nil
   330  	}
   331  	if i.blockUpper != nil && i.cmp(ikey.UserKey, i.blockUpper) >= 0 {
   332  		i.data.invalidateUpper() // force i.data.Valid() to return false
   333  		return nil, nil
   334  	}
   335  	return ikey, val
   336  }
   337  
   338  // Last implements internalIterator.Last, as documented in the pebble
   339  // package. Note that Last only checks the lower bound. It is up to the caller
   340  // to ensure that key is less than the upper bound (e.g. via a call to
   341  // SeekLT(upper))
   342  func (i *singleLevelIterator) Last() (*InternalKey, []byte) {
   343  	if i.err != nil {
   344  		return nil, nil
   345  	}
   346  
   347  	if ikey, _ := i.index.Last(); ikey == nil {
   348  		return nil, nil
   349  	}
   350  	if !i.loadBlock() {
   351  		return nil, nil
   352  	}
   353  	if ikey, _ := i.data.Last(); ikey == nil {
   354  		return nil, nil
   355  	}
   356  	if i.blockLower != nil && i.cmp(i.data.ikey.UserKey, i.blockLower) < 0 {
   357  		i.data.invalidateLower()
   358  		return nil, nil
   359  	}
   360  	return &i.data.ikey, i.data.val
   361  }
   362  
   363  // Next implements internalIterator.Next, as documented in the pebble
   364  // package.
   365  // Note: compactionIterator.Next mirrors the implementation of Iterator.Next
   366  // due to performance. Keep the two in sync.
   367  func (i *singleLevelIterator) Next() (*InternalKey, []byte) {
   368  	if i.err != nil {
   369  		return nil, nil
   370  	}
   371  	if key, val := i.data.Next(); key != nil {
   372  		if i.blockUpper != nil && i.cmp(key.UserKey, i.blockUpper) >= 0 {
   373  			i.data.invalidateUpper()
   374  			return nil, nil
   375  		}
   376  		return key, val
   377  	}
   378  	for {
   379  		if i.data.err != nil {
   380  			i.err = i.data.err
   381  			break
   382  		}
   383  		if key, _ := i.index.Next(); key == nil {
   384  			break
   385  		}
   386  		if i.loadBlock() {
   387  			key, val := i.data.First()
   388  			if key == nil {
   389  				return nil, nil
   390  			}
   391  			if i.blockUpper != nil && i.cmp(key.UserKey, i.blockUpper) >= 0 {
   392  				i.data.invalidateUpper()
   393  				return nil, nil
   394  			}
   395  			return key, val
   396  		}
   397  	}
   398  	return nil, nil
   399  }
   400  
   401  // Prev implements internalIterator.Prev, as documented in the pebble
   402  // package.
   403  func (i *singleLevelIterator) Prev() (*InternalKey, []byte) {
   404  	if i.err != nil {
   405  		return nil, nil
   406  	}
   407  	if key, val := i.data.Prev(); key != nil {
   408  		if i.blockLower != nil && i.cmp(key.UserKey, i.blockLower) < 0 {
   409  			i.data.invalidateLower()
   410  			return nil, nil
   411  		}
   412  		return key, val
   413  	}
   414  	for {
   415  		if i.data.err != nil {
   416  			i.err = i.data.err
   417  			break
   418  		}
   419  		if key, _ := i.index.Prev(); key == nil {
   420  			break
   421  		}
   422  		if i.loadBlock() {
   423  			key, val := i.data.Last()
   424  			if key == nil {
   425  				return nil, nil
   426  			}
   427  			if i.blockLower != nil && i.cmp(key.UserKey, i.blockLower) < 0 {
   428  				i.data.invalidateLower()
   429  				return nil, nil
   430  			}
   431  			return key, val
   432  		}
   433  	}
   434  	return nil, nil
   435  }
   436  
   437  // Key implements internalIterator.Key, as documented in the pebble package.
   438  func (i *singleLevelIterator) Key() *InternalKey {
   439  	return i.data.Key()
   440  }
   441  
   442  // Value implements internalIterator.Value, as documented in the pebble
   443  // package.
   444  func (i *singleLevelIterator) Value() []byte {
   445  	return i.data.Value()
   446  }
   447  
   448  // Valid implements internalIterator.Valid, as documented in the pebble
   449  // package.
   450  func (i *singleLevelIterator) Valid() bool {
   451  	return i.data.Valid()
   452  }
   453  
   454  // Error implements internalIterator.Error, as documented in the pebble
   455  // package.
   456  func (i *singleLevelIterator) Error() error {
   457  	if err := i.data.Error(); err != nil {
   458  		return err
   459  	}
   460  	return i.err
   461  }
   462  
   463  // SetCloseHook sets a function that will be called when the iterator is
   464  // closed.
   465  func (i *singleLevelIterator) SetCloseHook(fn func(i Iterator) error) {
   466  	i.closeHook = fn
   467  }
   468  
   469  // Close implements internalIterator.Close, as documented in the pebble
   470  // package.
   471  func (i *singleLevelIterator) Close() error {
   472  	if i.closeHook != nil {
   473  		if err := i.closeHook(i); err != nil {
   474  			return err
   475  		}
   476  	}
   477  	if err := i.data.Close(); err != nil {
   478  		return err
   479  	}
   480  	err := i.err
   481  	*i = singleLevelIterator{}
   482  	singleLevelIterPool.Put(i)
   483  	return err
   484  }
   485  
   486  // SetBounds implements internalIterator.SetBounds, as documented in the pebble
   487  // package.
   488  func (i *singleLevelIterator) SetBounds(lower, upper []byte) {
   489  	i.lower = lower
   490  	i.upper = upper
   491  }
   492  
   493  // compactionIterator is similar to Iterator but it increments the number of
   494  // bytes that have been iterated through.
   495  type compactionIterator struct {
   496  	*singleLevelIterator
   497  	bytesIterated *uint64
   498  	prevOffset    uint64
   499  }
   500  
   501  func (i *compactionIterator) SeekGE(key []byte) (*InternalKey, []byte) {
   502  	panic("pebble: SeekGE unimplemented")
   503  }
   504  
   505  func (i *compactionIterator) SeekPrefixGE(prefix, key []byte) (*InternalKey, []byte) {
   506  	panic("pebble: SeekPrefixGE unimplemented")
   507  }
   508  
   509  func (i *compactionIterator) SeekLT(key []byte) (*InternalKey, []byte) {
   510  	panic("pebble: SeekLT unimplemented")
   511  }
   512  
   513  func (i *compactionIterator) First() (*InternalKey, []byte) {
   514  	key, val := i.singleLevelIterator.First()
   515  	if key == nil {
   516  		// An empty sstable will still encode the block trailer and restart points, so bytes
   517  		// iterated must be incremented.
   518  
   519  		// We must use i.dataBH.Length instead of (4*(i.data.numRestarts+1)) to calculate the
   520  		// number of bytes for the restart points, since i.dataBH.Length accounts for
   521  		// compression. When uncompressed, i.dataBH.Length == (4*(i.data.numRestarts+1))
   522  		*i.bytesIterated += blockTrailerLen + i.dataBH.Length
   523  		return nil, nil
   524  	}
    525  	// If the first block contains only a single entry, we are at the last entry in the
    526  	// block and we must increment bytes iterated by the size of the block trailer and restart points.
   527  	if i.data.nextOffset+(4*(i.data.numRestarts+1)) == int32(len(i.data.data)) {
   528  		i.prevOffset = blockTrailerLen + i.dataBH.Length
   529  	} else {
   530  		// i.dataBH.Length/len(i.data.data) is the compression ratio. If uncompressed, this is 1.
   531  		// i.data.nextOffset is the uncompressed size of the first record.
   532  		i.prevOffset = (uint64(i.data.nextOffset) * i.dataBH.Length) / uint64(len(i.data.data))
   533  	}
   534  	*i.bytesIterated += i.prevOffset
   535  	return key, val
   536  }
   537  
   538  func (i *compactionIterator) Last() (*InternalKey, []byte) {
   539  	panic("pebble: Last unimplemented")
   540  }
   541  
   542  // Note: compactionIterator.Next mirrors the implementation of Iterator.Next
   543  // due to performance. Keep the two in sync.
   544  func (i *compactionIterator) Next() (*InternalKey, []byte) {
   545  	if i.err != nil {
   546  		return nil, nil
   547  	}
   548  	key, val := i.data.Next()
   549  	if key == nil {
   550  		for {
   551  			if i.data.err != nil {
   552  				i.err = i.data.err
   553  				return nil, nil
   554  			}
   555  			if key, _ := i.index.Next(); key == nil {
   556  				return nil, nil
   557  			}
   558  			if i.loadBlock() {
   559  				key, val = i.data.First()
   560  				if key == nil {
   561  					return nil, nil
   562  				}
   563  				break
   564  			}
   565  		}
   566  	}
   567  
   568  	// i.dataBH.Length/len(i.data.data) is the compression ratio. If uncompressed, this is 1.
   569  	// i.data.nextOffset is the uncompressed position of the current record in the block.
   570  	// i.dataBH.Offset is the offset of the block in the sstable before decompression.
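         	// For example (illustrative numbers): if the block is 4096 bytes
         	// uncompressed (len(i.data.data)) but was stored as 1024 bytes on disk
         	// (i.dataBH.Length), a record ending at nextOffset == 2048 is charged
         	// 2048*1024/4096 == 512 on-disk bytes from the start of the block.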
   571  	recordOffset := (uint64(i.data.nextOffset) * i.dataBH.Length) / uint64(len(i.data.data))
   572  	curOffset := i.dataBH.Offset + recordOffset
   573  	// Last entry in the block must increment bytes iterated by the size of the block trailer
   574  	// and restart points.
   575  	if i.data.nextOffset+(4*(i.data.numRestarts+1)) == int32(len(i.data.data)) {
   576  		curOffset = i.dataBH.Offset + i.dataBH.Length + blockTrailerLen
   577  	}
   578  	*i.bytesIterated += uint64(curOffset - i.prevOffset)
   579  	i.prevOffset = curOffset
   580  	return key, val
   581  }
   582  
   583  func (i *compactionIterator) Prev() (*InternalKey, []byte) {
   584  	panic("pebble: Prev unimplemented")
   585  }
   586  
   587  type twoLevelIterator struct {
   588  	singleLevelIterator
   589  	topLevelIndex blockIter
   590  }
   591  
   592  // loadIndex loads the index block at the current top level index position and
    593  // leaves i.index unpositioned. If unsuccessful, it sets i.err to any error
   594  // encountered, which may be nil if we have simply exhausted the entire table.
   595  // This is used for two level indexes.
   596  func (i *twoLevelIterator) loadIndex() bool {
   597  	if !i.topLevelIndex.Valid() {
   598  		i.err = i.topLevelIndex.err
   599  		i.index.offset = 0
   600  		i.index.restarts = 0
   601  		return false
   602  	}
   603  	h, n := decodeBlockHandle(i.topLevelIndex.Value())
   604  	if n == 0 || n != len(i.topLevelIndex.Value()) {
   605  		i.err = errors.New("pebble/table: corrupt top level index entry")
   606  		return false
   607  	}
   608  	indexBlock, err := i.reader.readBlock(h, nil /* transform */)
   609  	if err != nil {
   610  		i.err = err
   611  		return false
   612  	}
   613  	i.index.setCacheHandle(indexBlock)
   614  	i.err = i.index.init(i.cmp, indexBlock.Get(), i.reader.Properties.GlobalSeqNum)
   615  	if i.err != nil {
   616  		return false
   617  	}
   618  	return true
   619  }
   620  
   621  func (i *twoLevelIterator) Init(r *Reader, lower, upper []byte) error {
   622  	*i = twoLevelIterator{
   623  		singleLevelIterator: singleLevelIterator{
   624  			lower:  lower,
   625  			upper:  upper,
   626  			reader: r,
   627  			err:    r.err,
   628  		},
   629  	}
   630  	if i.err == nil {
   631  		topLevelIndex, err := r.readIndex()
    632  		if err != nil {
   633  			i.err = err
   634  			return i.err
   635  		}
   636  		i.cmp = r.Compare
   637  		i.err = i.topLevelIndex.init(i.cmp, topLevelIndex, r.Properties.GlobalSeqNum)
   638  	}
   639  	return i.err
   640  }
   641  
   642  // SeekGE implements internalIterator.SeekGE, as documented in the pebble
   643  // package. Note that SeekGE only checks the upper bound. It is up to the
   644  // caller to ensure that key is greater than or equal to the lower bound.
   645  func (i *twoLevelIterator) SeekGE(key []byte) (*InternalKey, []byte) {
   646  	if i.err != nil {
   647  		return nil, nil
   648  	}
   649  
   650  	if ikey, _ := i.topLevelIndex.SeekGE(key); ikey == nil {
   651  		return nil, nil
   652  	}
   653  
   654  	if !i.loadIndex() {
   655  		return nil, nil
   656  	}
   657  
   658  	return i.singleLevelIterator.SeekGE(key)
   659  }
   660  
   661  // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the
   662  // pebble package. Note that SeekPrefixGE only checks the upper bound. It is up
   663  // to the caller to ensure that key is greater than or equal to the lower bound.
   664  func (i *twoLevelIterator) SeekPrefixGE(prefix, key []byte) (*InternalKey, []byte) {
   665  	if i.err != nil {
   666  		return nil, nil
   667  	}
   668  
   669  	if ikey, _ := i.topLevelIndex.SeekGE(key); ikey == nil {
   670  		return nil, nil
   671  	}
   672  
   673  	if !i.loadIndex() {
   674  		return nil, nil
   675  	}
   676  
   677  	return i.singleLevelIterator.SeekPrefixGE(prefix, key)
   678  }
   679  
   680  // SeekLT implements internalIterator.SeekLT, as documented in the pebble
   681  // package. Note that SeekLT only checks the lower bound. It is up to the
   682  // caller to ensure that key is less than the upper bound.
   683  func (i *twoLevelIterator) SeekLT(key []byte) (*InternalKey, []byte) {
   684  	if i.err != nil {
   685  		return nil, nil
   686  	}
   687  
   688  	if ikey, _ := i.topLevelIndex.SeekGE(key); ikey == nil {
   689  		if ikey, _ := i.topLevelIndex.Last(); ikey == nil {
   690  			return nil, nil
   691  		}
   692  
   693  		if !i.loadIndex() {
   694  			return nil, nil
   695  		}
   696  
   697  		return i.singleLevelIterator.Last()
   698  	}
   699  
   700  	if !i.loadIndex() {
   701  		return nil, nil
   702  	}
   703  
   704  	ikey, val := i.singleLevelIterator.SeekLT(key)
   705  	if ikey == nil {
   706  		if ikey, val = i.topLevelIndex.Prev(); ikey == nil {
   707  			return nil, nil
   708  		}
   709  		if !i.loadIndex() {
   710  			return nil, nil
   711  		}
   712  		if ikey, val = i.singleLevelIterator.Last(); ikey == nil {
   713  			return nil, nil
   714  		}
   715  	}
   716  
   717  	return ikey, val
   718  }
   719  
   720  // First implements internalIterator.First, as documented in the pebble
   721  // package. Note that First only checks the upper bound. It is up to the caller
   722  // to ensure that key is greater than or equal to the lower bound (e.g. via a
   723  // call to SeekGE(lower)).
   724  func (i *twoLevelIterator) First() (*InternalKey, []byte) {
   725  	if i.err != nil {
   726  		return nil, nil
   727  	}
   728  
   729  	if ikey, _ := i.topLevelIndex.First(); ikey == nil {
   730  		return nil, nil
   731  	}
   732  
   733  	if !i.loadIndex() {
   734  		return nil, nil
   735  	}
   736  
   737  	return i.singleLevelIterator.First()
   738  }
   739  
   740  // Last implements internalIterator.Last, as documented in the pebble
   741  // package. Note that Last only checks the lower bound. It is up to the caller
   742  // to ensure that key is less than the upper bound (e.g. via a call to
   743  // SeekLT(upper))
   744  func (i *twoLevelIterator) Last() (*InternalKey, []byte) {
   745  	if i.err != nil {
   746  		return nil, nil
   747  	}
   748  
   749  	if ikey, _ := i.topLevelIndex.Last(); ikey == nil {
   750  		return nil, nil
   751  	}
   752  
   753  	if !i.loadIndex() {
   754  		return nil, nil
   755  	}
   756  
   757  	return i.singleLevelIterator.Last()
   758  }
   759  
   760  // Next implements internalIterator.Next, as documented in the pebble
   761  // package.
   762  // Note: twoLevelCompactionIterator.Next mirrors the implementation of
   763  // twoLevelIterator.Next due to performance. Keep the two in sync.
   764  func (i *twoLevelIterator) Next() (*InternalKey, []byte) {
   765  	if i.err != nil {
   766  		return nil, nil
   767  	}
   768  	if key, val := i.singleLevelIterator.Next(); key != nil {
   769  		return key, val
   770  	}
   771  	for {
   772  		if i.index.err != nil {
   773  			i.err = i.index.err
   774  			break
   775  		}
   776  		if ikey, _ := i.topLevelIndex.Next(); ikey == nil {
   777  			return nil, nil
   778  		}
   779  		if !i.loadIndex() {
   780  			return nil, nil
   781  		}
   782  		return i.singleLevelIterator.First()
   783  	}
   784  	return nil, nil
   785  }
   786  
   787  // Prev implements internalIterator.Prev, as documented in the pebble
   788  // package.
   789  func (i *twoLevelIterator) Prev() (*InternalKey, []byte) {
   790  	if i.err != nil {
   791  		return nil, nil
   792  	}
   793  	if key, val := i.singleLevelIterator.Prev(); key != nil {
   794  		return key, val
   795  	}
   796  	for {
   797  		if i.index.err != nil {
   798  			i.err = i.index.err
   799  			break
   800  		}
   801  		if ikey, _ := i.topLevelIndex.Prev(); ikey == nil {
   802  			return nil, nil
   803  		}
   804  		if !i.loadIndex() {
   805  			return nil, nil
   806  		}
   807  		return i.singleLevelIterator.Last()
   808  	}
   809  	return nil, nil
   810  }
   811  
   812  // Close implements internalIterator.Close, as documented in the pebble
   813  // package.
   814  func (i *twoLevelIterator) Close() error {
   815  	if i.closeHook != nil {
   816  		if err := i.closeHook(i); err != nil {
   817  			return err
   818  		}
   819  	}
   820  	if err := i.data.Close(); err != nil {
   821  		return err
   822  	}
   823  	err := i.err
   824  	*i = twoLevelIterator{}
   825  	twoLevelIterPool.Put(i)
   826  	return err
   827  }
   828  
   829  // Note: twoLevelCompactionIterator and compactionIterator are very similar but
   830  // were separated due to performance.
   831  type twoLevelCompactionIterator struct {
   832  	*twoLevelIterator
   833  	bytesIterated *uint64
   834  	prevOffset    uint64
   835  }
   836  
   837  func (i *twoLevelCompactionIterator) SeekGE(key []byte) (*InternalKey, []byte) {
   838  	panic("pebble: SeekGE unimplemented")
   839  }
   840  
   841  func (i *twoLevelCompactionIterator) SeekPrefixGE(prefix, key []byte) (*InternalKey, []byte) {
   842  	panic("pebble: SeekPrefixGE unimplemented")
   843  }
   844  
   845  func (i *twoLevelCompactionIterator) SeekLT(key []byte) (*InternalKey, []byte) {
   846  	panic("pebble: SeekLT unimplemented")
   847  }
   848  
   849  func (i *twoLevelCompactionIterator) First() (*InternalKey, []byte) {
   850  	key, val := i.twoLevelIterator.First()
   851  	if key == nil {
   852  		// An empty sstable will still encode the block trailer and restart points, so bytes
   853  		// iterated must be incremented.
   854  
   855  		// We must use i.dataBH.Length instead of (4*(i.data.numRestarts+1)) to calculate the
   856  		// number of bytes for the restart points, since i.dataBH.Length accounts for
   857  		// compression. When uncompressed, i.dataBH.Length == (4*(i.data.numRestarts+1))
   858  		*i.bytesIterated += blockTrailerLen + i.dataBH.Length
   859  		return nil, nil
   860  	}
    861  	// If the first block contains only a single entry, we are at the last entry in the
    862  	// block and we must increment bytes iterated by the size of the block trailer and restart points.
   863  	if i.data.nextOffset+(4*(i.data.numRestarts+1)) == int32(len(i.data.data)) {
   864  		i.prevOffset = blockTrailerLen + i.dataBH.Length
   865  	} else {
   866  		// i.dataBH.Length/len(i.data.data) is the compression ratio. If uncompressed, this is 1.
   867  		// i.data.nextOffset is the uncompressed size of the first record.
   868  		i.prevOffset = (uint64(i.data.nextOffset) * i.dataBH.Length) / uint64(len(i.data.data))
   869  	}
   870  	*i.bytesIterated += i.prevOffset
   871  	return key, val
   872  }
   873  
   874  func (i *twoLevelCompactionIterator) Last() (*InternalKey, []byte) {
   875  	panic("pebble: Last unimplemented")
   876  }
   877  
   878  // Note: twoLevelCompactionIterator.Next mirrors the implementation of
   879  // twoLevelIterator.Next due to performance. Keep the two in sync.
   880  func (i *twoLevelCompactionIterator) Next() (*InternalKey, []byte) {
   881  	if i.err != nil {
   882  		return nil, nil
   883  	}
   884  	key, val := i.singleLevelIterator.Next()
   885  	if key == nil {
   886  		for {
   887  			if i.index.err != nil {
   888  				i.err = i.index.err
   889  				return nil, nil
   890  			}
   891  			if key, _ := i.topLevelIndex.Next(); key == nil {
   892  				return nil, nil
   893  			}
   894  			if i.loadIndex() {
   895  				key, val = i.singleLevelIterator.First()
   896  				if key == nil {
   897  					return nil, nil
   898  				}
   899  				break
   900  			}
   901  		}
   902  	}
   903  
   904  	// i.dataBH.Length/len(i.data.data) is the compression ratio. If uncompressed, this is 1.
   905  	// i.data.nextOffset is the uncompressed position of the current record in the block.
   906  	// i.dataBH.Offset is the offset of the block in the sstable before decompression.
   907  	recordOffset := (uint64(i.data.nextOffset) * i.dataBH.Length) / uint64(len(i.data.data))
   908  	curOffset := i.dataBH.Offset + recordOffset
   909  	// Last entry in the block must increment bytes iterated by the size of the block trailer
   910  	// and restart points.
   911  	if i.data.nextOffset+(4*(i.data.numRestarts+1)) == int32(len(i.data.data)) {
   912  		curOffset = i.dataBH.Offset + i.dataBH.Length + blockTrailerLen
   913  	}
   914  	*i.bytesIterated += uint64(curOffset - i.prevOffset)
   915  	i.prevOffset = curOffset
   916  	return key, val
   917  }
   918  
   919  func (i *twoLevelCompactionIterator) Prev() (*InternalKey, []byte) {
   920  	panic("pebble: Prev unimplemented")
   921  }
   922  
   923  type weakCachedBlock struct {
   924  	bh     BlockHandle
   925  	mu     sync.RWMutex
   926  	handle cache.WeakHandle
   927  }
   928  
   929  type blockTransform func([]byte) ([]byte, error)
   930  
    931  // OpenOption provides an interface to do work on a Reader while it is being
    932  // opened.
   933  type OpenOption interface {
    934  	// Apply is called on the reader while it is being opened.
   935  	Apply(*Reader)
   936  }
   937  
   938  // Comparers is a map from comparer name to comparer. It is used for debugging
   939  // tools which may be used on multiple databases configured with different
   940  // comparers. Comparers implements the OpenOption interface and can be passed
   941  // as a parameter to NewReader.
   942  type Comparers map[string]*Comparer
   943  
   944  // Apply applies the comparers option to the reader.
   945  func (c Comparers) Apply(r *Reader) {
   946  	if r.Compare != nil {
   947  		return
   948  	}
   949  	if comparer, ok := c[r.Properties.ComparerName]; ok {
   950  		r.Compare = comparer.Compare
   951  		r.split = comparer.Split
   952  	}
   953  }
   954  
   955  // Mergers is a map from merger name to merger. It is used for debugging tools
   956  // which may be used on multiple databases configured with different
   957  // mergers. Mergers implements the OpenOption interface and can be passed as
   958  // a parameter to NewReader.
   959  type Mergers map[string]*Merger
   960  
   961  // Apply applies the mergers option to the reader.
   962  func (m Mergers) Apply(r *Reader) {
   963  	if r.mergerOK {
   964  		return
   965  	}
   966  	_, r.mergerOK = m[r.Properties.MergerName]
   967  }
   968  
   969  // Reader is a table reader.
   970  type Reader struct {
   971  	file              vfs.File
   972  	dbNum             uint64
   973  	fileNum           uint64
   974  	err               error
   975  	index             weakCachedBlock
   976  	filter            weakCachedBlock
   977  	rangeDel          weakCachedBlock
   978  	rangeDelTransform blockTransform
   979  	propertiesBH      BlockHandle
   980  	metaIndexBH       BlockHandle
   981  	footerBH          BlockHandle
   982  	opts              *Options
   983  	cache             *cache.Cache
   984  	Compare           Compare
   985  	split             Split
   986  	mergerOK          bool
   987  	tableFilter       *tableFilterReader
   988  	Properties        Properties
   989  }
   990  
   991  // Close implements DB.Close, as documented in the pebble package.
   992  func (r *Reader) Close() error {
   993  	if r.err != nil {
   994  		if r.file != nil {
   995  			r.file.Close()
   996  			r.file = nil
   997  		}
   998  		return r.err
   999  	}
  1000  	if r.file != nil {
  1001  		r.err = r.file.Close()
  1002  		r.file = nil
  1003  		if r.err != nil {
  1004  			return r.err
  1005  		}
  1006  	}
  1007  	// Make any future calls to Get, NewIter or Close return an error.
  1008  	r.err = errors.New("pebble/table: reader is closed")
  1009  	return nil
  1010  }
  1011  
  1012  // get is a testing helper that simulates a read and helps verify bloom filters
  1013  // until they are available through iterators.
  1014  func (r *Reader) get(key []byte) (value []byte, err error) {
  1015  	if r.err != nil {
  1016  		return nil, r.err
  1017  	}
  1018  
  1019  	if r.tableFilter != nil {
  1020  		data, err := r.readFilter()
  1021  		if err != nil {
  1022  			return nil, err
  1023  		}
  1024  		var lookupKey []byte
  1025  		if r.split != nil {
  1026  			lookupKey = key[:r.split(key)]
  1027  		} else {
  1028  			lookupKey = key
  1029  		}
  1030  		if !r.tableFilter.mayContain(data, lookupKey) {
  1031  			return nil, base.ErrNotFound
  1032  		}
  1033  	}
  1034  
  1035  	i := r.NewIter(nil /* lower */, nil /* upper */)
  1036  	i.SeekGE(key)
  1037  
  1038  	if !i.Valid() || r.Compare(key, i.Key().UserKey) != 0 {
  1039  		err := i.Close()
  1040  		if err == nil {
  1041  			err = base.ErrNotFound
  1042  		}
  1043  		return nil, err
  1044  	}
  1045  	return i.Value(), i.Close()
  1046  }
  1047  
  1048  // NewIter returns an iterator for the contents of the table.
  1049  func (r *Reader) NewIter(lower, upper []byte) Iterator {
  1050  	// NB: pebble.tableCache wraps the returned iterator with one which performs
  1051  	// reference counting on the Reader, preventing the Reader from being closed
  1052  	// until the final iterator closes.
  1053  	var i Iterator
  1054  	if r.Properties.IndexType == twoLevelIndex {
  1055  		i = twoLevelIterPool.Get().(*twoLevelIterator)
  1056  	} else {
  1057  		i = singleLevelIterPool.Get().(*singleLevelIterator)
  1058  	}
  1059  	_ = i.Init(r, lower, upper)
  1060  	return i
  1061  }
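
         // A minimal sketch (the function name is illustrative, not part of the
         // API) of obtaining an iterator and scanning the keys in [lower, upper),
         // assuming non-nil bounds. Per the SeekGE contract above, the caller
         // seeks to the lower bound itself; the iterator only enforces the upper
         // bound during forward iteration.
         func exampleScan(r *Reader, lower, upper []byte) error {
         	it := r.NewIter(lower, upper)
         	for key, val := it.SeekGE(lower); key != nil; key, val = it.Next() {
         		_ = val // process the entry here
         	}
         	return it.Close()
         }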
  1062  
  1063  // NewCompactionIter returns an iterator similar to NewIter but it also increments
  1064  // the number of bytes iterated.
  1065  func (r *Reader) NewCompactionIter(bytesIterated *uint64) Iterator {
  1066  	if r.Properties.IndexType == twoLevelIndex {
  1067  		i := twoLevelIterPool.Get().(*twoLevelIterator)
  1068  		_ = i.Init(r, nil /* lower */, nil /* upper */)
  1069  		return &twoLevelCompactionIterator{
  1070  			twoLevelIterator: i,
  1071  			bytesIterated:    bytesIterated,
  1072  		}
  1073  	} else {
  1074  		i := singleLevelIterPool.Get().(*singleLevelIterator)
  1075  		_ = i.Init(r, nil /* lower */, nil /* upper */)
  1076  		return &compactionIterator{
  1077  			singleLevelIterator: i,
  1078  			bytesIterated:       bytesIterated,
  1079  		}
  1080  	}
  1081  }
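
         // A minimal sketch (illustrative, not part of the API) of driving a
         // compaction iterator and observing the bytesIterated counter it
         // maintains, which callers can use for compaction pacing.
         func exampleCompactionScan(r *Reader) (uint64, error) {
         	var bytesIterated uint64
         	it := r.NewCompactionIter(&bytesIterated)
         	for key, _ := it.First(); key != nil; key, _ = it.Next() {
         		_ = key // bytesIterated grows roughly with the on-disk bytes consumed
         	}
         	if err := it.Close(); err != nil {
         		return 0, err
         	}
         	return bytesIterated, nil
         }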
  1082  
  1083  // NewRangeDelIter returns an internal iterator for the contents of the
  1084  // range-del block for the table. Returns nil if the table does not contain any
  1085  // range deletions.
  1086  func (r *Reader) NewRangeDelIter() *blockIter {
  1087  	if r.rangeDel.bh.Length == 0 {
  1088  		return nil
  1089  	}
  1090  	b, err := r.readRangeDel()
  1091  	if err != nil {
  1092  		// TODO(peter): propagate the error
  1093  		panic(err)
  1094  	}
  1095  	i := &blockIter{}
  1096  	if err := i.init(r.Compare, b, r.Properties.GlobalSeqNum); err != nil {
  1097  		// TODO(peter): propagate the error
  1098  		panic(err)
  1099  	}
  1100  	return i
  1101  }
  1102  
  1103  func (r *Reader) readIndex() (block, error) {
  1104  	return r.readWeakCachedBlock(&r.index, nil /* transform */)
  1105  }
  1106  
  1107  func (r *Reader) readFilter() (block, error) {
  1108  	return r.readWeakCachedBlock(&r.filter, nil /* transform */)
  1109  }
  1110  
  1111  func (r *Reader) readRangeDel() (block, error) {
  1112  	return r.readWeakCachedBlock(&r.rangeDel, r.rangeDelTransform)
  1113  }
  1114  
  1115  func (r *Reader) readWeakCachedBlock(
  1116  	w *weakCachedBlock, transform blockTransform,
  1117  ) (block, error) {
  1118  	// Fast-path for retrieving the block from a weak cache handle.
  1119  	w.mu.RLock()
  1120  	var b []byte
  1121  	if w.handle != nil {
  1122  		b = w.handle.Get()
  1123  	}
  1124  	w.mu.RUnlock()
  1125  	if b != nil {
  1126  		return b, nil
  1127  	}
  1128  
   1129  	// Slow-path: read the block from disk. This checks the cache again,
  1130  	// but that is ok because somebody else might have inserted it for us.
  1131  	h, err := r.readBlock(w.bh, transform)
  1132  	if err != nil {
  1133  		return nil, err
  1134  	}
  1135  	b = h.Get()
  1136  	if wh := h.Weak(); wh != nil {
  1137  		w.mu.Lock()
  1138  		w.handle = wh
  1139  		w.mu.Unlock()
  1140  	}
  1141  	return b, err
  1142  }
  1143  
  1144  // readBlock reads and decompresses a block from disk into memory.
  1145  func (r *Reader) readBlock(
  1146  	bh BlockHandle, transform blockTransform,
  1147  ) (cache.Handle, error) {
  1148  	if h := r.cache.Get(r.dbNum, r.fileNum, bh.Offset); h.Get() != nil {
  1149  		return h, nil
  1150  	}
  1151  
  1152  	b := r.cache.Alloc(int(bh.Length + blockTrailerLen))
  1153  	if _, err := r.file.ReadAt(b, int64(bh.Offset)); err != nil {
  1154  		return cache.Handle{}, err
  1155  	}
  1156  
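         	// The blockTrailerLen bytes that follow the bh.Length bytes of block
         	// data are a 1-byte compression type at b[bh.Length] and a 4-byte CRC
         	// that covers both the block data and the type byte.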
  1157  	checksum0 := binary.LittleEndian.Uint32(b[bh.Length+1:])
  1158  	checksum1 := crc.New(b[:bh.Length+1]).Value()
  1159  	if checksum0 != checksum1 {
  1160  		return cache.Handle{}, errors.New("pebble/table: invalid table (checksum mismatch)")
  1161  	}
  1162  
  1163  	typ := b[bh.Length]
  1164  	b = b[:bh.Length]
  1165  
  1166  	switch typ {
  1167  	case noCompressionBlockType:
  1168  		break
  1169  	case snappyCompressionBlockType:
  1170  		decodedLen, err := snappy.DecodedLen(b)
  1171  		if err != nil {
  1172  			return cache.Handle{}, err
  1173  		}
  1174  		decoded := r.cache.Alloc(decodedLen)
  1175  		decoded, err = snappy.Decode(decoded, b)
  1176  		if err != nil {
  1177  			return cache.Handle{}, err
  1178  		}
  1179  		r.cache.Free(b)
  1180  		b = decoded
  1181  	default:
  1182  		return cache.Handle{}, fmt.Errorf("pebble/table: unknown block compression: %d", typ)
  1183  	}
  1184  
  1185  	if transform != nil {
  1186  		// Transforming blocks is rare, so we don't bother to use cache.Alloc.
  1187  		var err error
  1188  		b, err = transform(b)
  1189  		if err != nil {
  1190  			return cache.Handle{}, err
  1191  		}
  1192  	}
  1193  
  1194  	h := r.cache.Set(r.dbNum, r.fileNum, bh.Offset, b)
  1195  	return h, nil
  1196  }
  1197  
  1198  func (r *Reader) transformRangeDelV1(b []byte) ([]byte, error) {
  1199  	// Convert v1 (RocksDB format) range-del blocks to v2 blocks on the fly. The
  1200  	// v1 format range-del blocks have unfragmented and unsorted range
  1201  	// tombstones. We need properly fragmented and sorted range tombstones in
  1202  	// order to serve from them directly.
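         	//
         	// For example (illustrative), overlapping v1 tombstones [a,e)#10 and
         	// [c,g)#5 are rewritten as the fragments [a,c)#10, [c,e)#10, [c,e)#5
         	// and [e,g)#5, so that fragments covering the same span have identical
         	// bounds and the whole set is fully ordered.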
  1203  	iter := &blockIter{}
  1204  	if err := iter.init(r.Compare, b, r.Properties.GlobalSeqNum); err != nil {
  1205  		return nil, err
  1206  	}
  1207  	var tombstones []rangedel.Tombstone
  1208  	for key, value := iter.First(); key != nil; key, value = iter.Next() {
  1209  		t := rangedel.Tombstone{
  1210  			Start: *key,
  1211  			End:   value,
  1212  		}
  1213  		tombstones = append(tombstones, t)
  1214  	}
  1215  	rangedel.Sort(r.Compare, tombstones)
  1216  
  1217  	// Fragment the tombstones, outputting them directly to a block writer.
  1218  	rangeDelBlock := blockWriter{
  1219  		restartInterval: 1,
  1220  	}
  1221  	frag := rangedel.Fragmenter{
  1222  		Cmp: r.Compare,
  1223  		Emit: func(fragmented []rangedel.Tombstone) {
  1224  			for i := range fragmented {
  1225  				t := &fragmented[i]
  1226  				rangeDelBlock.add(t.Start, t.End)
  1227  			}
  1228  		},
  1229  	}
  1230  	for i := range tombstones {
  1231  		t := &tombstones[i]
  1232  		frag.Add(t.Start, t.End)
  1233  	}
  1234  	frag.Finish()
  1235  
  1236  	// Return the contents of the constructed v2 format range-del block.
  1237  	return rangeDelBlock.finish(), nil
  1238  }
  1239  
  1240  func (r *Reader) readMetaindex(metaindexBH BlockHandle, o *Options) error {
  1241  	b, err := r.readBlock(metaindexBH, nil /* transform */)
  1242  	if err != nil {
  1243  		return err
  1244  	}
  1245  	i, err := newRawBlockIter(bytes.Compare, b.Get())
  1246  	b.Release()
  1247  	if err != nil {
  1248  		return err
  1249  	}
  1250  
  1251  	meta := map[string]BlockHandle{}
  1252  	for valid := i.First(); valid; valid = i.Next() {
  1253  		bh, n := decodeBlockHandle(i.Value())
  1254  		if n == 0 {
  1255  			return errors.New("pebble/table: invalid table (bad filter block handle)")
  1256  		}
  1257  		meta[string(i.Key().UserKey)] = bh
  1258  	}
  1259  	if err := i.Close(); err != nil {
  1260  		return err
  1261  	}
  1262  
  1263  	if bh, ok := meta[metaPropertiesName]; ok {
  1264  		b, err = r.readBlock(bh, nil /* transform */)
  1265  		if err != nil {
  1266  			return err
  1267  		}
  1268  		data := b.Get()
  1269  		r.propertiesBH = bh
  1270  		err := r.Properties.load(data, bh.Offset)
  1271  		b.Release()
  1272  		if err != nil {
  1273  			return err
  1274  		}
  1275  	}
  1276  
  1277  	if bh, ok := meta[metaRangeDelV2Name]; ok {
  1278  		r.rangeDel.bh = bh
  1279  	} else if bh, ok := meta[metaRangeDelName]; ok {
  1280  		r.rangeDel.bh = bh
  1281  		r.rangeDelTransform = r.transformRangeDelV1
  1282  	}
  1283  
  1284  	for name, fp := range r.opts.Filters {
  1285  		types := []struct {
  1286  			ftype  FilterType
  1287  			prefix string
  1288  		}{
  1289  			{TableFilter, "fullfilter."},
  1290  		}
  1291  		var done bool
  1292  		for _, t := range types {
  1293  			if bh, ok := meta[t.prefix+name]; ok {
  1294  				r.filter.bh = bh
  1295  
  1296  				switch t.ftype {
  1297  				case TableFilter:
  1298  					r.tableFilter = newTableFilterReader(fp)
  1299  				default:
  1300  					return fmt.Errorf("unknown filter type: %v", t.ftype)
  1301  				}
  1302  
  1303  				done = true
  1304  				break
  1305  			}
  1306  		}
  1307  		if done {
  1308  			break
  1309  		}
  1310  	}
  1311  	return nil
  1312  }
  1313  
  1314  // Layout returns the layout (block organization) for an sstable.
  1315  func (r *Reader) Layout() (*Layout, error) {
  1316  	if r.err != nil {
  1317  		return nil, r.err
  1318  	}
  1319  
  1320  	l := &Layout{
  1321  		Data:       make([]BlockHandle, 0, r.Properties.NumDataBlocks),
  1322  		Filter:     r.filter.bh,
  1323  		RangeDel:   r.rangeDel.bh,
  1324  		Properties: r.propertiesBH,
  1325  		MetaIndex:  r.metaIndexBH,
  1326  		Footer:     r.footerBH,
  1327  	}
  1328  
  1329  	index, err := r.readIndex()
  1330  	if err != nil {
  1331  		return nil, err
  1332  	}
  1333  
  1334  	if r.Properties.IndexPartitions == 0 {
  1335  		l.Index = append(l.Index, r.index.bh)
  1336  		iter, _ := newBlockIter(r.Compare, index)
  1337  		for key, value := iter.First(); key != nil; key, value = iter.Next() {
  1338  			dataBH, n := decodeBlockHandle(value)
  1339  			if n == 0 || n != len(value) {
  1340  				return nil, errors.New("pebble/table: corrupt index entry")
  1341  			}
  1342  			l.Data = append(l.Data, dataBH)
  1343  		}
  1344  	} else {
  1345  		l.TopIndex = r.index.bh
  1346  		topIter, _ := newBlockIter(r.Compare, index)
  1347  		for key, value := topIter.First(); key != nil; key, value = topIter.Next() {
  1348  			indexBH, n := decodeBlockHandle(value)
  1349  			if n == 0 || n != len(value) {
  1350  				return nil, errors.New("pebble/table: corrupt index entry")
  1351  			}
  1352  			l.Index = append(l.Index, indexBH)
  1353  
  1354  			subIndex, err := r.readBlock(indexBH, nil /* transform */)
  1355  			if err != nil {
  1356  				return nil, err
  1357  			}
  1358  			iter, _ := newBlockIter(r.Compare, subIndex.Get())
  1359  			for key, value := iter.First(); key != nil; key, value = iter.Next() {
  1360  				dataBH, n := decodeBlockHandle(value)
  1361  				if n == 0 || n != len(value) {
  1362  					return nil, errors.New("pebble/table: corrupt index entry")
  1363  				}
  1364  				l.Data = append(l.Data, dataBH)
  1365  			}
  1366  			subIndex.Release()
  1367  		}
  1368  	}
  1369  
  1370  	return l, nil
  1371  }
  1372  
  1373  // NewReader returns a new table reader for the file. Closing the reader will
  1374  // close the file.
  1375  func NewReader(
  1376  	f vfs.File, dbNum, fileNum uint64, o *Options, extraOpts ...OpenOption,
  1377  ) (*Reader, error) {
  1378  	o = o.EnsureDefaults()
  1379  
  1380  	r := &Reader{
  1381  		file:    f,
  1382  		dbNum:   dbNum,
  1383  		fileNum: fileNum,
  1384  		opts:    o,
  1385  		cache:   o.Cache,
  1386  	}
  1387  	if f == nil {
  1388  		r.err = errors.New("pebble/table: nil file")
  1389  		return r, r.err
  1390  	}
  1391  	footer, err := readFooter(f)
  1392  	if err != nil {
  1393  		r.err = err
  1394  		return r, r.err
  1395  	}
  1396  	// Read the metaindex.
  1397  	if err := r.readMetaindex(footer.metaindexBH, o); err != nil {
  1398  		r.err = err
  1399  		return r, r.err
  1400  	}
  1401  	r.index.bh = footer.indexBH
  1402  	r.metaIndexBH = footer.metaindexBH
  1403  	r.footerBH = footer.footerBH
  1404  
  1405  	if r.Properties.ComparerName == "" || o.Comparer.Name == r.Properties.ComparerName {
  1406  		r.Compare = o.Comparer.Compare
  1407  		r.split = o.Comparer.Split
  1408  	}
  1409  
  1410  	if o.Merger != nil && o.Merger.Name == r.Properties.MergerName {
  1411  		r.mergerOK = true
  1412  	}
  1413  
  1414  	for _, opt := range extraOpts {
  1415  		opt.Apply(r)
  1416  	}
  1417  
  1418  	if r.Compare == nil {
  1419  		r.err = fmt.Errorf("pebble/table: %d: unknown comparer %s",
  1420  			fileNum, r.Properties.ComparerName)
  1421  	}
  1422  	if !r.mergerOK {
  1423  		if name := r.Properties.MergerName; name != "" && name != "nullptr" {
  1424  			r.err = fmt.Errorf("pebble/table: %d: unknown merger %s",
  1425  				fileNum, r.Properties.MergerName)
  1426  		}
  1427  	}
  1428  	return r, r.err
  1429  }
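
         // A minimal sketch (the function name, path handling, and dbNum/fileNum
         // values are illustrative assumptions) of constructing a Reader.
         // Comparers and Mergers implement OpenOption, so tables written with
         // non-default comparers or mergers can be passed the matching
         // implementations here.
         func exampleOpen(fs vfs.FS, path string, o *Options, comparers Comparers, mergers Mergers) (*Reader, error) {
         	f, err := fs.Open(path)
         	if err != nil {
         		return nil, err
         	}
         	// dbNum and fileNum are used to key this file's blocks in the block
         	// cache; 0 and 1 are placeholder values here.
         	return NewReader(f, 0 /* dbNum */, 1 /* fileNum */, o, comparers, mergers)
         }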
  1430  
  1431  // Layout describes the block organization of an sstable.
  1432  type Layout struct {
  1433  	Data       []BlockHandle
  1434  	Index      []BlockHandle
  1435  	TopIndex   BlockHandle
  1436  	Filter     BlockHandle
  1437  	RangeDel   BlockHandle
  1438  	Properties BlockHandle
  1439  	MetaIndex  BlockHandle
  1440  	Footer     BlockHandle
  1441  }
  1442  
  1443  // Describe returns a description of the layout. If the verbose parameter is
  1444  // true, details of the structure of each block are returned as well.
  1445  func (l *Layout) Describe(
  1446  	w io.Writer,
  1447  	verbose bool,
  1448  	r *Reader,
  1449  	fmtRecord func(key *base.InternalKey, value []byte),
  1450  ) {
  1451  	type block struct {
  1452  		BlockHandle
  1453  		name string
  1454  	}
  1455  	var blocks []block
  1456  
  1457  	for i := range l.Data {
  1458  		blocks = append(blocks, block{l.Data[i], "data"})
  1459  	}
  1460  	for i := range l.Index {
  1461  		blocks = append(blocks, block{l.Index[i], "index"})
  1462  	}
  1463  	if l.TopIndex.Length != 0 {
  1464  		blocks = append(blocks, block{l.TopIndex, "top-index"})
  1465  	}
  1466  	if l.Filter.Length != 0 {
  1467  		blocks = append(blocks, block{l.Filter, "filter"})
  1468  	}
  1469  	if l.RangeDel.Length != 0 {
  1470  		blocks = append(blocks, block{l.RangeDel, "range-del"})
  1471  	}
  1472  	if l.Properties.Length != 0 {
  1473  		blocks = append(blocks, block{l.Properties, "properties"})
  1474  	}
  1475  	if l.MetaIndex.Length != 0 {
  1476  		blocks = append(blocks, block{l.MetaIndex, "meta-index"})
  1477  	}
  1478  	if l.Footer.Length != 0 {
  1479  		if l.Footer.Length == levelDBFooterLen {
  1480  			blocks = append(blocks, block{l.Footer, "leveldb-footer"})
  1481  		} else {
  1482  			blocks = append(blocks, block{l.Footer, "footer"})
  1483  		}
  1484  	}
  1485  
  1486  	sort.Slice(blocks, func(i, j int) bool {
  1487  		return blocks[i].Offset < blocks[j].Offset
  1488  	})
  1489  
  1490  	for i := range blocks {
  1491  		b := &blocks[i]
  1492  		fmt.Fprintf(w, "%10d  %s (%d)\n", b.Offset, b.name, b.Length)
  1493  
  1494  		if !verbose {
  1495  			continue
  1496  		}
  1497  		if b.name == "footer" || b.name == "leveldb-footer" || b.name == "filter" {
  1498  			continue
  1499  		}
  1500  
  1501  		h, err := r.readBlock(b.BlockHandle, nil /* transform */)
  1502  		if err != nil {
  1503  			fmt.Fprintf(w, "  [err: %s]\n", err)
  1504  			continue
  1505  		}
  1506  
  1507  		getRestart := func(data []byte, restarts, i int32) int32 {
  1508  			return int32(binary.LittleEndian.Uint32(data[restarts+4*i:]))
  1509  		}
  1510  
  1511  		formatIsRestart := func(data []byte, restarts, numRestarts, offset int32) {
  1512  			i := sort.Search(int(numRestarts), func(i int) bool {
  1513  				return getRestart(data, restarts, int32(i)) >= offset
  1514  			})
  1515  			if i < int(numRestarts) && getRestart(data, restarts, int32(i)) == offset {
  1516  				fmt.Fprintf(w, " [restart]\n")
  1517  			} else {
  1518  				fmt.Fprintf(w, "\n")
  1519  			}
  1520  		}
  1521  
  1522  		formatRestarts := func(data []byte, restarts, numRestarts int32) {
  1523  			for i := int32(0); i < numRestarts; i++ {
  1524  				offset := getRestart(data, restarts, i)
  1525  				fmt.Fprintf(w, "%10d    [restart %d]\n",
  1526  					b.Offset+uint64(restarts+4*i), b.Offset+uint64(offset))
  1527  			}
  1528  		}
  1529  
  1530  		var lastKey InternalKey
  1531  		switch b.name {
  1532  		case "data", "range-del":
  1533  			iter, _ := newBlockIter(r.Compare, h.Get())
  1534  			for key, value := iter.First(); key != nil; key, value = iter.Next() {
  1535  				ptr := unsafe.Pointer(uintptr(iter.ptr) + uintptr(iter.offset))
  1536  				shared, ptr := decodeVarint(ptr)
  1537  				unshared, ptr := decodeVarint(ptr)
  1538  				value2, _ := decodeVarint(ptr)
  1539  
  1540  				total := iter.nextOffset - iter.offset
  1541  				// The format of the numbers in the record line is:
  1542  				//
  1543  				//   (<total> = <length> [<shared>] + <unshared> + <value>)
  1544  				//
  1545  				// <total>    is the total number of bytes for the record.
  1546  				// <length>   is the size of the 3 varint encoded integers for <shared>,
  1547  				//            <unshared>, and <value>.
  1548  				// <shared>   is the number of key bytes shared with the previous key.
  1549  				// <unshared> is the number of unshared key bytes.
  1550  				// <value>    is the number of value bytes.
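         				//
         				// For example (illustrative), "record (25 = 3 [7] + 10 + 12)"
         				// describes a 25-byte record: 3 bytes of varint lengths, 7 key
         				// bytes shared with the previous key, 10 unshared key bytes,
         				// and a 12-byte value.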
  1551  				fmt.Fprintf(w, "%10d    record (%d = %d [%d] + %d + %d)",
  1552  					b.Offset+uint64(iter.offset), total,
  1553  					total-int32(unshared+value2), shared, unshared, value2)
  1554  				formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset)
  1555  				if fmtRecord != nil {
  1556  					fmt.Fprintf(w, "              ")
  1557  					fmtRecord(key, value)
  1558  				}
  1559  
  1560  				if base.InternalCompare(r.Compare, lastKey, *key) >= 0 {
  1561  					fmt.Fprintf(w, "              WARNING: OUT OF ORDER KEYS!\n")
  1562  				}
  1563  				lastKey.Trailer = key.Trailer
  1564  				lastKey.UserKey = append(lastKey.UserKey[:0], key.UserKey...)
  1565  			}
  1566  			formatRestarts(iter.data, iter.restarts, iter.numRestarts)
  1567  		case "index", "top-index":
  1568  			iter, _ := newBlockIter(r.Compare, h.Get())
  1569  			for key, value := iter.First(); key != nil; key, value = iter.Next() {
  1570  				bh, n := decodeBlockHandle(value)
  1571  				if n == 0 || n != len(value) {
   1572  					fmt.Fprintf(w, "%10d    [err: corrupt index entry]\n", b.Offset+uint64(iter.offset))
  1573  					continue
  1574  				}
  1575  				fmt.Fprintf(w, "%10d    block:%d/%d",
  1576  					b.Offset+uint64(iter.offset), bh.Offset, bh.Length)
  1577  				formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset)
  1578  			}
  1579  			formatRestarts(iter.data, iter.restarts, iter.numRestarts)
  1580  		case "properties":
  1581  			iter, _ := newRawBlockIter(r.Compare, h.Get())
  1582  			for valid := iter.First(); valid; valid = iter.Next() {
  1583  				fmt.Fprintf(w, "%10d    %s (%d)",
  1584  					b.Offset+uint64(iter.offset), iter.Key().UserKey, iter.nextOffset-iter.offset)
  1585  				formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset)
  1586  			}
  1587  			formatRestarts(iter.data, iter.restarts, iter.numRestarts)
  1588  		case "meta-index":
  1589  			iter, _ := newRawBlockIter(r.Compare, h.Get())
  1590  			for valid := iter.First(); valid; valid = iter.Next() {
  1591  				value := iter.Value()
  1592  				bh, n := decodeBlockHandle(value)
  1593  				if n == 0 || n != len(value) {
   1594  					fmt.Fprintf(w, "%10d    [err: corrupt meta-index entry]\n", b.Offset+uint64(iter.offset))
  1595  					continue
  1596  				}
  1597  
  1598  				fmt.Fprintf(w, "%10d    %s block:%d/%d",
  1599  					b.Offset+uint64(iter.offset), iter.Key().UserKey,
  1600  					bh.Offset, bh.Length)
  1601  				formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset)
  1602  			}
  1603  			formatRestarts(iter.data, iter.restarts, iter.numRestarts)
  1604  		}
  1605  
  1606  		h.Release()
  1607  	}
  1608  }