github.com/cockroachdb/pebble@v1.1.2/level_checker.go

github.com/cockroachdb/pebble@v1.1.2/level_checker.go (about)

     1  // Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package pebble
     6  
     7  import (
     8  	"context"
     9  	"fmt"
    10  	"io"
    11  	"sort"
    12  
    13  	"github.com/cockroachdb/errors"
    14  	"github.com/cockroachdb/pebble/internal/base"
    15  	"github.com/cockroachdb/pebble/internal/keyspan"
    16  	"github.com/cockroachdb/pebble/internal/manifest"
    17  )
    18  
    19  // This file implements DB.CheckLevels() which checks that every entry in the
    20  // DB is consistent with respect to the level invariant: any point (or the
    21  // infinite number of points in a range tombstone) has a seqnum such that a
    22  // point with the same UserKey at a lower level has a lower seqnum. This is an
    23  // expensive check since it involves iterating over all the entries in the DB,
    24  // hence only intended for tests or tools.
    25  //
    26  // If we ignore range tombstones, the consistency checking of points can be
    27  // done with a simplified version of mergingIter. simpleMergingIter is that
    28  // simplified version of mergingIter that only needs to step through points
    29  // (analogous to only doing Next()). It can also easily accommodate
    30  // consistency checking of points relative to range tombstones.
    31  // simpleMergingIter does not do any seek optimizations present in mergingIter
    32  // (it minimally needs to seek the range delete iterators to position them at
    33  // or past the current point) since it does not want to miss points for
    34  // purposes of consistency checking.
    35  //
    36  // Mutual consistency of range tombstones is non-trivial to check. One needs
    37  // to detect inversions of the form [a, c)#8 at higher level and [b, c)#10 at
    38  // a lower level. The start key of the former is not contained in the latter
    39  // and we can't use the exclusive end key, c, for a containment check since it
    40  // is the sentinel key. We observe that if these tombstones were fragmented
    41  // wrt each other we would have [a, b)#8 and [b, c)#8 at the higher level and
    42  // [b, c)#10 at the lower level and then it is trivial to compare the two
    43  // [b, c) tombstones. Note that this fragmentation needs to take into account
    44  // that tombstones in a file may be untruncated and need to act within the
    45  // bounds of the file. This checking is performed by checkRangeTombstones()
    46  // and its helper functions.
    47  
    48  // The per-level structure used by simpleMergingIter. It bundles a level's
        // point iterator and range deletion iterator together with the most recent
        // results obtained from them.
    49  type simpleMergingIterLevel struct {
        	// iter is the point iterator for this level. step() closes it and sets it
        	// to nil once it is exhausted.
    50  	iter         internalIterator
        	// rangeDelIter is the range deletion iterator for this level. It may be
        	// nil, in which case the level is skipped when positioning and checking
        	// tombstones.
    51  	rangeDelIter keyspan.FragmentIterator
        	// The embedded boundary context is handed to levelIter via
        	// initBoundaryContext(); step() reads its smallestUserKey to decide
        	// whether the current tombstone applies to a point.
    52  	levelIterBoundaryContext
    53  
        	// iterKey and iterValue hold the most recent result of iter.First() or
        	// iter.Next(). A nil iterKey means the iterator is exhausted.
    54  	iterKey   *InternalKey
    55  	iterValue base.LazyValue
        	// tombstone is the span returned by the most recent
        	// rangeDelIter.SeekGE() in positionRangeDels().
    56  	tombstone *keyspan.Span
    57  }
    58  
        // simpleMergingIter steps through the point entries of all levels in heap
        // order, validating them as it goes. See the overview comment at the top of
        // the file.
    59  type simpleMergingIter struct {
        	// levels holds the per-level state; heap items refer to it by index.
    60  	levels   []simpleMergingIterLevel
        	// snapshot is the seqnum used for the visibility checks in step().
    61  	snapshot uint64
        	// heap contains one item for every level that still has an entry.
    62  	heap     simpleMergingIterHeap
    63  	// The last point's key and level. For validation.
    64  	lastKey     InternalKey
    65  	lastLevel   int
        	// lastIterMsg is the debug string of the iterator most recently stepped,
        	// saved before that iterator may be closed.
    66  	lastIterMsg string
    67  	// A non-nil valueMerger means MERGE record processing is ongoing.
    68  	valueMerger base.ValueMerger
    69  	// The first error will cause step() to return false.
    70  	err       error
        	// numPoints counts the visible, non-sentinel entries processed by step().
    71  	numPoints int64
        	// merge starts a new valueMerger when a series of MERGE records begins.
    72  	merge     Merge
        	// formatKey is used to pretty-print keys in error messages.
    73  	formatKey base.FormatKey
    74  }
    75  
    76  func (m *simpleMergingIter) init(
    77  	merge Merge,
    78  	cmp Compare,
    79  	snapshot uint64,
    80  	formatKey base.FormatKey,
    81  	levels ...simpleMergingIterLevel,
    82  ) {
    83  	m.levels = levels
    84  	m.formatKey = formatKey
    85  	m.merge = merge
    86  	m.snapshot = snapshot
    87  	m.lastLevel = -1
    88  	m.heap.cmp = cmp
    89  	m.heap.items = make([]simpleMergingIterItem, 0, len(levels))
    90  	for i := range m.levels {
    91  		l := &m.levels[i]
    92  		l.iterKey, l.iterValue = l.iter.First()
    93  		if l.iterKey != nil {
    94  			item := simpleMergingIterItem{
    95  				index: i,
    96  				value: l.iterValue,
    97  			}
    98  			item.key.Trailer = l.iterKey.Trailer
    99  			item.key.UserKey = append(item.key.UserKey[:0], l.iterKey.UserKey...)
   100  			m.heap.items = append(m.heap.items, item)
   101  		}
   102  	}
   103  	m.heap.init()
   104  
   105  	if m.heap.len() == 0 {
   106  		return
   107  	}
   108  	m.positionRangeDels()
   109  }
   110  
   111  // Positions all the rangedel iterators at or past the current top of the
   112  // heap, using SeekGE().
   113  func (m *simpleMergingIter) positionRangeDels() {
   114  	item := &m.heap.items[0]
   115  	for i := range m.levels {
   116  		l := &m.levels[i]
   117  		if l.rangeDelIter == nil {
   118  			continue
   119  		}
   120  		l.tombstone = l.rangeDelIter.SeekGE(item.key.UserKey)
   121  	}
   122  }
   123  
   124  // step processes the entry at the top of the heap: it validates the entry
        // against the previously seen point, feeds it to MERGE processing where
        // applicable, checks it against the current tombstones of the levels with a
        // larger index, and then advances the iterator the entry came from.
        //
        // Returns true if not yet done. It returns false once the heap is empty or
        // an error has been recorded in m.err.
   125  func (m *simpleMergingIter) step() bool {
   126  	if m.heap.len() == 0 || m.err != nil {
   127  		return false
   128  	}
        	// item is the heap root; l is the level it came from.
   129  	item := &m.heap.items[0]
   130  	l := &m.levels[item.index]
   131  	// Sentinels are not relevant for this point checking.
   132  	if !item.key.IsExclusiveSentinel() && item.key.Visible(m.snapshot, base.InternalKeySeqNumMax) {
   133  		m.numPoints++
   134  		keyChanged := m.heap.cmp(item.key.UserKey, m.lastKey.UserKey) != 0
   135  		if !keyChanged {
   136  			// At the same user key. We will see them in decreasing seqnum
   137  			// order so the lastLevel must not be lower.
   138  			if m.lastLevel > item.index {
   139  				m.err = errors.Errorf("found InternalKey %s in %s and InternalKey %s in %s",
   140  					item.key.Pretty(m.formatKey), l.iter, m.lastKey.Pretty(m.formatKey),
   141  					m.lastIterMsg)
   142  				return false
   143  			}
   144  			m.lastLevel = item.index
   145  		} else {
   146  			// The user key has changed.
   147  			m.lastKey.Trailer = item.key.Trailer
   148  			m.lastKey.UserKey = append(m.lastKey.UserKey[:0], item.key.UserKey...)
   149  			m.lastLevel = item.index
   150  		}
   151  		// The user key changed while a series of MERGE records was still
        		// ongoing, i.e. the series ended with a MERGE record. Finish it.
   152  		if keyChanged && m.valueMerger != nil {
   153  			var closer io.Closer
   154  			_, closer, m.err = m.valueMerger.Finish(true /* includesBase */)
   155  			if m.err == nil && closer != nil {
   156  				m.err = closer.Close()
   157  			}
   158  			m.valueMerger = nil
   159  		}
        		// Retrieve the value; it is passed to the merge functions below.
   160  		itemValue, _, err := item.value.Value(nil)
   161  		if err != nil {
   162  			m.err = err
   163  			return false
   164  		}
   165  		if m.valueMerger != nil {
   166  			// Ongoing series of MERGE records.
   167  			switch item.key.Kind() {
   168  			case InternalKeyKindSingleDelete, InternalKeyKindDelete, InternalKeyKindDeleteSized:
        				// A deletion ends the series without contributing a value.
   169  				var closer io.Closer
   170  				_, closer, m.err = m.valueMerger.Finish(true /* includesBase */)
   171  				if m.err == nil && closer != nil {
   172  					m.err = closer.Close()
   173  				}
   174  				m.valueMerger = nil
   175  			case InternalKeyKindSet, InternalKeyKindSetWithDelete:
        				// A set contributes its value and then ends the series.
   176  				m.err = m.valueMerger.MergeOlder(itemValue)
   177  				if m.err == nil {
   178  					var closer io.Closer
   179  					_, closer, m.err = m.valueMerger.Finish(true /* includesBase */)
   180  					if m.err == nil && closer != nil {
   181  						m.err = closer.Close()
   182  					}
   183  				}
   184  				m.valueMerger = nil
   185  			case InternalKeyKindMerge:
        				// Another MERGE record: the series continues.
   186  				m.err = m.valueMerger.MergeOlder(itemValue)
   187  			default:
   188  				m.err = errors.Errorf("pebble: invalid internal key kind %s in %s",
   189  					item.key.Pretty(m.formatKey),
   190  					l.iter)
   191  				return false
   192  			}
   193  		} else if item.key.Kind() == InternalKeyKindMerge && m.err == nil {
   194  			// New series of MERGE records.
   195  			m.valueMerger, m.err = m.merge(item.key.UserKey, itemValue)
   196  		}
        		// Any error recorded by the merge processing above is wrapped with the
        		// key and iterator for context.
   197  		if m.err != nil {
   198  			m.err = errors.Wrapf(m.err, "merge processing error on key %s in %s",
   199  				item.key.Pretty(m.formatKey), l.iter)
   200  			return false
   201  		}
   202  		// Is this point covered by a tombstone at a lower level? Note that all these
   203  		// iterators must be positioned at a key > item.key. So the Largest key bound
   204  		// of the sstable containing the tombstone >= item.key. So the upper limit of
   205  		// the tombstone cannot be file-bounds-constrained to < item.key. But it is
   206  		// possible that item.key < smallest key bound of the sstable, in which case
   207  		// this tombstone should be ignored.
   208  		for level := item.index + 1; level < len(m.levels); level++ {
   209  			lvl := &m.levels[level]
   210  			if lvl.rangeDelIter == nil || lvl.tombstone.Empty() {
   211  				continue
   212  			}
   213  			if (lvl.smallestUserKey == nil || m.heap.cmp(lvl.smallestUserKey, item.key.UserKey) <= 0) &&
   214  				lvl.tombstone.Contains(m.heap.cmp, item.key.UserKey) {
   215  				if lvl.tombstone.CoversAt(m.snapshot, item.key.SeqNum()) {
   216  					m.err = errors.Errorf("tombstone %s in %s deletes key %s in %s",
   217  						lvl.tombstone.Pretty(m.formatKey), lvl.iter, item.key.Pretty(m.formatKey),
   218  						l.iter)
   219  					return false
   220  				}
   221  			}
   222  		}
   223  	}
   224  
   225  	// The iterator for the current level may be closed in the following call to
   226  	// Next(). We save its debug string for potential use after it is closed -
   227  	// either in this current step() invocation or on the next invocation.
   228  	m.lastIterMsg = l.iter.String()
   229  
   230  	// Step to the next point.
   231  	if l.iterKey, l.iterValue = l.iter.Next(); l.iterKey != nil {
   232  		// Check point keys in an sstable are ordered. Although not required, we check
   233  		// for memtables as well. A subtle check here is that successive sstables of
   234  		// L1 and higher levels are ordered. This happens when levelIter moves to the
   235  		// next sstable in the level, in which case item.key is previous sstable's
   236  		// last point key.
   237  		if base.InternalCompare(m.heap.cmp, item.key, *l.iterKey) >= 0 {
   238  			m.err = errors.Errorf("out of order keys %s >= %s in %s",
   239  				item.key.Pretty(m.formatKey), l.iterKey.Pretty(m.formatKey), l.iter)
   240  			return false
   241  		}
        		// Overwrite the heap root in place with the level's next entry and
        		// restore the heap ordering.
   242  		item.key.Trailer = l.iterKey.Trailer
   243  		item.key.UserKey = append(item.key.UserKey[:0], l.iterKey.UserKey...)
   244  		item.value = l.iterValue
   245  		if m.heap.len() > 1 {
   246  			m.heap.fix(0)
   247  		}
   248  	} else {
        		// This level is exhausted: close its iterator and remove it from the
        		// heap.
   249  		m.err = l.iter.Close()
   250  		l.iter = nil
   251  		m.heap.pop()
   252  	}
   253  	if m.err != nil {
   254  		return false
   255  	}
   256  	if m.heap.len() == 0 {
   257  		// Last record was a MERGE record.
   258  		if m.valueMerger != nil {
   259  			var closer io.Closer
   260  			_, closer, m.err = m.valueMerger.Finish(true /* includesBase */)
   261  			if m.err == nil && closer != nil {
   262  				m.err = closer.Close()
   263  			}
   264  			if m.err != nil {
   265  				m.err = errors.Wrapf(m.err, "merge processing error on key %s in %s",
   266  					item.key.Pretty(m.formatKey), m.lastIterMsg)
   267  			}
   268  			m.valueMerger = nil
   269  		}
   270  		return false
   271  	}
        	// Reposition the rangedel iterators relative to the new top of the heap.
   272  	m.positionRangeDels()
   273  	return true
   274  }
   275  
   276  // Checking that range tombstones are mutually consistent is performed by checkRangeTombstones().
   277  // See the overview comment at the top of the file.
   278  //
   279  // We do this check as follows:
   280  // - For each level that can have untruncated tombstones, compute the atomic compaction
   281  //   bounds (expandToAtomicUnit(), manifest.KeyRange()) and use them to truncate tombstones.
   282  // - Now that we have a set of truncated tombstones for each level, put them into one
   283  //   pool of tombstones along with their level information (addTombstonesFromIter()).
   284  // - Collect the start and end user keys from all these tombstones (collectAllUserKeys()) and use
   285  //   them to fragment all the tombstones (fragmentUsingUserKeys()).
   286  // - Sort tombstones by start key and decreasing seqnum (tombstonesByStartKeyAndSeqnum) -- all
   287  //   tombstones that have the same start key will have the same end key because they have been
   288  //   fragmented.
   289  // - Iterate and check (iterateAndCheckTombstones()).
   290  // Note that this simple approach requires holding all the tombstones across all levels in-memory.
   291  // A more sophisticated incremental approach could be devised, if necessary.
   292  
   293  // A tombstone and the corresponding level it was found in.
   294  type tombstoneWithLevel struct {
        	// The tombstone itself.
   295  	keyspan.Span
        	// level is the ordinal assigned by checkRangeTombstones(): it grows as
        	// that function moves from the memtables to the L0 sublevels and then to
        	// L1 and beyond. iterateAndCheckTombstones() compares these ordinals.
   296  	level int
   297  	// The level in LSM. A -1 means it's a memtable.
   298  	lsmLevel int
        	// fileNum is the file the tombstone was read from (0 is passed for
        	// memtables). It is only used when formatting error messages.
   299  	fileNum  FileNum
   300  }
   301  
   302  // For sorting tombstoneWithLevels in increasing order of start UserKey and
   303  // for the same start UserKey in decreasing order of seqnum.
   304  type tombstonesByStartKeyAndSeqnum struct {
   305  	cmp Compare
   306  	buf []tombstoneWithLevel
   307  }
   308  
   309  func (v *tombstonesByStartKeyAndSeqnum) Len() int { return len(v.buf) }
   310  func (v *tombstonesByStartKeyAndSeqnum) Less(i, j int) bool {
   311  	less := v.cmp(v.buf[i].Start, v.buf[j].Start)
   312  	if less == 0 {
   313  		return v.buf[i].LargestSeqNum() > v.buf[j].LargestSeqNum()
   314  	}
   315  	return less < 0
   316  }
   317  func (v *tombstonesByStartKeyAndSeqnum) Swap(i, j int) {
   318  	v.buf[i], v.buf[j] = v.buf[j], v.buf[i]
   319  }
   320  
   321  func iterateAndCheckTombstones(
   322  	cmp Compare, formatKey base.FormatKey, tombstones []tombstoneWithLevel,
   323  ) error {
   324  	sortBuf := tombstonesByStartKeyAndSeqnum{
   325  		cmp: cmp,
   326  		buf: tombstones,
   327  	}
   328  	sort.Sort(&sortBuf)
   329  
   330  	// For a sequence of tombstones that share the same start UserKey, we will
   331  	// encounter them in non-increasing seqnum order and so should encounter them
   332  	// in non-decreasing level order.
   333  	lastTombstone := tombstoneWithLevel{}
   334  	for _, t := range tombstones {
   335  		if cmp(lastTombstone.Start, t.Start) == 0 && lastTombstone.level > t.level {
   336  			return errors.Errorf("encountered tombstone %s in %s"+
   337  				" that has a lower seqnum than the same tombstone in %s",
   338  				t.Span.Pretty(formatKey), levelOrMemtable(t.lsmLevel, t.fileNum),
   339  				levelOrMemtable(lastTombstone.lsmLevel, lastTombstone.fileNum))
   340  		}
   341  		lastTombstone = t
   342  	}
   343  	return nil
   344  }
   345  
   346  type checkConfig struct {
   347  	logger    Logger
   348  	comparer  *Comparer
   349  	readState *readState
   350  	newIters  tableNewIters
   351  	seqNum    uint64
   352  	stats     *CheckLevelsStats
   353  	merge     Merge
   354  	formatKey base.FormatKey
   355  }
   356  
   357  // cmp is shorthand for comparer.Compare.
   358  func (c *checkConfig) cmp(a, b []byte) int { return c.comparer.Compare(a, b) }
   359  
   360  func checkRangeTombstones(c *checkConfig) error {
   361  	var level int
   362  	var tombstones []tombstoneWithLevel
   363  	var err error
   364  
   365  	memtables := c.readState.memtables
   366  	for i := len(memtables) - 1; i >= 0; i-- {
   367  		iter := memtables[i].newRangeDelIter(nil)
   368  		if iter == nil {
   369  			continue
   370  		}
   371  		if tombstones, err = addTombstonesFromIter(iter, level, -1, 0, tombstones,
   372  			c.seqNum, c.cmp, c.formatKey, nil); err != nil {
   373  			return err
   374  		}
   375  		level++
   376  	}
   377  
   378  	current := c.readState.current
   379  	addTombstonesFromLevel := func(files manifest.LevelIterator, lsmLevel int) error {
   380  		for f := files.First(); f != nil; f = files.Next() {
   381  			lf := files.Take()
   382  			atomicUnit, _ := expandToAtomicUnit(c.cmp, lf.Slice(), true /* disableIsCompacting */)
   383  			lower, upper := manifest.KeyRange(c.cmp, atomicUnit.Iter())
   384  			iterToClose, iter, err := c.newIters(
   385  				context.Background(), lf.FileMetadata, &IterOptions{level: manifest.Level(lsmLevel)}, internalIterOpts{})
   386  			if err != nil {
   387  				return err
   388  			}
   389  			iterToClose.Close()
   390  			if iter == nil {
   391  				continue
   392  			}
   393  			truncate := func(t keyspan.Span) keyspan.Span {
   394  				// Same checks as in keyspan.Truncate.
   395  				if c.cmp(t.Start, lower.UserKey) < 0 {
   396  					t.Start = lower.UserKey
   397  				}
   398  				if c.cmp(t.End, upper.UserKey) > 0 {
   399  					t.End = upper.UserKey
   400  				}
   401  				if c.cmp(t.Start, t.End) >= 0 {
   402  					// Remove the keys.
   403  					t.Keys = t.Keys[:0]
   404  				}
   405  				return t
   406  			}
   407  			if tombstones, err = addTombstonesFromIter(iter, level, lsmLevel, f.FileNum,
   408  				tombstones, c.seqNum, c.cmp, c.formatKey, truncate); err != nil {
   409  				return err
   410  			}
   411  		}
   412  		return nil
   413  	}
   414  	// Now the levels with untruncated tombsones.
   415  	for i := len(current.L0SublevelFiles) - 1; i >= 0; i-- {
   416  		if current.L0SublevelFiles[i].Empty() {
   417  			continue
   418  		}
   419  		err := addTombstonesFromLevel(current.L0SublevelFiles[i].Iter(), 0)
   420  		if err != nil {
   421  			return err
   422  		}
   423  		level++
   424  	}
   425  	for i := 1; i < len(current.Levels); i++ {
   426  		if err := addTombstonesFromLevel(current.Levels[i].Iter(), i); err != nil {
   427  			return err
   428  		}
   429  		level++
   430  	}
   431  	if c.stats != nil {
   432  		c.stats.NumTombstones = len(tombstones)
   433  	}
   434  	// We now have truncated tombstones.
   435  	// Fragment them all.
   436  	userKeys := collectAllUserKeys(c.cmp, tombstones)
   437  	tombstones = fragmentUsingUserKeys(c.cmp, tombstones, userKeys)
   438  	return iterateAndCheckTombstones(c.cmp, c.formatKey, tombstones)
   439  }
   440  
   441  func levelOrMemtable(lsmLevel int, fileNum FileNum) string {
   442  	if lsmLevel == -1 {
   443  		return "memtable"
   444  	}
   445  	return fmt.Sprintf("L%d: fileNum=%s", lsmLevel, fileNum)
   446  }
   447  
   448  func addTombstonesFromIter(
   449  	iter keyspan.FragmentIterator,
   450  	level int,
   451  	lsmLevel int,
   452  	fileNum FileNum,
   453  	tombstones []tombstoneWithLevel,
   454  	seqNum uint64,
   455  	cmp Compare,
   456  	formatKey base.FormatKey,
   457  	truncate func(tombstone keyspan.Span) keyspan.Span,
   458  ) (_ []tombstoneWithLevel, err error) {
   459  	defer func() {
   460  		err = firstError(err, iter.Close())
   461  	}()
   462  
   463  	var prevTombstone keyspan.Span
   464  	for tomb := iter.First(); tomb != nil; tomb = iter.Next() {
   465  		t := tomb.Visible(seqNum)
   466  		if t.Empty() {
   467  			continue
   468  		}
   469  		t = t.DeepClone()
   470  		// This is mainly a test for rangeDelV2 formatted blocks which are expected to
   471  		// be ordered and fragmented on disk. But we anyways check for memtables,
   472  		// rangeDelV1 as well.
   473  		if cmp(prevTombstone.End, t.Start) > 0 {
   474  			return nil, errors.Errorf("unordered or unfragmented range delete tombstones %s, %s in %s",
   475  				prevTombstone.Pretty(formatKey), t.Pretty(formatKey), levelOrMemtable(lsmLevel, fileNum))
   476  		}
   477  		prevTombstone = t
   478  
   479  		// Truncation of a tombstone must happen after checking its ordering,
   480  		// fragmentation wrt previous tombstone. Since it is possible that after
   481  		// truncation the tombstone is ordered, fragmented when it originally wasn't.
   482  		if truncate != nil {
   483  			t = truncate(t)
   484  		}
   485  		if !t.Empty() {
   486  			tombstones = append(tombstones, tombstoneWithLevel{
   487  				Span:     t,
   488  				level:    level,
   489  				lsmLevel: lsmLevel,
   490  				fileNum:  fileNum,
   491  			})
   492  		}
   493  	}
   494  	return tombstones, nil
   495  }
   496  
   497  type userKeysSort struct {
   498  	cmp Compare
   499  	buf [][]byte
   500  }
   501  
   502  func (v *userKeysSort) Len() int { return len(v.buf) }
   503  func (v *userKeysSort) Less(i, j int) bool {
   504  	return v.cmp(v.buf[i], v.buf[j]) < 0
   505  }
   506  func (v *userKeysSort) Swap(i, j int) {
   507  	v.buf[i], v.buf[j] = v.buf[j], v.buf[i]
   508  }
   509  func collectAllUserKeys(cmp Compare, tombstones []tombstoneWithLevel) [][]byte {
   510  	keys := make([][]byte, 0, len(tombstones)*2)
   511  	for _, t := range tombstones {
   512  		keys = append(keys, t.Start)
   513  		keys = append(keys, t.End)
   514  	}
   515  	sorter := userKeysSort{
   516  		cmp: cmp,
   517  		buf: keys,
   518  	}
   519  	sort.Sort(&sorter)
   520  	var last, curr int
   521  	for last, curr = -1, 0; curr < len(keys); curr++ {
   522  		if last < 0 || cmp(keys[last], keys[curr]) != 0 {
   523  			last++
   524  			keys[last] = keys[curr]
   525  		}
   526  	}
   527  	keys = keys[:last+1]
   528  	return keys
   529  }
   530  
   531  func fragmentUsingUserKeys(
   532  	cmp Compare, tombstones []tombstoneWithLevel, userKeys [][]byte,
   533  ) []tombstoneWithLevel {
   534  	var buf []tombstoneWithLevel
   535  	for _, t := range tombstones {
   536  		// Find the first position with tombstone start < user key
   537  		i := sort.Search(len(userKeys), func(i int) bool {
   538  			return cmp(t.Start, userKeys[i]) < 0
   539  		})
   540  		for ; i < len(userKeys); i++ {
   541  			if cmp(userKeys[i], t.End) >= 0 {
   542  				break
   543  			}
   544  			tPartial := t
   545  			tPartial.End = userKeys[i]
   546  			buf = append(buf, tPartial)
   547  			t.Start = userKeys[i]
   548  		}
   549  		buf = append(buf, t)
   550  	}
   551  	return buf
   552  }
   553  
   554  // CheckLevelsStats provides basic stats on points and tombstones encountered.
   555  type CheckLevelsStats struct {
        	// NumPoints is the number of point entries examined: entries that are
        	// visible at the seqnum being checked and are not exclusive sentinels.
   556  	NumPoints     int64
        	// NumTombstones is the number of range tombstones collected across all
        	// levels, counted before they are fragmented against each other.
   557  	NumTombstones int
   558  }
   559  
   560  // CheckLevels checks:
   561  //   - Every entry in the DB is consistent with the level invariant. See the
   562  //     comment at the top of the file.
   563  //   - Point keys in sstables are ordered.
   564  //   - Range delete tombstones in sstables are ordered and fragmented.
   565  //   - Successful processing of all MERGE records.
   566  func (d *DB) CheckLevels(stats *CheckLevelsStats) error {
   567  	// Grab and reference the current readState.
   568  	readState := d.loadReadState()
   569  	defer readState.unref()
   570  
   571  	// Determine the seqnum to read at after grabbing the read state (current and
   572  	// memtables) above.
   573  	seqNum := d.mu.versions.visibleSeqNum.Load()
   574  
   575  	checkConfig := &checkConfig{
   576  		logger:    d.opts.Logger,
   577  		comparer:  d.opts.Comparer,
   578  		readState: readState,
   579  		newIters:  d.newIters,
   580  		seqNum:    seqNum,
   581  		stats:     stats,
   582  		merge:     d.merge,
   583  		formatKey: d.opts.Comparer.FormatKey,
   584  	}
   585  	return checkLevelsInternal(checkConfig)
   586  }
   587  
   588  func checkLevelsInternal(c *checkConfig) (err error) {
   589  	// Phase 1: Use a simpleMergingIter to step through all the points and ensure
   590  	// that points with the same user key at different levels are not inverted
   591  	// wrt sequence numbers and the same holds for tombstones that cover points.
   592  	// To do this, one needs to construct a simpleMergingIter which is similar to
   593  	// how one constructs a mergingIter.
   594  
   595  	// Add mem tables from newest to oldest.
   596  	var mlevels []simpleMergingIterLevel
   597  	defer func() {
   598  		for i := range mlevels {
   599  			l := &mlevels[i]
   600  			if l.iter != nil {
   601  				err = firstError(err, l.iter.Close())
   602  				l.iter = nil
   603  			}
   604  			if l.rangeDelIter != nil {
   605  				err = firstError(err, l.rangeDelIter.Close())
   606  				l.rangeDelIter = nil
   607  			}
   608  		}
   609  	}()
   610  
   611  	memtables := c.readState.memtables
   612  	for i := len(memtables) - 1; i >= 0; i-- {
   613  		mem := memtables[i]
   614  		mlevels = append(mlevels, simpleMergingIterLevel{
   615  			iter:         mem.newIter(nil),
   616  			rangeDelIter: mem.newRangeDelIter(nil),
   617  		})
   618  	}
   619  
   620  	current := c.readState.current
   621  	// Determine the final size for mlevels so that there are no more
   622  	// reallocations. levelIter will hold a pointer to elements in mlevels.
   623  	start := len(mlevels)
   624  	for sublevel := len(current.L0SublevelFiles) - 1; sublevel >= 0; sublevel-- {
   625  		if current.L0SublevelFiles[sublevel].Empty() {
   626  			continue
   627  		}
   628  		mlevels = append(mlevels, simpleMergingIterLevel{})
   629  	}
   630  	for level := 1; level < len(current.Levels); level++ {
   631  		if current.Levels[level].Empty() {
   632  			continue
   633  		}
   634  		mlevels = append(mlevels, simpleMergingIterLevel{})
   635  	}
   636  	mlevelAlloc := mlevels[start:]
   637  	// Add L0 files by sublevel.
   638  	for sublevel := len(current.L0SublevelFiles) - 1; sublevel >= 0; sublevel-- {
   639  		if current.L0SublevelFiles[sublevel].Empty() {
   640  			continue
   641  		}
   642  		manifestIter := current.L0SublevelFiles[sublevel].Iter()
   643  		iterOpts := IterOptions{logger: c.logger}
   644  		li := &levelIter{}
   645  		li.init(context.Background(), iterOpts, c.comparer, c.newIters, manifestIter,
   646  			manifest.L0Sublevel(sublevel), internalIterOpts{})
   647  		li.initRangeDel(&mlevelAlloc[0].rangeDelIter)
   648  		li.initBoundaryContext(&mlevelAlloc[0].levelIterBoundaryContext)
   649  		mlevelAlloc[0].iter = li
   650  		mlevelAlloc = mlevelAlloc[1:]
   651  	}
   652  	for level := 1; level < len(current.Levels); level++ {
   653  		if current.Levels[level].Empty() {
   654  			continue
   655  		}
   656  
   657  		iterOpts := IterOptions{logger: c.logger}
   658  		li := &levelIter{}
   659  		li.init(context.Background(), iterOpts, c.comparer, c.newIters,
   660  			current.Levels[level].Iter(), manifest.Level(level), internalIterOpts{})
   661  		li.initRangeDel(&mlevelAlloc[0].rangeDelIter)
   662  		li.initBoundaryContext(&mlevelAlloc[0].levelIterBoundaryContext)
   663  		mlevelAlloc[0].iter = li
   664  		mlevelAlloc = mlevelAlloc[1:]
   665  	}
   666  
   667  	mergingIter := &simpleMergingIter{}
   668  	mergingIter.init(c.merge, c.cmp, c.seqNum, c.formatKey, mlevels...)
   669  	for cont := mergingIter.step(); cont; cont = mergingIter.step() {
   670  	}
   671  	if err := mergingIter.err; err != nil {
   672  		return err
   673  	}
   674  	if c.stats != nil {
   675  		c.stats.NumPoints = mergingIter.numPoints
   676  	}
   677  
   678  	// Phase 2: Check that the tombstones are mutually consistent.
   679  	return checkRangeTombstones(c)
   680  }
   681  
        // simpleMergingIterItem is an element of simpleMergingIterHeap: the current
        // entry of one level.
   682  type simpleMergingIterItem struct {
        	// index is the position of the originating level in
        	// simpleMergingIter.levels.
   683  	index int
        	// key is the entry's key. Its UserKey bytes are copied from the level
        	// iterator's key when the item is created or updated.
   684  	key   InternalKey
        	// value is the entry's value as returned by the level iterator.
   685  	value base.LazyValue
   686  }
   687  
   688  type simpleMergingIterHeap struct {
   689  	cmp     Compare
   690  	reverse bool
   691  	items   []simpleMergingIterItem
   692  }
   693  
   694  func (h *simpleMergingIterHeap) len() int {
   695  	return len(h.items)
   696  }
   697  
   698  func (h *simpleMergingIterHeap) less(i, j int) bool {
   699  	ikey, jkey := h.items[i].key, h.items[j].key
   700  	if c := h.cmp(ikey.UserKey, jkey.UserKey); c != 0 {
   701  		if h.reverse {
   702  			return c > 0
   703  		}
   704  		return c < 0
   705  	}
   706  	if h.reverse {
   707  		return ikey.Trailer < jkey.Trailer
   708  	}
   709  	return ikey.Trailer > jkey.Trailer
   710  }
   711  
   712  func (h *simpleMergingIterHeap) swap(i, j int) {
   713  	h.items[i], h.items[j] = h.items[j], h.items[i]
   714  }
   715  
   716  // init, fix, up and down are copied from the go stdlib.
   717  func (h *simpleMergingIterHeap) init() {
   718  	// heapify
   719  	n := h.len()
   720  	for i := n/2 - 1; i >= 0; i-- {
   721  		h.down(i, n)
   722  	}
   723  }
   724  
   725  func (h *simpleMergingIterHeap) fix(i int) {
   726  	if !h.down(i, h.len()) {
   727  		h.up(i)
   728  	}
   729  }
   730  
   731  func (h *simpleMergingIterHeap) pop() *simpleMergingIterItem {
   732  	n := h.len() - 1
   733  	h.swap(0, n)
   734  	h.down(0, n)
   735  	item := &h.items[n]
   736  	h.items = h.items[:n]
   737  	return item
   738  }
   739  
   740  func (h *simpleMergingIterHeap) up(j int) {
   741  	for {
   742  		i := (j - 1) / 2 // parent
   743  		if i == j || !h.less(j, i) {
   744  			break
   745  		}
   746  		h.swap(i, j)
   747  		j = i
   748  	}
   749  }
   750  
   751  func (h *simpleMergingIterHeap) down(i0, n int) bool {
   752  	i := i0
   753  	for {
   754  		j1 := 2*i + 1
   755  		if j1 >= n || j1 < 0 { // j1 < 0 after int overflow
   756  			break
   757  		}
   758  		j := j1 // left child
   759  		if j2 := j1 + 1; j2 < n && h.less(j2, j1) {
   760  			j = j2 // = 2*i + 2  // right child
   761  		}
   762  		if !h.less(j, i) {
   763  			break
   764  		}
   765  		h.swap(i, j)
   766  		i = j
   767  	}
   768  	return i > i0
   769  }