github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/internal/rangedel/fragmenter.go

github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/internal/rangedel/fragmenter.go (about)

     1  // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package rangedel
     6  
     7  import (
     8  	"fmt"
     9  	"sort"
    10  
    11  	"github.com/petermattis/pebble/internal/base"
    12  )
    13  
    14  type tombstonesByStartKey struct {
    15  	cmp base.Compare
    16  	buf []Tombstone
    17  }
    18  
    19  func (v *tombstonesByStartKey) Len() int { return len(v.buf) }
    20  func (v *tombstonesByStartKey) Less(i, j int) bool {
    21  	return base.InternalCompare(v.cmp, v.buf[i].Start, v.buf[j].Start) < 0
    22  }
    23  func (v *tombstonesByStartKey) Swap(i, j int) {
    24  	v.buf[i], v.buf[j] = v.buf[j], v.buf[i]
    25  }
    26  
    27  type tombstonesByEndKey struct {
    28  	cmp base.Compare
    29  	buf []Tombstone
    30  }
    31  
    32  func (v *tombstonesByEndKey) Len() int { return len(v.buf) }
    33  func (v *tombstonesByEndKey) Less(i, j int) bool {
    34  	return v.cmp(v.buf[i].End, v.buf[j].End) < 0
    35  }
    36  func (v *tombstonesByEndKey) Swap(i, j int) {
    37  	v.buf[i], v.buf[j] = v.buf[j], v.buf[i]
    38  }
    39  
    40  type tombstonesBySeqNum []Tombstone
    41  
    42  func (v *tombstonesBySeqNum) Len() int { return len(*v) }
    43  func (v *tombstonesBySeqNum) Less(i, j int) bool {
    44  	return (*v)[i].Start.SeqNum() > (*v)[j].Start.SeqNum()
    45  }
    46  func (v *tombstonesBySeqNum) Swap(i, j int) {
    47  	(*v)[i], (*v)[j] = (*v)[j], (*v)[i]
    48  }
    49  
    50  // Sort the tombstones by start key. This is the ordering required by the
    51  // Fragmenter. Usually tombstones are naturally sorted by their start key, but
    52  // that isn't true for tombstones in the legacy range-del-v1 block format.
    53  func Sort(cmp base.Compare, tombstones []Tombstone) {
    54  	sorter := tombstonesByStartKey{
    55  		cmp: cmp,
    56  		buf: tombstones,
    57  	}
    58  	sort.Sort(&sorter)
    59  }
    60  
// Fragmenter fragments a set of range tombstones such that overlapping
// tombstones are split at their overlap points. The fragmented tombstones are
// output to the supplied Emit function.
type Fragmenter struct {
	// Cmp compares user keys. It is used to order and compare the start and
	// end keys of the tombstones.
	Cmp base.Compare
	// Emit is called to emit a chunk of tombstone fragments. Every tombstone
	// within the chunk has the same start and end key, and differ only by
	// sequence number. The chunk is sorted by descending sequence number. The
	// slice passed to Emit is reused for the next chunk, so Emit must copy any
	// tombstones it needs to retain.
	Emit func([]Tombstone)
	// pending contains the list of pending range tombstone fragments that have
	// not been flushed to the block writer. Note that the tombstones have not
	// been fragmented on the end keys yet. That happens as the tombstones are
	// flushed.
	//
	// doneBuf, sortBuf and flushBuf are scratch buffers that are reused across
	// calls: doneBuf collects the tombstones that truncateAndFlush hands to
	// flush, sortBuf sorts a group of tombstones by end key, and flushBuf holds
	// the chunk that is passed to Emit. finished is set by Finish, after which
	// Add, Deleted, FlushTo and Finish panic.
	pending  []Tombstone
	doneBuf  []Tombstone
	sortBuf  tombstonesByEndKey
	flushBuf tombstonesBySeqNum
	finished bool
}
    80  
    81  func (f *Fragmenter) checkSameStart(buf []Tombstone) {
    82  	for i := 1; i < len(buf); i++ {
    83  		if f.Cmp(buf[i-1].Start.UserKey, buf[i].Start.UserKey) != 0 {
    84  			panic(fmt.Sprintf("pebble: pending tombstone invariant violated: %s %s",
    85  				buf[i-1].Start, buf[i].Start))
    86  		}
    87  	}
    88  }
    89  
// checkInvariants panics if the pending tombstones do not all share the same
// start user key.
func (f *Fragmenter) checkInvariants() {
	f.checkSameStart(f.pending)
}
    93  
    94  // Add adds a tombstone to the fragmenter. Tombstones may overlap and the
    95  // fragmenter will internally split them. The tombstones must be presented in
    96  // increasing start key order. That is, Add must be called with a series of
    97  // tombstones like:
    98  //
    99  //   a---e
   100  //     c---g
   101  //     c-----i
   102  //            j---n
   103  //            j-l
   104  //
   105  // We need to fragment the tombstones at overlap points. In the above
   106  // example, we'd create:
   107  //
   108  //   a-c-e
   109  //     c-e-g
   110  //     c-e-g-i
   111  //            j-l-n
   112  //            j-l
   113  //
   114  // The fragments need to be output sorted by start key, and for equal start
   115  // keys, sorted by descending sequence number. This last part requires a mild
   116  // bit of care as the fragments are not created in descending sequence number
   117  // order.
   118  //
   119  // Once a start key has been seen, we know that we'll never see a smaller
   120  // start key and can thus flush all of the fragments that lie before that
   121  // start key.
   122  //
   123  // Walking through the example above, we start with:
   124  //
   125  //   a---e
   126  //
   127  // Next we add [c,g) resulting in:
   128  //
   129  //   a-c-e
   130  //     c---g
   131  //
   132  // The fragment [a,c) is flushed leaving the pending tombstones as:
   133  //
   134  //   c-e
   135  //   c---g
   136  //
   137  // The next tombstone is [c,i):
   138  //
   139  //   c-e
   140  //   c---g
   141  //   c-----i
   142  //
   143  // No fragments are flushed. The next tombstone is [j,n):
   144  //
   145  //   c-e
   146  //   c---g
   147  //   c-----i
   148  //          j---n
   149  //
   150  // The fragments [c,e), [c,g) and [c,i) are flushed. We sort these fragments
   151  // by their end key, then split the fragments on the end keys:
   152  //
   153  //   c-e
   154  //   c-e-g
   155  //   c-e---i
   156  //
   157  // The [c,e) fragments all get flushed leaving:
   158  //
   159  //   e-g
   160  //   e---i
   161  //
   162  // This process continues until there are no more fragments to flush.
   163  //
   164  // WARNING: the slices backing start.UserKey and end are retained after this
   165  // method returns and should not be modified. This is safe for tombstones that
   166  // are added from a memtable or batch. It is not safe for a tombstone added
   167  // from an sstable where the range-del block has been prefix compressed.
   168  func (f *Fragmenter) Add(start base.InternalKey, end []byte) {
   169  	if f.finished {
   170  		panic("pebble: tombstone fragmenter already finished")
   171  	}
   172  	if raceEnabled {
   173  		f.checkInvariants()
   174  		defer f.checkInvariants()
   175  	}
   176  
   177  	if len(f.pending) > 0 {
   178  		// Since all of the pending tombstones have the same start key, we only need
   179  		// to compare against the first one.
   180  		switch c := f.Cmp(f.pending[0].Start.UserKey, start.UserKey); {
   181  		case c > 0:
   182  			panic(fmt.Sprintf("pebble: keys must be added in order: %s > %s",
   183  				f.pending[0].Start, start))
   184  		case c == 0:
   185  			// The new tombstone has the same start key as the existing pending
   186  			// tombstones. Add it to the pending buffer.
   187  			f.pending = append(f.pending, Tombstone{
   188  				Start: start,
   189  				End:   end,
   190  			})
   191  			return
   192  		}
   193  
   194  		// At this point we know that the new start key is greater than the pending
   195  		// tombstones start keys.
   196  		f.truncateAndFlush(start.UserKey)
   197  	}
   198  
   199  	f.pending = append(f.pending, Tombstone{
   200  		Start: start,
   201  		End:   end,
   202  	})
   203  }
   204  
   205  // Deleted returns true if the specified key is covered by one of the pending
   206  // tombstones. The key must be consistent with the ordering of the
   207  // tombstones. That is, it is invalid to specify a key here that is out of
   208  // order with the tombstone start keys passed to Add.
   209  func (f *Fragmenter) Deleted(key base.InternalKey, snapshot uint64) bool {
   210  	if f.finished {
   211  		panic("pebble: tombstone fragmenter already finished")
   212  	}
   213  	if len(f.pending) == 0 {
   214  		return false
   215  	}
   216  
   217  	if f.Cmp(f.pending[0].Start.UserKey, key.UserKey) > 0 {
   218  		panic(fmt.Sprintf("pebble: keys must be in order: %s > %s",
   219  			f.pending[0].Start, key))
   220  	}
   221  
   222  	seqNum := key.SeqNum()
   223  	flush := true
   224  	for _, t := range f.pending {
   225  		if f.Cmp(key.UserKey, t.End) < 0 {
   226  			// NB: A range deletion tombstone deletes a point operation at the same
   227  			// sequence number.
   228  			if t.Start.Visible(snapshot) && t.Start.SeqNum() > seqNum {
   229  				return true
   230  			}
   231  			flush = false
   232  		}
   233  	}
   234  
   235  	if flush {
   236  		// All of the pending tombstones ended before the specified key which means
   237  		// we can flush them without causing fragmentation at key. This is an
   238  		// optimization to allow flushing the pending tombstones as early as
   239  		// possible so that we don't have to continually reconsider them in
   240  		// Deleted.
   241  		f.flush(f.pending, true /* all */)
   242  		f.pending = f.pending[:0]
   243  	}
   244  	return false
   245  }
   246  
   247  // FlushTo flushes all of the fragments before key. Used internally by Add to
   248  // flush tombstone fragments, and can be used externally to fragment tombstones
   249  // during compaction when a tombstone straddles an sstable boundary.
   250  func (f *Fragmenter) FlushTo(key []byte) {
   251  	if f.finished {
   252  		panic("pebble: tombstone fragmenter already finished")
   253  	}
   254  	if len(f.pending) == 0 {
   255  		return
   256  	}
   257  	// Since all of the pending tombstones have the same start key, we only need
   258  	// to compare against the first one.
   259  	switch c := f.Cmp(f.pending[0].Start.UserKey, key); {
   260  	case c > 0:
   261  		panic(fmt.Sprintf("pebble: keys must be in order: %s > %s",
   262  			f.pending[0].Start, key))
   263  	}
   264  
   265  	// At this point we know that the new start key is greater than the pending
   266  	// tombstones start keys. We flush the pending first set of fragments for the
   267  	// pending tombstones.
   268  	f.flush(f.pending, false /* all */)
   269  
   270  	for i := range f.pending {
   271  		f.pending[i].Start.UserKey = key
   272  	}
   273  }
   274  
   275  func (f *Fragmenter) truncateAndFlush(key []byte) {
   276  	done := f.doneBuf[:0]
   277  	pending := f.pending
   278  	f.pending = f.pending[:0]
   279  
   280  	for _, t := range pending {
   281  		if f.Cmp(key, t.End) < 0 {
   282  			//   t: a--+--e
   283  			// new:    c------
   284  			done = append(done, Tombstone{Start: t.Start, End: key})
   285  			f.pending = append(f.pending, Tombstone{
   286  				Start: base.MakeInternalKey(key, t.Start.SeqNum(), t.Start.Kind()),
   287  				End:   t.End,
   288  			})
   289  		} else {
   290  			//   t: a-----e
   291  			// new:       e----
   292  			done = append(done, t)
   293  		}
   294  	}
   295  
   296  	f.doneBuf = done[:0]
   297  	f.flush(done, true /* all */)
   298  }
   299  
   300  // flush a group of range tombstones to the block. The tombstones are required
   301  // to all have the same start key.
   302  func (f *Fragmenter) flush(buf []Tombstone, all bool) {
   303  	if raceEnabled {
   304  		f.checkSameStart(buf)
   305  	}
   306  
   307  	// Sort the tombstones by end key. This will allow us to walk over the
   308  	// tombstones and easily determine the next split point (the smallest
   309  	// end-key).
   310  	f.sortBuf.cmp = f.Cmp
   311  	f.sortBuf.buf = buf
   312  	sort.Sort(&f.sortBuf)
   313  
   314  	// Loop over the range tombstones, splitting by end key.
   315  	for len(buf) > 0 {
   316  		remove := 1
   317  		split := buf[0].End
   318  		f.flushBuf = append(f.flushBuf[:0], buf[0])
   319  
   320  		for i := 1; i < len(buf); i++ {
   321  			if f.Cmp(split, buf[i].End) == 0 {
   322  				remove++
   323  			}
   324  			f.flushBuf = append(f.flushBuf, Tombstone{
   325  				Start: buf[i].Start,
   326  				End:   split,
   327  			})
   328  		}
   329  
   330  		buf = buf[remove:]
   331  
   332  		sort.Sort(&f.flushBuf)
   333  		f.Emit(f.flushBuf)
   334  
   335  		if !all {
   336  			break
   337  		}
   338  
   339  		// Adjust the start key for every remaining tombstone.
   340  		for i := range buf {
   341  			buf[i].Start.UserKey = split
   342  		}
   343  	}
   344  }
   345  
// Finish flushes any remaining fragments to the output. It is an error to call
// this if any other tombstones will be added: once Finish has been called,
// Add, Deleted, FlushTo and Finish itself panic.
func (f *Fragmenter) Finish() {
	if f.finished {
		panic("pebble: tombstone fragmenter already finished")
	}
	// Fragment and emit everything that is still pending before marking the
	// fragmenter as finished.
	f.flush(f.pending, true /* all */)
	f.finished = true
}
   354  }