github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/storage/disk_map.go (about)

     1  // Copyright 2017 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package storage
    12  
    13  import (
    14  	"bytes"
    15  	"context"
    16  	"sync/atomic"
    17  
    18  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/diskmap"
    19  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    20  	"github.com/cockroachdb/cockroach/pkg/util/encoding"
    21  	"github.com/cockroachdb/cockroach/pkg/util/log"
    22  	"github.com/cockroachdb/errors"
    23  	"github.com/cockroachdb/pebble"
    24  )
    25  
    26  // defaultBatchCapacityBytes is the default capacity for a
    27  // SortedDiskMapBatchWriter.
    28  const defaultBatchCapacityBytes = 4096
    29  
// rocksDBMapBatchWriter batches writes to a RocksDBMap.
//
// Put() adds entries to batch and counts them in numPutsSinceFlush. Flush()
// commits batch, resets the counter and replaces batch with a fresh write-only
// batch obtained from store.
type rocksDBMapBatchWriter struct {
	// capacity is the number of bytes to write before a Flush() is triggered.
	capacity int

	// makeKey is a function that transforms a key into an MVCCKey with a prefix
	// to be written to the underlying store.
	makeKey           func(k []byte) MVCCKey
	batch             Batch
	numPutsSinceFlush int
	store             Engine
}
    42  
// rocksDBMapIterator iterates over the keys of a RocksDBMap in sorted order.
// It wraps an engine Iterator and strips the map's prefix from the keys it
// hands back (see UnsafeKey).
type rocksDBMapIterator struct {
	// iter is the underlying engine iterator that all operations delegate to.
	iter Iterator
	// makeKey is a function that transforms a key into an MVCCKey with a prefix
	// used to SeekGE() the underlying iterator.
	makeKey func(k []byte) MVCCKey
	// prefix is the prefix of keys that this iterator iterates over.
	prefix []byte
}
    52  
// rocksDBMap is a SortedDiskMap that uses RocksDB as its underlying storage
// engine.
//
// prefix is prepended to every key written through the map (see makeKey);
// makeKey also uses the prefix slice as scratch space for building keys.
// When allowDuplicates is set, keyID is incremented for every key built for a
// Put and stored in the key's MVCC timestamp (see makeKeyWithTimestamp).
type rocksDBMap struct {
	// TODO(asubiotto): Add memory accounting.
	prefix          []byte
	store           Engine
	allowDuplicates bool
	keyID           int64
}
    62  
    63  var _ diskmap.SortedDiskMapBatchWriter = &rocksDBMapBatchWriter{}
    64  var _ diskmap.SortedDiskMapIterator = &rocksDBMapIterator{}
    65  var _ diskmap.SortedDiskMap = &rocksDBMap{}
    66  
    67  // tempStorageID is the temp ID generator for a node. It generates unique
    68  // prefixes for NewRocksDBMap. It is a global because newRocksDBMap needs to
    69  // prefix its writes uniquely, and using a global prevents users from having to
    70  // specify the prefix themselves and correctly guarantee that it is unique.
    71  var tempStorageID uint64
    72  
    73  func generateTempStorageID() uint64 {
    74  	return atomic.AddUint64(&tempStorageID, 1)
    75  }
    76  
    77  // newRocksDBMap creates a new rocksDBMap with the passed in Engine as the
    78  // underlying store. The rocksDBMap instance will have a keyspace prefixed by a
    79  // unique prefix. The allowDuplicates parameter controls whether Puts with
    80  // identical keys will write multiple entries or overwrite previous entries.
    81  func newRocksDBMap(e Engine, allowDuplicates bool) *rocksDBMap {
    82  	prefix := generateTempStorageID()
    83  	return &rocksDBMap{
    84  		prefix:          encoding.EncodeUvarintAscending([]byte(nil), prefix),
    85  		store:           e,
    86  		allowDuplicates: allowDuplicates,
    87  	}
    88  }
    89  
    90  // makeKey appends k to the rocksDBMap's prefix to keep the key local to this
    91  // instance and creates an MVCCKey, which is what the underlying storage engine
    92  // expects. The returned key is only valid until the next call to makeKey().
    93  func (r *rocksDBMap) makeKey(k []byte) MVCCKey {
    94  	// TODO(asubiotto): We can make this more performant by bypassing MVCCKey
    95  	// creation (have to generalize storage API). See
    96  	// https://github.com/cockroachdb/cockroach/issues/16718#issuecomment-311493414
    97  	prefixLen := len(r.prefix)
    98  	r.prefix = append(r.prefix, k...)
    99  	mvccKey := MVCCKey{Key: r.prefix}
   100  	r.prefix = r.prefix[:prefixLen]
   101  	return mvccKey
   102  }
   103  
   104  // makeKeyWithTimestamp makes a key appropriate for a Put operation. It is like
   105  // makeKey except it respects allowDuplicates, which uses the MVCC timestamp
   106  // field to assign a unique keyID so duplicate keys don't overwrite each other.
   107  func (r *rocksDBMap) makeKeyWithTimestamp(k []byte) MVCCKey {
   108  	mvccKey := r.makeKey(k)
   109  	if r.allowDuplicates {
   110  		r.keyID++
   111  		mvccKey.Timestamp.WallTime = r.keyID
   112  	}
   113  	return mvccKey
   114  }
   115  
   116  // NewIterator implements the SortedDiskMap interface.
   117  func (r *rocksDBMap) NewIterator() diskmap.SortedDiskMapIterator {
   118  	// NOTE: prefix is only false because we can't use the normal prefix
   119  	// extractor. This iterator still only does prefix iteration. See
   120  	// rocksDBMapIterator.Valid().
   121  	return &rocksDBMapIterator{
   122  		iter: r.store.NewIterator(IterOptions{
   123  			UpperBound: roachpb.Key(r.prefix).PrefixEnd(),
   124  		}),
   125  		makeKey: r.makeKey,
   126  		prefix:  r.prefix,
   127  	}
   128  }
   129  
// NewBatchWriter implements the SortedDiskMap interface. It returns a batch
// writer with the default capacity, defaultBatchCapacityBytes.
func (r *rocksDBMap) NewBatchWriter() diskmap.SortedDiskMapBatchWriter {
	return r.NewBatchWriterCapacity(defaultBatchCapacityBytes)
}
   134  
   135  // NewBatchWriterCapacity implements the SortedDiskMap interface.
   136  func (r *rocksDBMap) NewBatchWriterCapacity(capacityBytes int) diskmap.SortedDiskMapBatchWriter {
   137  	makeKey := r.makeKey
   138  	if r.allowDuplicates {
   139  		makeKey = r.makeKeyWithTimestamp
   140  	}
   141  	return &rocksDBMapBatchWriter{
   142  		capacity: capacityBytes,
   143  		makeKey:  makeKey,
   144  		batch:    r.store.NewWriteOnlyBatch(),
   145  		store:    r.store,
   146  	}
   147  }
   148  
   149  // Clear implements the SortedDiskMap interface.
   150  func (r *rocksDBMap) Clear() error {
   151  	if err := r.store.ClearRange(
   152  		MVCCKey{Key: r.prefix},
   153  		MVCCKey{Key: roachpb.Key(r.prefix).PrefixEnd()},
   154  	); err != nil {
   155  		return errors.Wrapf(err, "unable to clear range with prefix %v", r.prefix)
   156  	}
   157  	// NB: we manually flush after performing the clear range to ensure that the
   158  	// range tombstone is pushed to disk which will kick off compactions that
   159  	// will eventually free up the deleted space.
   160  	return r.store.Flush()
   161  }
   162  
   163  // Close implements the SortedDiskMap interface.
   164  func (r *rocksDBMap) Close(ctx context.Context) {
   165  	if err := r.Clear(); err != nil {
   166  		log.Errorf(ctx, "%v", err)
   167  	}
   168  }
   169  
   170  // SeekGE implements the SortedDiskMapIterator interface.
   171  func (i *rocksDBMapIterator) SeekGE(k []byte) {
   172  	i.iter.SeekGE(i.makeKey(k))
   173  }
   174  
   175  // Rewind implements the SortedDiskMapIterator interface.
   176  func (i *rocksDBMapIterator) Rewind() {
   177  	i.iter.SeekGE(i.makeKey(nil))
   178  }
   179  
   180  // Valid implements the SortedDiskMapIterator interface.
   181  func (i *rocksDBMapIterator) Valid() (bool, error) {
   182  	ok, err := i.iter.Valid()
   183  	if err != nil {
   184  		return false, err
   185  	}
   186  	if ok && !bytes.HasPrefix(i.iter.UnsafeKey().Key, i.prefix) {
   187  		return false, nil
   188  	}
   189  
   190  	return ok, nil
   191  }
   192  
// Next implements the SortedDiskMapIterator interface. It delegates to the
// underlying iterator's Next().
func (i *rocksDBMapIterator) Next() {
	i.iter.Next()
}
   197  
   198  // UnsafeKey implements the SortedDiskMapIterator interface.
   199  func (i *rocksDBMapIterator) UnsafeKey() []byte {
   200  	return i.iter.UnsafeKey().Key[len(i.prefix):]
   201  }
   202  
// UnsafeValue implements the SortedDiskMapIterator interface. It returns the
// underlying iterator's UnsafeValue() unchanged.
func (i *rocksDBMapIterator) UnsafeValue() []byte {
	return i.iter.UnsafeValue()
}
   207  
// Close implements the SortedDiskMapIterator interface. It closes the
// underlying iterator.
func (i *rocksDBMapIterator) Close() {
	i.iter.Close()
}
   212  
   213  // Put implements the SortedDiskMapBatchWriter interface.
   214  func (b *rocksDBMapBatchWriter) Put(k []byte, v []byte) error {
   215  	if err := b.batch.Put(b.makeKey(k), v); err != nil {
   216  		return err
   217  	}
   218  	b.numPutsSinceFlush++
   219  	if b.batch.Len() >= b.capacity {
   220  		return b.Flush()
   221  	}
   222  	return nil
   223  }
   224  
   225  // Flush implements the SortedDiskMapBatchWriter interface.
   226  func (b *rocksDBMapBatchWriter) Flush() error {
   227  	if b.batch.Empty() {
   228  		return nil
   229  	}
   230  	if err := b.batch.Commit(false /* syncCommit */); err != nil {
   231  		return err
   232  	}
   233  	b.numPutsSinceFlush = 0
   234  	b.batch = b.store.NewWriteOnlyBatch()
   235  	return nil
   236  }
   237  
// NumPutsSinceFlush implements the SortedDiskMapBatchWriter interface. It
// returns the number of successful Put() calls since the last successful
// Flush().
func (b *rocksDBMapBatchWriter) NumPutsSinceFlush() int {
	return b.numPutsSinceFlush
}
   242  
   243  // Close implements the SortedDiskMapBatchWriter interface.
   244  func (b *rocksDBMapBatchWriter) Close(ctx context.Context) error {
   245  	err := b.Flush()
   246  	b.batch.Close()
   247  	return err
   248  }
   249  
// pebbleMapBatchWriter batches writes to a pebbleMap.
//
// Put() adds entries to batch and counts them in numPutsSinceFlush. Flush()
// commits batch, resets the counter and replaces batch with a new batch
// obtained from store.
type pebbleMapBatchWriter struct {
	// capacity is the number of bytes to write before a Flush() is triggered.
	capacity int

	// makeKey is a function that transforms a key into a byte slice with a prefix
	// to be written to the underlying store.
	makeKey           func(k []byte) []byte
	batch             *pebble.Batch
	numPutsSinceFlush int
	store             *pebble.DB
}
   262  
// pebbleMapIterator iterates over the keys of a pebbleMap in sorted order.
//
// allowDuplicates mirrors the owning map's setting; when it is set, UnsafeKey
// strips the 8-byte sequence number from the end of each key. iter is the
// underlying pebble iterator that all operations delegate to.
type pebbleMapIterator struct {
	allowDuplicates bool
	iter            *pebble.Iterator
	// makeKey is a function that transforms a key into a byte slice with a prefix
	// used to SeekGE() the underlying iterator.
	makeKey func(k []byte) []byte
	// prefix is the prefix of keys that this iterator iterates over.
	prefix []byte
}
   273  
// pebbleMap is a SortedDiskMap, similar to rocksDBMap, that uses pebble as its
// underlying storage engine.
//
// prefix is prepended to every key written through the map (see makeKey);
// makeKey also uses the prefix slice as scratch space for building keys.
// When allowDuplicates is set, keyID is incremented for every key built for a
// Put and appended to that key as a sequence number (see makeKeyWithSequence).
type pebbleMap struct {
	prefix          []byte
	store           *pebble.DB
	allowDuplicates bool
	keyID           int64
}
   282  
   283  var _ diskmap.SortedDiskMapBatchWriter = &pebbleMapBatchWriter{}
   284  var _ diskmap.SortedDiskMapIterator = &pebbleMapIterator{}
   285  var _ diskmap.SortedDiskMap = &pebbleMap{}
   286  
   287  // newPebbleMap creates a new pebbleMap with the passed in Engine as the
   288  // underlying store. The pebbleMap instance will have a keyspace prefixed by a
   289  // unique prefix. The allowDuplicates parameter controls whether Puts with
   290  // identical keys will write multiple entries or overwrite previous entries.
   291  func newPebbleMap(e *pebble.DB, allowDuplicates bool) *pebbleMap {
   292  	prefix := generateTempStorageID()
   293  	return &pebbleMap{
   294  		prefix:          encoding.EncodeUvarintAscending([]byte(nil), prefix),
   295  		store:           e,
   296  		allowDuplicates: allowDuplicates,
   297  	}
   298  }
   299  
   300  // makeKey appends k to the pebbleMap's prefix to keep the key local to this
   301  // instance and returns a byte slice containing the user-provided key and the
   302  // prefix. Pebble's operations can take this byte slice as a key. This key is
   303  // only valid until the next call to makeKey.
   304  func (r *pebbleMap) makeKey(k []byte) []byte {
   305  	prefixLen := len(r.prefix)
   306  	r.prefix = append(r.prefix, k...)
   307  	key := r.prefix
   308  	r.prefix = r.prefix[:prefixLen]
   309  	return key
   310  }
   311  
   312  // makeKeyWithSequence makes a key appropriate for a Put operation. It is like
   313  // makeKey except it respects allowDuplicates, by appending a sequence number to
   314  // the user-provided key.
   315  func (r *pebbleMap) makeKeyWithSequence(k []byte) []byte {
   316  	byteKey := r.makeKey(k)
   317  	if r.allowDuplicates {
   318  		r.keyID++
   319  		byteKey = encoding.EncodeUint64Ascending(byteKey, uint64(r.keyID))
   320  	}
   321  	return byteKey
   322  }
   323  
   324  // NewIterator implements the SortedDiskMap interface.
   325  func (r *pebbleMap) NewIterator() diskmap.SortedDiskMapIterator {
   326  	return &pebbleMapIterator{
   327  		allowDuplicates: r.allowDuplicates,
   328  		iter: r.store.NewIter(&pebble.IterOptions{
   329  			UpperBound: roachpb.Key(r.prefix).PrefixEnd(),
   330  		}),
   331  		makeKey: r.makeKey,
   332  		prefix:  r.prefix,
   333  	}
   334  }
   335  
// NewBatchWriter implements the SortedDiskMap interface. It returns a batch
// writer with the default capacity, defaultBatchCapacityBytes.
func (r *pebbleMap) NewBatchWriter() diskmap.SortedDiskMapBatchWriter {
	return r.NewBatchWriterCapacity(defaultBatchCapacityBytes)
}
   340  
   341  // NewBatchWriterCapacity implements the SortedDiskMap interface.
   342  func (r *pebbleMap) NewBatchWriterCapacity(capacityBytes int) diskmap.SortedDiskMapBatchWriter {
   343  	makeKey := r.makeKey
   344  	if r.allowDuplicates {
   345  		makeKey = r.makeKeyWithSequence
   346  	}
   347  	return &pebbleMapBatchWriter{
   348  		capacity: capacityBytes,
   349  		makeKey:  makeKey,
   350  		batch:    r.store.NewBatch(),
   351  		store:    r.store,
   352  	}
   353  }
   354  
   355  // Clear implements the SortedDiskMap interface.
   356  func (r *pebbleMap) Clear() error {
   357  	if err := r.store.DeleteRange(
   358  		r.prefix,
   359  		roachpb.Key(r.prefix).PrefixEnd(),
   360  		pebble.NoSync,
   361  	); err != nil {
   362  		return errors.Wrapf(err, "unable to clear range with prefix %v", r.prefix)
   363  	}
   364  	// NB: we manually flush after performing the clear range to ensure that the
   365  	// range tombstone is pushed to disk which will kick off compactions that
   366  	// will eventually free up the deleted space.
   367  	_, err := r.store.AsyncFlush()
   368  	return err
   369  }
   370  
   371  // Close implements the SortedDiskMap interface.
   372  func (r *pebbleMap) Close(ctx context.Context) {
   373  	if err := r.Clear(); err != nil {
   374  		log.Errorf(ctx, "%v", err)
   375  	}
   376  }
   377  
   378  // SeekGE implements the SortedDiskMapIterator interface.
   379  func (i *pebbleMapIterator) SeekGE(k []byte) {
   380  	i.iter.SeekGE(i.makeKey(k))
   381  }
   382  
   383  // Rewind implements the SortedDiskMapIterator interface.
   384  func (i *pebbleMapIterator) Rewind() {
   385  	i.iter.SeekGE(i.makeKey(nil))
   386  }
   387  
   388  // Valid implements the SortedDiskMapIterator interface.
   389  func (i *pebbleMapIterator) Valid() (bool, error) {
   390  	return i.iter.Valid(), nil
   391  }
   392  
// Next implements the SortedDiskMapIterator interface. It delegates to the
// underlying pebble iterator's Next(); the value Next() returns is not used
// here.
func (i *pebbleMapIterator) Next() {
	i.iter.Next()
}
   397  
   398  // UnsafeKey implements the SortedDiskMapIterator interface.
   399  func (i *pebbleMapIterator) UnsafeKey() []byte {
   400  	unsafeKey := i.iter.Key()
   401  	end := len(unsafeKey)
   402  	if i.allowDuplicates {
   403  		// There are 8 bytes of sequence number at the end of the key, remove them.
   404  		end -= 8
   405  	}
   406  	return unsafeKey[len(i.prefix):end]
   407  }
   408  
// UnsafeValue implements the SortedDiskMapIterator interface. It returns the
// underlying pebble iterator's Value() unchanged.
func (i *pebbleMapIterator) UnsafeValue() []byte {
	return i.iter.Value()
}
   413  
// Close implements the SortedDiskMapIterator interface. It closes the
// underlying pebble iterator.
func (i *pebbleMapIterator) Close() {
	// This method has no error return, so the error from closing the pebble
	// iterator is deliberately discarded.
	_ = i.iter.Close()
}
   418  
   419  // Put implements the SortedDiskMapBatchWriter interface.
   420  func (b *pebbleMapBatchWriter) Put(k []byte, v []byte) error {
   421  	key := b.makeKey(k)
   422  	if err := b.batch.Set(key, v, nil); err != nil {
   423  		return err
   424  	}
   425  	b.numPutsSinceFlush++
   426  	if len(b.batch.Repr()) >= b.capacity {
   427  		return b.Flush()
   428  	}
   429  	return nil
   430  }
   431  
   432  // Flush implements the SortedDiskMapBatchWriter interface.
   433  func (b *pebbleMapBatchWriter) Flush() error {
   434  	if err := b.batch.Commit(pebble.NoSync); err != nil {
   435  		return err
   436  	}
   437  	b.numPutsSinceFlush = 0
   438  	b.batch = b.store.NewBatch()
   439  	return nil
   440  }
   441  
// NumPutsSinceFlush implements the SortedDiskMapBatchWriter interface. It
// returns the number of successful Put() calls since the last successful
// Flush().
func (b *pebbleMapBatchWriter) NumPutsSinceFlush() int {
	return b.numPutsSinceFlush
}
   446  
   447  // Close implements the SortedDiskMapBatchWriter interface.
   448  func (b *pebbleMapBatchWriter) Close(ctx context.Context) error {
   449  	err := b.Flush()
   450  	if err != nil {
   451  		return err
   452  	}
   453  	return b.batch.Close()
   454  }