github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/sstable/suffix_rewriter.go (about)

     1  package sstable
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"math"
     7  	"sync"
     8  
     9  	"github.com/cespare/xxhash/v2"
    10  	"github.com/cockroachdb/errors"
    11  	"github.com/cockroachdb/pebble/internal/base"
    12  	"github.com/cockroachdb/pebble/internal/bytealloc"
    13  	"github.com/cockroachdb/pebble/internal/invariants"
    14  	"github.com/cockroachdb/pebble/internal/rangekey"
    15  	"github.com/cockroachdb/pebble/objstorage"
    16  )
    17  
    18  // RewriteKeySuffixes is deprecated.
    19  //
    20  // TODO(sumeer): remove after switching CockroachDB to RewriteKeySuffixesAndReturnFormat.
    21  func RewriteKeySuffixes(
    22  	sst []byte,
    23  	rOpts ReaderOptions,
    24  	out objstorage.Writable,
    25  	o WriterOptions,
    26  	from, to []byte,
    27  	concurrency int,
    28  ) (*WriterMetadata, error) {
    29  	meta, _, err := RewriteKeySuffixesAndReturnFormat(sst, rOpts, out, o, from, to, concurrency)
    30  	return meta, err
    31  }
    32  
    33  // RewriteKeySuffixesAndReturnFormat copies the content of the passed SSTable
    34  // bytes to a new sstable, written to `out`, in which the suffix `from` has is
    35  // replaced with `to` in every key. The input sstable must consist of only
    36  // Sets or RangeKeySets and every key must have `from` as its suffix as
    37  // determined by the Split function of the Comparer in the passed
    38  // WriterOptions. Range deletes must not exist in this sstable, as they will
    39  // be ignored.
    40  //
    41  // Data blocks are rewritten in parallel by `concurrency` workers and then
    42  // assembled into a final SST. Filters are copied from the original SST without
    43  // modification as they are not affected by the suffix, while block and table
    44  // properties are only minimally recomputed.
    45  //
    46  // TODO(sumeer): document limitations, if any, due to this limited
    47  // re-computation of properties (is there any loss of fidelity?).
    48  //
    49  // Any block and table property collectors configured in the WriterOptions must
    50  // implement SuffixReplaceableTableCollector/SuffixReplaceableBlockCollector.
    51  //
    52  // The WriterOptions.TableFormat is ignored, and the output sstable has the
    53  // same TableFormat as the input, which is returned in case the caller wants
    54  // to do some error checking. Suffix rewriting is meant to be efficient, and
    55  // allowing changes in the TableFormat detracts from that efficiency.
    56  //
    57  // Any obsolete bits that key-value pairs may be annotated with are ignored
    58  // and lost during the rewrite. Additionally, the output sstable has the
    59  // pebble.obsolete.is_strict property set to false. These limitations could be
    60  // removed if needed. The current use case for
    61  // RewriteKeySuffixesAndReturnFormat in CockroachDB is for MVCC-compliant file
    62  // ingestion, where these files do not contain RANGEDELs and have one
    63  // key-value pair per userkey -- so they trivially satisfy the strict
    64  // criteria, and we don't need the obsolete bit as a performance optimization.
    65  // For disaggregated storage, strict obsolete sstables are needed for L5 and
    66  // L6, but at the time of writing, we expect such MVCC-compliant file
    67  // ingestion to only ingest into levels L4 and higher. If this changes, we can
    68  // do one of two things to get rid of this limitation:
    69  //   - Validate that there are no duplicate userkeys and no RANGEDELs/MERGEs
    70  //     in the sstable to be rewritten. Validating no duplicate userkeys is
    71  //     non-trivial when rewriting blocks in parallel, so we could encode the
    72  //     pre-existing condition in the (existing) SnapshotPinnedKeys property --
    73  //     we need to update the external sst writer to calculate and encode this
    74  //     property.
    75  //   - Preserve the obsolete bit (with changes to the blockIter).
    76  func RewriteKeySuffixesAndReturnFormat(
    77  	sst []byte,
    78  	rOpts ReaderOptions,
    79  	out objstorage.Writable,
    80  	o WriterOptions,
    81  	from, to []byte,
    82  	concurrency int,
    83  ) (*WriterMetadata, TableFormat, error) {
    84  	r, err := NewMemReader(sst, rOpts)
    85  	if err != nil {
    86  		return nil, TableFormatUnspecified, err
    87  	}
    88  	defer r.Close()
    89  	return rewriteKeySuffixesInBlocks(r, out, o, from, to, concurrency)
    90  }
    91  
    92  func rewriteKeySuffixesInBlocks(
    93  	r *Reader, out objstorage.Writable, o WriterOptions, from, to []byte, concurrency int,
    94  ) (*WriterMetadata, TableFormat, error) {
    95  	if o.Comparer == nil || o.Comparer.Split == nil {
    96  		return nil, TableFormatUnspecified,
    97  			errors.New("a valid splitter is required to rewrite suffixes")
    98  	}
    99  	if concurrency < 1 {
   100  		return nil, TableFormatUnspecified, errors.New("concurrency must be >= 1")
   101  	}
   102  	// Even though NumValueBlocks = 0 => NumValuesInValueBlocks = 0, check both
   103  	// as a defensive measure.
   104  	if r.Properties.NumValueBlocks > 0 || r.Properties.NumValuesInValueBlocks > 0 {
   105  		return nil, TableFormatUnspecified,
   106  			errors.New("sstable with a single suffix should not have value blocks")
   107  	}
   108  
   109  	tableFormat := r.tableFormat
   110  	o.TableFormat = tableFormat
   111  	w := NewWriter(out, o)
   112  	defer func() {
   113  		if w != nil {
   114  			w.Close()
   115  		}
   116  	}()
   117  
   118  	for _, c := range w.propCollectors {
   119  		if _, ok := c.(SuffixReplaceableTableCollector); !ok {
   120  			return nil, TableFormatUnspecified,
   121  				errors.Errorf("property collector %s does not support suffix replacement", c.Name())
   122  		}
   123  	}
   124  	for _, c := range w.blockPropCollectors {
   125  		if _, ok := c.(SuffixReplaceableBlockCollector); !ok {
   126  			return nil, TableFormatUnspecified,
   127  				errors.Errorf("block property collector %s does not support suffix replacement", c.Name())
   128  		}
   129  	}
   130  
   131  	l, err := r.Layout()
   132  	if err != nil {
   133  		return nil, TableFormatUnspecified, errors.Wrap(err, "reading layout")
   134  	}
   135  
   136  	if err := rewriteDataBlocksToWriter(r, w, l.Data, from, to, w.split, concurrency); err != nil {
   137  		return nil, TableFormatUnspecified, errors.Wrap(err, "rewriting data blocks")
   138  	}
   139  
   140  	// Copy over the range key block and replace suffixes in it if it exists.
   141  	if err := rewriteRangeKeyBlockToWriter(r, w, from, to); err != nil {
   142  		return nil, TableFormatUnspecified, errors.Wrap(err, "rewriting range key blocks")
   143  	}
   144  
   145  	// Copy over the filter block if it exists (rewriteDataBlocksToWriter will
   146  	// already have ensured this is valid if it exists).
   147  	if w.filter != nil && l.Filter.Length > 0 {
   148  		filterBlock, _, err := readBlockBuf(r, l.Filter, nil)
   149  		if err != nil {
   150  			return nil, TableFormatUnspecified, errors.Wrap(err, "reading filter")
   151  		}
   152  		w.filter = copyFilterWriter{
   153  			origPolicyName: w.filter.policyName(), origMetaName: w.filter.metaName(), data: filterBlock,
   154  		}
   155  	}
   156  
   157  	if err := w.Close(); err != nil {
   158  		w = nil
   159  		return nil, TableFormatUnspecified, err
   160  	}
   161  	writerMeta, err := w.Metadata()
   162  	w = nil
   163  	return writerMeta, tableFormat, err
   164  }
   165  
// errBadKind is returned by the rewriters in this file when a point key is not
// a Set or a range key is not a RangeKeySet.
var errBadKind = errors.New("key does not have expected kind (set)")
   167  
// blockWithSpan is the result of rewriting one data block: the finished block
// bytes plus the first and last rewritten keys the block contains.
type blockWithSpan struct {
	// start and end are the first and last (suffix-replaced) keys in the block.
	start, end InternalKey
	// data is the finished block followed by its block trailer.
	data       []byte
}
   172  
   173  func rewriteBlocks(
   174  	r *Reader,
   175  	restartInterval int,
   176  	checksumType ChecksumType,
   177  	compression Compression,
   178  	input []BlockHandleWithProperties,
   179  	output []blockWithSpan,
   180  	totalWorkers, worker int,
   181  	from, to []byte,
   182  	split Split,
   183  ) error {
   184  	bw := blockWriter{
   185  		restartInterval: restartInterval,
   186  	}
   187  	buf := blockBuf{checksummer: checksummer{checksumType: checksumType}}
   188  	if checksumType == ChecksumTypeXXHash {
   189  		buf.checksummer.xxHasher = xxhash.New()
   190  	}
   191  
   192  	var blockAlloc bytealloc.A
   193  	var keyAlloc bytealloc.A
   194  	var scratch InternalKey
   195  
   196  	var inputBlock, inputBlockBuf []byte
   197  
   198  	iter := &blockIter{}
   199  
   200  	// We'll assume all blocks are _roughly_ equal so round-robin static partition
   201  	// of each worker doing every ith block is probably enough.
   202  	for i := worker; i < len(input); i += totalWorkers {
   203  		bh := input[i]
   204  
   205  		var err error
   206  		inputBlock, inputBlockBuf, err = readBlockBuf(r, bh.BlockHandle, inputBlockBuf)
   207  		if err != nil {
   208  			return err
   209  		}
   210  		if err := iter.init(r.Compare, inputBlock, r.Properties.GlobalSeqNum, false); err != nil {
   211  			return err
   212  		}
   213  
   214  		if cap(bw.restarts) < int(iter.restarts) {
   215  			bw.restarts = make([]uint32, 0, iter.restarts)
   216  		}
   217  		if cap(bw.buf) == 0 {
   218  			bw.buf = make([]byte, 0, len(inputBlock))
   219  		}
   220  		if cap(bw.restarts) < int(iter.numRestarts) {
   221  			bw.restarts = make([]uint32, 0, iter.numRestarts)
   222  		}
   223  
   224  		for key, val := iter.First(); key != nil; key, val = iter.Next() {
   225  			if key.Kind() != InternalKeyKindSet {
   226  				return errBadKind
   227  			}
   228  			si := split(key.UserKey)
   229  			oldSuffix := key.UserKey[si:]
   230  			if !bytes.Equal(oldSuffix, from) {
   231  				err := errors.Errorf("key has suffix %q, expected %q", oldSuffix, from)
   232  				return err
   233  			}
   234  			newLen := si + len(to)
   235  			if cap(scratch.UserKey) < newLen {
   236  				scratch.UserKey = make([]byte, 0, len(key.UserKey)*2+len(to)-len(from))
   237  			}
   238  
   239  			scratch.Trailer = key.Trailer
   240  			scratch.UserKey = scratch.UserKey[:newLen]
   241  			copy(scratch.UserKey, key.UserKey[:si])
   242  			copy(scratch.UserKey[si:], to)
   243  
   244  			// NB: for TableFormatPebblev3 and higher, since
   245  			// !iter.lazyValueHandling.hasValuePrefix, it will return the raw value
   246  			// in the block, which includes the 1-byte prefix. This is fine since bw
   247  			// also does not know about the prefix and will preserve it in bw.add.
   248  			v := val.InPlaceValue()
   249  			if invariants.Enabled && r.tableFormat >= TableFormatPebblev3 &&
   250  				key.Kind() == InternalKeyKindSet {
   251  				if len(v) < 1 {
   252  					return errors.Errorf("value has no prefix")
   253  				}
   254  				prefix := valuePrefix(v[0])
   255  				if isValueHandle(prefix) {
   256  					return errors.Errorf("value prefix is incorrect")
   257  				}
   258  				if setHasSamePrefix(prefix) {
   259  					return errors.Errorf("multiple keys with same key prefix")
   260  				}
   261  			}
   262  			bw.add(scratch, v)
   263  			if output[i].start.UserKey == nil {
   264  				keyAlloc, output[i].start = cloneKeyWithBuf(scratch, keyAlloc)
   265  			}
   266  		}
   267  		*iter = iter.resetForReuse()
   268  
   269  		keyAlloc, output[i].end = cloneKeyWithBuf(scratch, keyAlloc)
   270  
   271  		finished := compressAndChecksum(bw.finish(), compression, &buf)
   272  
   273  		// copy our finished block into the output buffer.
   274  		blockAlloc, output[i].data = blockAlloc.Alloc(len(finished) + blockTrailerLen)
   275  		copy(output[i].data, finished)
   276  		copy(output[i].data[len(finished):], buf.tmp[:blockTrailerLen])
   277  	}
   278  	return nil
   279  }
   280  
   281  func rewriteDataBlocksToWriter(
   282  	r *Reader,
   283  	w *Writer,
   284  	data []BlockHandleWithProperties,
   285  	from, to []byte,
   286  	split Split,
   287  	concurrency int,
   288  ) error {
   289  	if r.Properties.NumEntries == 0 {
   290  		// No point keys.
   291  		return nil
   292  	}
   293  	blocks := make([]blockWithSpan, len(data))
   294  
   295  	if w.filter != nil {
   296  		if r.Properties.FilterPolicyName != w.filter.policyName() {
   297  			return errors.New("mismatched filters")
   298  		}
   299  		if was, is := r.Properties.ComparerName, w.props.ComparerName; was != is {
   300  			return errors.Errorf("mismatched Comparer %s vs %s, replacement requires same splitter to copy filters", was, is)
   301  		}
   302  	}
   303  
   304  	g := &sync.WaitGroup{}
   305  	g.Add(concurrency)
   306  	errCh := make(chan error, concurrency)
   307  	for i := 0; i < concurrency; i++ {
   308  		worker := i
   309  		go func() {
   310  			defer g.Done()
   311  			err := rewriteBlocks(
   312  				r,
   313  				w.dataBlockBuf.dataBlock.restartInterval,
   314  				w.blockBuf.checksummer.checksumType,
   315  				w.compression,
   316  				data,
   317  				blocks,
   318  				concurrency,
   319  				worker,
   320  				from, to,
   321  				split,
   322  			)
   323  			if err != nil {
   324  				errCh <- err
   325  			}
   326  		}()
   327  	}
   328  	g.Wait()
   329  	close(errCh)
   330  	if err, ok := <-errCh; ok {
   331  		return err
   332  	}
   333  
   334  	for _, p := range w.propCollectors {
   335  		if err := p.(SuffixReplaceableTableCollector).UpdateKeySuffixes(r.Properties.UserProperties, from, to); err != nil {
   336  			return err
   337  		}
   338  	}
   339  
   340  	var decoder blockPropertiesDecoder
   341  	var oldShortIDs []shortID
   342  	var oldProps [][]byte
   343  	if len(w.blockPropCollectors) > 0 {
   344  		oldProps = make([][]byte, len(w.blockPropCollectors))
   345  		oldShortIDs = make([]shortID, math.MaxUint8)
   346  		for i, p := range w.blockPropCollectors {
   347  			if prop, ok := r.Properties.UserProperties[p.Name()]; ok {
   348  				was, is := shortID(byte(prop[0])), shortID(i)
   349  				oldShortIDs[was] = is
   350  			}
   351  		}
   352  	}
   353  
   354  	for i := range blocks {
   355  		// Write the rewritten block to the file.
   356  		if err := w.writable.Write(blocks[i].data); err != nil {
   357  			return err
   358  		}
   359  
   360  		n := len(blocks[i].data)
   361  		bh := BlockHandle{Offset: w.meta.Size, Length: uint64(n) - blockTrailerLen}
   362  		// Update the overall size.
   363  		w.meta.Size += uint64(n)
   364  
   365  		// Load any previous values for our prop collectors into oldProps.
   366  		for i := range oldProps {
   367  			oldProps[i] = nil
   368  		}
   369  		decoder.props = data[i].Props
   370  		for !decoder.done() {
   371  			id, val, err := decoder.next()
   372  			if err != nil {
   373  				return err
   374  			}
   375  			oldProps[oldShortIDs[id]] = val
   376  		}
   377  
   378  		for i, p := range w.blockPropCollectors {
   379  			if err := p.(SuffixReplaceableBlockCollector).UpdateKeySuffixes(oldProps[i], from, to); err != nil {
   380  				return err
   381  			}
   382  		}
   383  
   384  		bhp, err := w.maybeAddBlockPropertiesToBlockHandle(bh)
   385  		if err != nil {
   386  			return err
   387  		}
   388  		var nextKey InternalKey
   389  		if i+1 < len(blocks) {
   390  			nextKey = blocks[i+1].start
   391  		}
   392  		if err = w.addIndexEntrySync(blocks[i].end, nextKey, bhp, w.dataBlockBuf.tmp[:]); err != nil {
   393  			return err
   394  		}
   395  	}
   396  
   397  	w.meta.updateSeqNum(blocks[0].start.SeqNum())
   398  	w.props.NumEntries = r.Properties.NumEntries
   399  	w.props.RawKeySize = r.Properties.RawKeySize
   400  	w.props.RawValueSize = r.Properties.RawValueSize
   401  	w.meta.SetSmallestPointKey(blocks[0].start)
   402  	w.meta.SetLargestPointKey(blocks[len(blocks)-1].end)
   403  	return nil
   404  }
   405  
   406  func rewriteRangeKeyBlockToWriter(r *Reader, w *Writer, from, to []byte) error {
   407  	iter, err := r.NewRawRangeKeyIter()
   408  	if err != nil {
   409  		return err
   410  	}
   411  	if iter == nil {
   412  		// No range keys.
   413  		return nil
   414  	}
   415  	defer iter.Close()
   416  
   417  	for s := iter.First(); s != nil; s = iter.Next() {
   418  		if !s.Valid() {
   419  			break
   420  		}
   421  		for i := range s.Keys {
   422  			if s.Keys[i].Kind() != base.InternalKeyKindRangeKeySet {
   423  				return errBadKind
   424  			}
   425  			if !bytes.Equal(s.Keys[i].Suffix, from) {
   426  				return errors.Errorf("key has suffix %q, expected %q", s.Keys[i].Suffix, from)
   427  			}
   428  			s.Keys[i].Suffix = to
   429  		}
   430  
   431  		err := rangekey.Encode(s, func(k base.InternalKey, v []byte) error {
   432  			// Calling AddRangeKey instead of addRangeKeySpan bypasses the fragmenter.
   433  			// This is okay because the raw fragments off of `iter` are already
   434  			// fragmented, and suffix replacement should not affect fragmentation.
   435  			return w.AddRangeKey(k, v)
   436  		})
   437  		if err != nil {
   438  			return err
   439  		}
   440  	}
   441  
   442  	return nil
   443  }
   444  
   445  type copyFilterWriter struct {
   446  	origMetaName   string
   447  	origPolicyName string
   448  	data           []byte
   449  }
   450  
   451  func (copyFilterWriter) addKey(key []byte)         { panic("unimplemented") }
   452  func (c copyFilterWriter) finish() ([]byte, error) { return c.data, nil }
   453  func (c copyFilterWriter) metaName() string        { return c.origMetaName }
   454  func (c copyFilterWriter) policyName() string      { return c.origPolicyName }
   455  
   456  // RewriteKeySuffixesViaWriter is similar to RewriteKeySuffixes but uses just a
   457  // single loop over the Reader that writes each key to the Writer with the new
   458  // suffix. The is significantly slower than the parallelized rewriter, and does
   459  // more work to rederive filters, props, etc.
   460  //
   461  // Any obsolete bits that key-value pairs may be annotated with are ignored
   462  // and lost during the rewrite. Some of the obsolete bits may be recreated --
   463  // specifically when there are multiple keys with the same user key.
   464  // Additionally, the output sstable has the pebble.obsolete.is_strict property
   465  // set to false. See the longer comment at RewriteKeySuffixesAndReturnFormat.
   466  func RewriteKeySuffixesViaWriter(
   467  	r *Reader, out objstorage.Writable, o WriterOptions, from, to []byte,
   468  ) (*WriterMetadata, error) {
   469  	if o.Comparer == nil || o.Comparer.Split == nil {
   470  		return nil, errors.New("a valid splitter is required to rewrite suffixes")
   471  	}
   472  
   473  	o.IsStrictObsolete = false
   474  	w := NewWriter(out, o)
   475  	defer func() {
   476  		if w != nil {
   477  			w.Close()
   478  		}
   479  	}()
   480  	i, err := r.NewIter(nil, nil)
   481  	if err != nil {
   482  		return nil, err
   483  	}
   484  	defer i.Close()
   485  
   486  	k, v := i.First()
   487  	var scratch InternalKey
   488  	for k != nil {
   489  		if k.Kind() != InternalKeyKindSet {
   490  			return nil, errors.New("invalid key type")
   491  		}
   492  		oldSuffix := k.UserKey[r.Split(k.UserKey):]
   493  		if !bytes.Equal(oldSuffix, from) {
   494  			return nil, errors.Errorf("key has suffix %q, expected %q", oldSuffix, from)
   495  		}
   496  		scratch.UserKey = append(scratch.UserKey[:0], k.UserKey[:len(k.UserKey)-len(from)]...)
   497  		scratch.UserKey = append(scratch.UserKey, to...)
   498  		scratch.Trailer = k.Trailer
   499  
   500  		val, _, err := v.Value(nil)
   501  		if err != nil {
   502  			return nil, err
   503  		}
   504  		if w.addPoint(scratch, val, false); err != nil {
   505  			return nil, err
   506  		}
   507  		k, v = i.Next()
   508  	}
   509  	if err := rewriteRangeKeyBlockToWriter(r, w, from, to); err != nil {
   510  		return nil, err
   511  	}
   512  	if err := w.Close(); err != nil {
   513  		w = nil
   514  		return nil, err
   515  	}
   516  	writerMeta, err := w.Metadata()
   517  	w = nil
   518  	return writerMeta, err
   519  }
   520  
   521  // NewMemReader opens a reader over the SST stored in the passed []byte.
   522  func NewMemReader(sst []byte, o ReaderOptions) (*Reader, error) {
   523  	return NewReader(newMemReader(sst), o)
   524  }
   525  
   526  func readBlockBuf(r *Reader, bh BlockHandle, buf []byte) ([]byte, []byte, error) {
   527  	raw := r.readable.(*memReader).b[bh.Offset : bh.Offset+bh.Length+blockTrailerLen]
   528  	if err := checkChecksum(r.checksumType, raw, bh, 0); err != nil {
   529  		return nil, buf, err
   530  	}
   531  	typ := blockType(raw[bh.Length])
   532  	raw = raw[:bh.Length]
   533  	if typ == noCompressionBlockType {
   534  		return raw, buf, nil
   535  	}
   536  	decompressedLen, prefix, err := decompressedLen(typ, raw)
   537  	if err != nil {
   538  		return nil, buf, err
   539  	}
   540  	if cap(buf) < decompressedLen {
   541  		buf = make([]byte, decompressedLen)
   542  	}
   543  	res, err := decompressInto(typ, raw[prefix:], buf[:decompressedLen])
   544  	return res, buf, err
   545  }
   546  
// memReader is a thin wrapper around a []byte such that it can be passed to
// sstable.Reader. It supports concurrent use, and does so without locking in
// contrast to the heavier read/write vfs.MemFile.
type memReader struct {
	// b is the wrapped byte slice; Size reports its length.
	b  []byte
	// r is a bytes.Reader over b that serves ReadAt calls.
	r  *bytes.Reader
	// rh is the read handle returned (by address) from NewReadHandle.
	rh objstorage.NoopReadHandle
}

// Compile-time check that *memReader implements objstorage.Readable.
var _ objstorage.Readable = (*memReader)(nil)
   557  
   558  func newMemReader(b []byte) *memReader {
   559  	r := &memReader{
   560  		b: b,
   561  		r: bytes.NewReader(b),
   562  	}
   563  	r.rh = objstorage.MakeNoopReadHandle(r)
   564  	return r
   565  }
   566  
   567  // ReadAt is part of objstorage.Readable.
   568  func (m *memReader) ReadAt(_ context.Context, p []byte, off int64) error {
   569  	n, err := m.r.ReadAt(p, off)
   570  	if invariants.Enabled && err == nil && n != len(p) {
   571  		panic("short read")
   572  	}
   573  	return err
   574  }
   575  
// Close is part of objstorage.Readable. It does nothing and always returns
// nil.
func (*memReader) Close() error {
	return nil
}
   580  
// Size is part of objstorage.Readable. It returns the length in bytes of the
// wrapped slice.
func (m *memReader) Size() int64 {
	return int64(len(m.b))
}
   585  
// NewReadHandle is part of objstorage.Readable. It returns a pointer to the
// memReader's single embedded read handle; the same handle is returned on
// every call.
func (m *memReader) NewReadHandle(_ context.Context) objstorage.ReadHandle {
	return &m.rh
}
   589  }