github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/sstable/suffix_rewriter.go (about)

     1  package sstable
     2  
     3  import (
     4  	"bytes"
     5  	"math"
     6  	"os"
     7  	"sync"
     8  	"time"
     9  
    10  	"github.com/cespare/xxhash/v2"
    11  	"github.com/cockroachdb/errors"
    12  	"github.com/zuoyebang/bitalostable/internal/base"
    13  	"github.com/zuoyebang/bitalostable/internal/rangekey"
    14  )
    15  
    16  // RewriteKeySuffixes copies the content of the passed SSTable bytes to a new
    17  // sstable, written to `out`, in which the suffix `from` has is replaced with
    18  // `to` in every key. The input sstable must consist of only Sets or RangeKeySets
    19  // and every key must have `from` as its suffix as determined by the Split
    20  // function of the Comparer in the passed WriterOptions. Range deletes must not
    21  // exist in this sstable, as they will be ignored.
    22  //
    23  // Data blocks are rewritten in parallel by `concurrency` workers and then
    24  // assembled into a final SST. Filters are copied from the original SST without
    25  // modification as they are not affected by the suffix, while block and table
    26  // properties are only minimally recomputed.
    27  //
    28  // Any block and table property collectors configured in the WriterOptions must
    29  // implement SuffixReplaceableTableCollector/SuffixReplaceableBlockCollector.
    30  func RewriteKeySuffixes(
    31  	sst []byte,
    32  	rOpts ReaderOptions,
    33  	out writeCloseSyncer,
    34  	o WriterOptions,
    35  	from, to []byte,
    36  	concurrency int,
    37  ) (*WriterMetadata, error) {
    38  	r, err := NewMemReader(sst, rOpts)
    39  	if err != nil {
    40  		return nil, err
    41  	}
    42  	defer r.Close()
    43  	return rewriteKeySuffixesInBlocks(r, out, o, from, to, concurrency)
    44  }
    45  
    46  func rewriteKeySuffixesInBlocks(
    47  	r *Reader, out writeCloseSyncer, o WriterOptions, from, to []byte, concurrency int,
    48  ) (*WriterMetadata, error) {
    49  	if o.Comparer == nil || o.Comparer.Split == nil {
    50  		return nil, errors.New("a valid splitter is required to define suffix to replace replace suffix")
    51  	}
    52  	if concurrency < 1 {
    53  		return nil, errors.New("concurrency must be >= 1")
    54  	}
    55  
    56  	w := NewWriter(out, o)
    57  	defer w.Close()
    58  
    59  	for _, c := range w.propCollectors {
    60  		if _, ok := c.(SuffixReplaceableTableCollector); !ok {
    61  			return nil, errors.Errorf("property collector %s does not support suffix replacement", c.Name())
    62  		}
    63  	}
    64  	for _, c := range w.blockPropCollectors {
    65  		if _, ok := c.(SuffixReplaceableBlockCollector); !ok {
    66  			return nil, errors.Errorf("block property collector %s does not support suffix replacement", c.Name())
    67  		}
    68  	}
    69  
    70  	l, err := r.Layout()
    71  	if err != nil {
    72  		return nil, errors.Wrap(err, "reading layout")
    73  	}
    74  
    75  	if err := rewriteDataBlocksToWriter(r, w, l.Data, from, to, w.split, concurrency); err != nil {
    76  		return nil, errors.Wrap(err, "rewriting data blocks")
    77  	}
    78  
    79  	// Copy over the range key block and replace suffixes in it if it exists.
    80  	if err := rewriteRangeKeyBlockToWriter(r, w, from, to); err != nil {
    81  		return nil, errors.Wrap(err, "rewriting range key blocks")
    82  	}
    83  
    84  	// Copy over the filter block if it exists (rewriteDataBlocksToWriter will
    85  	// already have ensured this is valid if it exists).
    86  	if w.filter != nil && l.Filter.Length > 0 {
    87  		filterBlock, _, err := readBlockBuf(r, l.Filter, nil)
    88  		if err != nil {
    89  			return nil, errors.Wrap(err, "reading filter")
    90  		}
    91  		w.filter = copyFilterWriter{
    92  			origPolicyName: w.filter.policyName(), origMetaName: w.filter.metaName(), data: filterBlock,
    93  		}
    94  	}
    95  
    96  	if err := w.Close(); err != nil {
    97  		return nil, err
    98  	}
    99  
   100  	return w.Metadata()
   101  }
   102  
// errBadKind is returned when the input sstable contains a key whose kind is
// not supported by the rewriter (point Sets in data blocks, RangeKeySets in
// the range-key block).
var errBadKind = errors.New("key does not have expected kind (set)")

// blockWithSpan holds one rewritten, compressed+checksummed data block along
// with the (already suffix-replaced) first and last internal keys it contains.
type blockWithSpan struct {
	start, end InternalKey
	data       []byte
}
   109  
   110  func rewriteBlocks(
   111  	r *Reader,
   112  	restartInterval int,
   113  	checksumType ChecksumType,
   114  	compression Compression,
   115  	input []BlockHandleWithProperties,
   116  	output []blockWithSpan,
   117  	totalWorkers, worker int,
   118  	from, to []byte,
   119  	split Split,
   120  ) error {
   121  	bw := blockWriter{
   122  		restartInterval: restartInterval,
   123  	}
   124  	buf := blockBuf{checksummer: checksummer{checksumType: checksumType}}
   125  	if checksumType == ChecksumTypeXXHash {
   126  		buf.checksummer.xxHasher = xxhash.New()
   127  	}
   128  
   129  	var blockAlloc []byte
   130  	var keyAlloc []byte
   131  	var scratch InternalKey
   132  
   133  	var inputBlock, inputBlockBuf []byte
   134  
   135  	iter := &blockIter{}
   136  
   137  	// We'll assume all blocks are _roughly_ equal so round-robin static partition
   138  	// of each worker doing every ith block is probably enough.
   139  	for i := worker; i < len(input); i += totalWorkers {
   140  		bh := input[i]
   141  
   142  		var err error
   143  		inputBlock, inputBlockBuf, err = readBlockBuf(r, bh.BlockHandle, inputBlockBuf)
   144  		if err != nil {
   145  			return err
   146  		}
   147  		if err := iter.init(r.Compare, inputBlock, r.Properties.GlobalSeqNum); err != nil {
   148  			return err
   149  		}
   150  
   151  		if cap(bw.restarts) < int(iter.restarts) {
   152  			bw.restarts = make([]uint32, 0, iter.restarts)
   153  		}
   154  		if cap(bw.buf) == 0 {
   155  			bw.buf = make([]byte, 0, len(inputBlock))
   156  		}
   157  		if cap(bw.restarts) < int(iter.numRestarts) {
   158  			bw.restarts = make([]uint32, 0, iter.numRestarts)
   159  		}
   160  
   161  		for key, val := iter.First(); key != nil; key, val = iter.Next() {
   162  			if key.Kind() != InternalKeyKindSet {
   163  				return errBadKind
   164  			}
   165  			si := split(key.UserKey)
   166  			oldSuffix := key.UserKey[si:]
   167  			if !bytes.Equal(oldSuffix, from) {
   168  				err := errors.Errorf("key has suffix %q, expected %q", oldSuffix, from)
   169  				return err
   170  			}
   171  			newLen := si + len(to)
   172  			if cap(scratch.UserKey) < newLen {
   173  				scratch.UserKey = make([]byte, 0, len(key.UserKey)*2+len(to)-len(from))
   174  			}
   175  
   176  			scratch.Trailer = key.Trailer
   177  			scratch.UserKey = scratch.UserKey[:newLen]
   178  			copy(scratch.UserKey, key.UserKey[:si])
   179  			copy(scratch.UserKey[si:], to)
   180  
   181  			bw.add(scratch, val)
   182  			if output[i].start.UserKey == nil {
   183  				keyAlloc, output[i].start = cloneKeyWithBuf(scratch, keyAlloc)
   184  			}
   185  		}
   186  		*iter = iter.resetForReuse()
   187  
   188  		keyAlloc, output[i].end = cloneKeyWithBuf(scratch, keyAlloc)
   189  
   190  		finished := compressAndChecksum(bw.finish(), compression, &buf)
   191  
   192  		// copy our finished block into the output buffer.
   193  		sz := len(finished) + blockTrailerLen
   194  		if cap(blockAlloc) < sz {
   195  			blockAlloc = make([]byte, sz*128)
   196  		}
   197  		output[i].data = blockAlloc[:sz:sz]
   198  		blockAlloc = blockAlloc[sz:]
   199  		copy(output[i].data, finished)
   200  		copy(output[i].data[len(finished):], buf.tmp[:blockTrailerLen])
   201  	}
   202  	return nil
   203  }
   204  
// rewriteDataBlocksToWriter rewrites every data block listed in `data`
// (handles into r) in parallel across `concurrency` workers, replacing the
// suffix `from` with `to` in each key, and appends the rewritten blocks and
// their index entries to w. Table- and block-level properties are re-derived
// via the SuffixReplaceable* interfaces, and w's metadata (seqnum,
// smallest/largest point key, entry counts) is updated from r's properties.
func rewriteDataBlocksToWriter(
	r *Reader,
	w *Writer,
	data []BlockHandleWithProperties,
	from, to []byte,
	split Split,
	concurrency int,
) error {
	if r.Properties.NumEntries == 0 {
		// No point keys.
		return nil
	}
	blocks := make([]blockWithSpan, len(data))

	if w.filter != nil {
		// Filters are copied verbatim from the source sstable, which is only
		// valid if both sides agree on the filter policy and the comparer.
		if r.Properties.FilterPolicyName != w.filter.policyName() {
			return errors.New("mismatched filters")
		}
		if was, is := r.Properties.ComparerName, w.props.ComparerName; was != is {
			return errors.Errorf("mismatched Comparer %s vs %s, replacement requires same splitter to copy filters", was, is)
		}
	}

	// Each worker rewrites every concurrency-th block (round-robin) and writes
	// only its own elements of `blocks`, so no locking is needed. errCh is
	// buffered to `concurrency` so a failing worker never blocks on send; only
	// the first error received (if any) is returned.
	g := &sync.WaitGroup{}
	g.Add(concurrency)
	errCh := make(chan error, concurrency)
	for i := 0; i < concurrency; i++ {
		worker := i
		go func() {
			defer g.Done()
			err := rewriteBlocks(
				r,
				w.dataBlockBuf.dataBlock.restartInterval,
				w.blockBuf.checksummer.checksumType,
				w.compression,
				data,
				blocks,
				concurrency,
				worker,
				from, to,
				split,
			)
			if err != nil {
				errCh <- err
			}
		}()
	}
	g.Wait()
	close(errCh)
	if err, ok := <-errCh; ok {
		return err
	}

	// Update table-level properties from the source table's user properties.
	for _, p := range w.propCollectors {
		if err := p.(SuffixReplaceableTableCollector).UpdateKeySuffixes(r.Properties.UserProperties, from, to); err != nil {
			return err
		}
	}

	// Build a mapping from the source table's block-property shortIDs to the
	// writer's collector indexes, keyed off the shortID recorded as the first
	// byte of each collector's table property.
	// NOTE(review): oldShortIDs entries default to the zero shortID, so a
	// block property whose collector is absent from w would be attributed to
	// collector 0 — presumably inputs always configure matching collectors;
	// confirm against callers.
	var decoder blockPropertiesDecoder
	var oldShortIDs []shortID
	var oldProps [][]byte
	if len(w.blockPropCollectors) > 0 {
		oldProps = make([][]byte, len(w.blockPropCollectors))
		oldShortIDs = make([]shortID, math.MaxUint8)
		for i, p := range w.blockPropCollectors {
			if prop, ok := r.Properties.UserProperties[p.Name()]; ok {
				was, is := shortID(byte(prop[0])), shortID(i)
				oldShortIDs[was] = is
			}
		}
	}

	for i := range blocks {
		// Write the rewritten block to the file.
		n, err := w.writer.Write(blocks[i].data)
		if err != nil {
			return err
		}

		// blocks[i].data includes the trailer; the handle's length excludes it.
		bh := BlockHandle{Offset: w.meta.Size, Length: uint64(n) - blockTrailerLen}
		// Update the overall size.
		w.meta.Size += uint64(n)

		// Load any previous values for our prop collectors into oldProps.
		// (Note: this small loop's `i` shadows the outer block index.)
		for i := range oldProps {
			oldProps[i] = nil
		}
		decoder.props = data[i].Props
		for !decoder.done() {
			id, val, err := decoder.next()
			if err != nil {
				return err
			}
			oldProps[oldShortIDs[id]] = val
		}

		// Let each block-property collector derive this block's new property
		// from the corresponding old one.
		for i, p := range w.blockPropCollectors {
			if err := p.(SuffixReplaceableBlockCollector).UpdateKeySuffixes(oldProps[i], from, to); err != nil {
				return err
			}
		}

		var bhp BlockHandleWithProperties
		if bhp, err = w.maybeAddBlockPropertiesToBlockHandle(bh); err != nil {
			return err
		}
		// The index separator only needs to sort before the next block's first
		// key; the zero InternalKey is used for the final block.
		var nextKey InternalKey
		if i+1 < len(blocks) {
			nextKey = blocks[i+1].start
		}
		if err = w.addIndexEntrySync(blocks[i].end, nextKey, bhp, w.dataBlockBuf.tmp[:]); err != nil {
			return err
		}
	}

	// Entry counts carry over from the source table since suffix replacement
	// neither adds nor drops keys.
	// NOTE(review): RawKeySize is copied verbatim even though a differing
	// suffix length would change it — confirm `from` and `to` are expected to
	// have equal length, or that an approximate value is acceptable here.
	w.meta.updateSeqNum(blocks[0].start.SeqNum())
	w.props.NumEntries = r.Properties.NumEntries
	w.props.RawKeySize = r.Properties.RawKeySize
	w.props.RawValueSize = r.Properties.RawValueSize
	w.meta.SetSmallestPointKey(blocks[0].start)
	w.meta.SetLargestPointKey(blocks[len(blocks)-1].end)
	return nil
}
   329  
   330  func rewriteRangeKeyBlockToWriter(r *Reader, w *Writer, from, to []byte) error {
   331  	iter, err := r.NewRawRangeKeyIter()
   332  	if err != nil {
   333  		return err
   334  	}
   335  	if iter == nil {
   336  		// No range keys.
   337  		return nil
   338  	}
   339  	defer iter.Close()
   340  
   341  	for s := iter.First(); s != nil; s = iter.Next() {
   342  		if !s.Valid() {
   343  			break
   344  		}
   345  		for i := range s.Keys {
   346  			if s.Keys[i].Kind() != base.InternalKeyKindRangeKeySet {
   347  				return errBadKind
   348  			}
   349  			if !bytes.Equal(s.Keys[i].Suffix, from) {
   350  				return errors.Errorf("key has suffix %q, expected %q", s.Keys[i].Suffix, from)
   351  			}
   352  			s.Keys[i].Suffix = to
   353  		}
   354  
   355  		err := rangekey.Encode(s, func(k base.InternalKey, v []byte) error {
   356  			// Calling AddRangeKey instead of addRangeKeySpan bypasses the fragmenter.
   357  			// This is okay because the raw fragments off of `iter` are already
   358  			// fragmented, and suffix replacement should not affect fragmentation.
   359  			return w.AddRangeKey(k, v)
   360  		})
   361  		if err != nil {
   362  			return err
   363  		}
   364  	}
   365  
   366  	return nil
   367  }
   368  
   369  type copyFilterWriter struct {
   370  	origMetaName   string
   371  	origPolicyName string
   372  	data           []byte
   373  }
   374  
   375  func (copyFilterWriter) addKey(key []byte)         { panic("unimplemented") }
   376  func (c copyFilterWriter) finish() ([]byte, error) { return c.data, nil }
   377  func (c copyFilterWriter) metaName() string        { return c.origMetaName }
   378  func (c copyFilterWriter) policyName() string      { return c.origPolicyName }
   379  
   380  // RewriteKeySuffixesViaWriter is similar to RewriteKeySuffixes but uses just a
   381  // single loop over the Reader that writes each key to the Writer with the new
   382  // suffix. The is significantly slower than the parallelized rewriter, and does
   383  // more work to rederive filters, props, etc, however re-doing that work makes
   384  // it less restrictive -- props no longer need to
   385  func RewriteKeySuffixesViaWriter(
   386  	r *Reader, out writeCloseSyncer, o WriterOptions, from, to []byte,
   387  ) (*WriterMetadata, error) {
   388  	if o.Comparer == nil || o.Comparer.Split == nil {
   389  		return nil, errors.New("a valid splitter is required to define suffix to replace replace suffix")
   390  	}
   391  
   392  	w := NewWriter(out, o)
   393  	i, err := r.NewIter(nil, nil)
   394  	if err != nil {
   395  		return nil, err
   396  	}
   397  	defer i.Close()
   398  
   399  	k, v := i.First()
   400  	var scratch InternalKey
   401  	for k != nil {
   402  		if k.Kind() != InternalKeyKindSet {
   403  			return nil, errors.New("invalid key type")
   404  		}
   405  		oldSuffix := k.UserKey[r.Split(k.UserKey):]
   406  		if !bytes.Equal(oldSuffix, from) {
   407  			return nil, errors.Errorf("key has suffix %q, expected %q", oldSuffix, from)
   408  		}
   409  		scratch.UserKey = append(scratch.UserKey[:0], k.UserKey[:len(k.UserKey)-len(from)]...)
   410  		scratch.UserKey = append(scratch.UserKey, to...)
   411  		scratch.Trailer = k.Trailer
   412  
   413  		if w.addPoint(scratch, v); err != nil {
   414  			return nil, err
   415  		}
   416  		k, v = i.Next()
   417  	}
   418  	if err := rewriteRangeKeyBlockToWriter(r, w, from, to); err != nil {
   419  		return nil, err
   420  	}
   421  	if err := w.Close(); err != nil {
   422  		return nil, err
   423  	}
   424  	return &w.meta, nil
   425  }
   426  
   427  // NewMemReader opens a reader over the SST stored in the passed []byte.
   428  func NewMemReader(sst []byte, o ReaderOptions) (*Reader, error) {
   429  	return NewReader(memReader{sst, bytes.NewReader(sst), sizeOnlyStat(int64(len(sst)))}, o)
   430  }
   431  
   432  func readBlockBuf(r *Reader, bh BlockHandle, buf []byte) ([]byte, []byte, error) {
   433  	raw := r.file.(memReader).b[bh.Offset : bh.Offset+bh.Length+blockTrailerLen]
   434  	if err := checkChecksum(r.checksumType, raw, bh, 0); err != nil {
   435  		return nil, buf, err
   436  	}
   437  	typ := blockType(raw[bh.Length])
   438  	raw = raw[:bh.Length]
   439  	if typ == noCompressionBlockType {
   440  		return raw, buf, nil
   441  	}
   442  	decompressedLen, prefix, err := decompressedLen(typ, raw)
   443  	if err != nil {
   444  		return nil, buf, err
   445  	}
   446  	if cap(buf) < decompressedLen {
   447  		buf = make([]byte, decompressedLen)
   448  	}
   449  	res, err := decompressInto(typ, raw[prefix:], buf[:decompressedLen])
   450  	return res, buf, err
   451  }
   452  
// memReader is a thin wrapper around a []byte such that it can be passed to an
// sstable.Reader. It supports concurrent use, and does so without locking in
// contrast to the heavier read/write vfs.MemFile.
type memReader struct {
	b []byte        // raw sst bytes; also indexed directly by readBlockBuf
	r *bytes.Reader // serves ReadAt over b
	s sizeOnlyStat  // pre-computed length of b, returned by Stat
}

// Compile-time check that memReader satisfies ReadableFile.
var _ ReadableFile = memReader{}

// ReadAt implements io.ReaderAt.
func (m memReader) ReadAt(p []byte, off int64) (n int, err error) { return m.r.ReadAt(p, off) }

// Close implements io.Closer. It is a no-op: there is nothing to release.
func (memReader) Close() error { return nil }

// Stat implements ReadableFile.
func (m memReader) Stat() (os.FileInfo, error) { return m.s, nil }
   472  
   473  type sizeOnlyStat int64
   474  
   475  func (s sizeOnlyStat) Size() int64      { return int64(s) }
   476  func (sizeOnlyStat) IsDir() bool        { panic(errors.AssertionFailedf("unimplemented")) }
   477  func (sizeOnlyStat) ModTime() time.Time { panic(errors.AssertionFailedf("unimplemented")) }
   478  func (sizeOnlyStat) Mode() os.FileMode  { panic(errors.AssertionFailedf("unimplemented")) }
   479  func (sizeOnlyStat) Name() string       { panic(errors.AssertionFailedf("unimplemented")) }
   480  func (sizeOnlyStat) Sys() interface{}   { panic(errors.AssertionFailedf("unimplemented")) }