github.com/cockroachdb/pebble@v1.1.5/ingest.go (about)

     1  // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package pebble
     6  
     7  import (
     8  	"context"
     9  	"sort"
    10  	"time"
    11  
    12  	"github.com/cockroachdb/errors"
    13  	"github.com/cockroachdb/pebble/internal/base"
    14  	"github.com/cockroachdb/pebble/internal/invariants"
    15  	"github.com/cockroachdb/pebble/internal/keyspan"
    16  	"github.com/cockroachdb/pebble/internal/manifest"
    17  	"github.com/cockroachdb/pebble/internal/private"
    18  	"github.com/cockroachdb/pebble/objstorage"
    19  	"github.com/cockroachdb/pebble/objstorage/remote"
    20  	"github.com/cockroachdb/pebble/sstable"
    21  )
    22  
    23  func sstableKeyCompare(userCmp Compare, a, b InternalKey) int {
    24  	c := userCmp(a.UserKey, b.UserKey)
    25  	if c != 0 {
    26  		return c
    27  	}
    28  	if a.IsExclusiveSentinel() {
    29  		if !b.IsExclusiveSentinel() {
    30  			return -1
    31  		}
    32  	} else if b.IsExclusiveSentinel() {
    33  		return +1
    34  	}
    35  	return 0
    36  }
    37  
    38  // KeyRange encodes a key range in user key space. A KeyRange's Start is
    39  // inclusive while its End is exclusive.
    40  type KeyRange struct {
    41  	Start, End []byte
    42  }
    43  
    44  // Valid returns true if the KeyRange is defined.
    45  func (k *KeyRange) Valid() bool {
    46  	return k.Start != nil && k.End != nil
    47  }
    48  
    49  // Contains returns whether the specified key exists in the KeyRange.
    50  func (k *KeyRange) Contains(cmp base.Compare, key InternalKey) bool {
    51  	v := cmp(key.UserKey, k.End)
    52  	return (v < 0 || (v == 0 && key.IsExclusiveSentinel())) && cmp(k.Start, key.UserKey) <= 0
    53  }
    54  
    55  // OverlapsInternalKeyRange checks if the specified internal key range has an
    56  // overlap with the KeyRange. Note that we aren't checking for full containment
    57  // of smallest-largest within k, rather just that there's some intersection
    58  // between the two ranges.
    59  func (k *KeyRange) OverlapsInternalKeyRange(cmp base.Compare, smallest, largest InternalKey) bool {
    60  	v := cmp(k.Start, largest.UserKey)
    61  	return v <= 0 && !(largest.IsExclusiveSentinel() && v == 0) &&
    62  		cmp(k.End, smallest.UserKey) > 0
    63  }
    64  
    65  // Overlaps checks if the specified file has an overlap with the KeyRange.
    66  // Note that we aren't checking for full containment of m within k, rather just
    67  // that there's some intersection between m and k's bounds.
    68  func (k *KeyRange) Overlaps(cmp base.Compare, m *fileMetadata) bool {
    69  	return k.OverlapsInternalKeyRange(cmp, m.Smallest, m.Largest)
    70  }
    71  
    72  // OverlapsKeyRange checks if this span overlaps with the provided KeyRange.
    73  // Note that we aren't checking for full containment of either span in the other,
    74  // just that there's a key x that is in both key ranges.
    75  func (k *KeyRange) OverlapsKeyRange(cmp Compare, span KeyRange) bool {
    76  	return cmp(k.Start, span.End) < 0 && cmp(k.End, span.Start) > 0
    77  }
    78  
    79  func ingestValidateKey(opts *Options, key *InternalKey) error {
    80  	if key.Kind() == InternalKeyKindInvalid {
    81  		return base.CorruptionErrorf("pebble: external sstable has corrupted key: %s",
    82  			key.Pretty(opts.Comparer.FormatKey))
    83  	}
    84  	if key.SeqNum() != 0 {
    85  		return base.CorruptionErrorf("pebble: external sstable has non-zero seqnum: %s",
    86  			key.Pretty(opts.Comparer.FormatKey))
    87  	}
    88  	return nil
    89  }
    90  
    91  // ingestSynthesizeShared constructs a fileMetadata for one shared sstable owned
    92  // or shared by another node.
    93  func ingestSynthesizeShared(
    94  	opts *Options, sm SharedSSTMeta, fileNum base.DiskFileNum,
    95  ) (*fileMetadata, error) {
    96  	if sm.Size == 0 {
    97  		// Disallow 0 file sizes
    98  		return nil, errors.New("pebble: cannot ingest shared file with size 0")
    99  	}
   100  	// Don't load table stats. Doing a round trip to shared storage, one SST
   101  	// at a time is not worth it as it slows down ingestion.
   102  	meta := &fileMetadata{
   103  		FileNum:      fileNum.FileNum(),
   104  		CreationTime: time.Now().Unix(),
   105  		Virtual:      true,
   106  		Size:         sm.Size,
   107  	}
   108  	meta.InitProviderBacking(fileNum)
   109  	// Set the underlying FileBacking's size to the same size as the virtualized
   110  	// view of the sstable. This ensures that we don't over-prioritize this
   111  	// sstable for compaction just yet, as we do not have a clear sense of what
   112  	// parts of this sstable are referenced by other nodes.
   113  	meta.FileBacking.Size = sm.Size
   114  	if sm.LargestRangeKey.Valid() && sm.LargestRangeKey.UserKey != nil {
   115  		// Initialize meta.{HasRangeKeys,Smallest,Largest}, etc.
   116  		//
   117  		// NB: We create new internal keys and pass them into ExternalRangeKeyBounds
   118  		// so that we can sub a zero sequence number into the bounds. We can set
   119  		// the sequence number to anything here; it'll be reset in ingestUpdateSeqNum
   120  		// anyway. However we do need to use the same sequence number across all
   121  		// bound keys at this step so that we end up with bounds that are consistent
   122  		// across point/range keys.
   123  		smallestRangeKey := base.MakeInternalKey(sm.SmallestRangeKey.UserKey, 0, sm.SmallestRangeKey.Kind())
   124  		largestRangeKey := base.MakeExclusiveSentinelKey(sm.LargestRangeKey.Kind(), sm.LargestRangeKey.UserKey)
   125  		meta.ExtendRangeKeyBounds(opts.Comparer.Compare, smallestRangeKey, largestRangeKey)
   126  	}
   127  	if sm.LargestPointKey.Valid() && sm.LargestPointKey.UserKey != nil {
   128  		// Initialize meta.{HasPointKeys,Smallest,Largest}, etc.
   129  		//
   130  		// See point above in the ExtendRangeKeyBounds call on why we use a zero
   131  		// sequence number here.
   132  		smallestPointKey := base.MakeInternalKey(sm.SmallestPointKey.UserKey, 0, sm.SmallestPointKey.Kind())
   133  		largestPointKey := base.MakeInternalKey(sm.LargestPointKey.UserKey, 0, sm.LargestPointKey.Kind())
   134  		if sm.LargestPointKey.IsExclusiveSentinel() {
   135  			largestPointKey = base.MakeRangeDeleteSentinelKey(sm.LargestPointKey.UserKey)
   136  		}
   137  		meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallestPointKey, largestPointKey)
   138  	}
   139  	if err := meta.Validate(opts.Comparer.Compare, opts.Comparer.FormatKey); err != nil {
   140  		return nil, err
   141  	}
   142  	return meta, nil
   143  }
   144  
   145  // ingestLoad1External loads the fileMetadata for one external sstable.
   146  // Sequence number and target level calculation happens during prepare/apply.
   147  func ingestLoad1External(
   148  	opts *Options,
   149  	e ExternalFile,
   150  	fileNum base.DiskFileNum,
   151  	objprovider objstorage.Provider,
   152  	jobID int,
   153  ) (*fileMetadata, error) {
   154  	if e.Size == 0 {
   155  		// Disallow 0 file sizes
   156  		return nil, errors.New("pebble: cannot ingest external file with size 0")
   157  	}
   158  	if !e.HasRangeKey && !e.HasPointKey {
   159  		return nil, errors.New("pebble: cannot ingest external file with no point or range keys")
   160  	}
   161  	// Don't load table stats. Doing a round trip to shared storage, one SST
   162  	// at a time is not worth it as it slows down ingestion.
   163  	meta := &fileMetadata{}
   164  	meta.FileNum = fileNum.FileNum()
   165  	meta.CreationTime = time.Now().Unix()
   166  	meta.Virtual = true
   167  	meta.Size = e.Size
   168  	meta.InitProviderBacking(fileNum)
   169  
   170  	// Try to resolve a reference to the external file.
   171  	backing, err := objprovider.CreateExternalObjectBacking(e.Locator, e.ObjName)
   172  	if err != nil {
   173  		return nil, err
   174  	}
   175  	metas, err := objprovider.AttachRemoteObjects([]objstorage.RemoteObjectToAttach{{
   176  		FileNum:  fileNum,
   177  		FileType: fileTypeTable,
   178  		Backing:  backing,
   179  	}})
   180  	if err != nil {
   181  		return nil, err
   182  	}
   183  	if opts.EventListener.TableCreated != nil {
   184  		opts.EventListener.TableCreated(TableCreateInfo{
   185  			JobID:   jobID,
   186  			Reason:  "ingesting",
   187  			Path:    objprovider.Path(metas[0]),
   188  			FileNum: fileNum.FileNum(),
   189  		})
   190  	}
   191  	// In the name of keeping this ingestion as fast as possible, we avoid
   192  	// *all* existence checks and synthesize a file metadata with smallest/largest
   193  	// keys that overlap whatever the passed-in span was.
   194  	smallestCopy := make([]byte, len(e.SmallestUserKey))
   195  	copy(smallestCopy, e.SmallestUserKey)
   196  	largestCopy := make([]byte, len(e.LargestUserKey))
   197  	copy(largestCopy, e.LargestUserKey)
   198  	if e.HasPointKey {
   199  		meta.ExtendPointKeyBounds(opts.Comparer.Compare, base.MakeInternalKey(smallestCopy, 0, InternalKeyKindMax),
   200  			base.MakeRangeDeleteSentinelKey(largestCopy))
   201  	}
   202  	if e.HasRangeKey {
   203  		meta.ExtendRangeKeyBounds(opts.Comparer.Compare, base.MakeInternalKey(smallestCopy, 0, InternalKeyKindRangeKeySet),
   204  			base.MakeExclusiveSentinelKey(InternalKeyKindRangeKeyDelete, largestCopy))
   205  	}
   206  
   207  	// Set the underlying FileBacking's size to the same size as the virtualized
   208  	// view of the sstable. This ensures that we don't over-prioritize this
   209  	// sstable for compaction just yet, as we do not have a clear sense of
   210  	// what parts of this sstable are referenced by other nodes.
   211  	meta.FileBacking.Size = e.Size
   212  
   213  	if err := meta.Validate(opts.Comparer.Compare, opts.Comparer.FormatKey); err != nil {
   214  		return nil, err
   215  	}
   216  	return meta, nil
   217  }
   218  
   219  // ingestLoad1 creates the FileMetadata for one file. This file will be owned
   220  // by this store.
   221  func ingestLoad1(
   222  	opts *Options,
   223  	fmv FormatMajorVersion,
   224  	readable objstorage.Readable,
   225  	cacheID uint64,
   226  	fileNum base.DiskFileNum,
   227  ) (*fileMetadata, error) {
   228  	cacheOpts := private.SSTableCacheOpts(cacheID, fileNum).(sstable.ReaderOption)
   229  	r, err := sstable.NewReader(readable, opts.MakeReaderOptions(), cacheOpts)
   230  	if err != nil {
   231  		return nil, err
   232  	}
   233  	defer r.Close()
   234  
   235  	// Avoid ingesting tables with format versions this DB doesn't support.
   236  	tf, err := r.TableFormat()
   237  	if err != nil {
   238  		return nil, err
   239  	}
   240  	if tf < fmv.MinTableFormat() || tf > fmv.MaxTableFormat() {
   241  		return nil, errors.Newf(
   242  			"pebble: table format %s is not within range supported at DB format major version %d, (%s,%s)",
   243  			tf, fmv, fmv.MinTableFormat(), fmv.MaxTableFormat(),
   244  		)
   245  	}
   246  
   247  	meta := &fileMetadata{}
   248  	meta.FileNum = fileNum.FileNum()
   249  	meta.Size = uint64(readable.Size())
   250  	meta.CreationTime = time.Now().Unix()
   251  	meta.InitPhysicalBacking()
   252  
   253  	// Avoid loading into the table cache for collecting stats if we
   254  	// don't need to. If there are no range deletions, we have all the
   255  	// information to compute the stats here.
   256  	//
   257  	// This is helpful in tests for avoiding awkwardness around deletion of
   258  	// ingested files from MemFS. MemFS implements the Windows semantics of
   259  	// disallowing removal of an open file. Under MemFS, if we don't populate
   260  	// meta.Stats here, the file will be loaded into the table cache for
   261  	// calculating stats before we can remove the original link.
   262  	maybeSetStatsFromProperties(meta.PhysicalMeta(), &r.Properties)
   263  
   264  	{
   265  		iter, err := r.NewIter(nil /* lower */, nil /* upper */)
   266  		if err != nil {
   267  			return nil, err
   268  		}
   269  		defer iter.Close()
   270  		var smallest InternalKey
   271  		if key, _ := iter.First(); key != nil {
   272  			if err := ingestValidateKey(opts, key); err != nil {
   273  				return nil, err
   274  			}
   275  			smallest = (*key).Clone()
   276  		}
   277  		if err := iter.Error(); err != nil {
   278  			return nil, err
   279  		}
   280  		if key, _ := iter.Last(); key != nil {
   281  			if err := ingestValidateKey(opts, key); err != nil {
   282  				return nil, err
   283  			}
   284  			meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallest, key.Clone())
   285  		}
   286  		if err := iter.Error(); err != nil {
   287  			return nil, err
   288  		}
   289  	}
   290  
   291  	iter, err := r.NewRawRangeDelIter()
   292  	if err != nil {
   293  		return nil, err
   294  	}
   295  	if iter != nil {
   296  		defer iter.Close()
   297  		var smallest InternalKey
   298  		if s := iter.First(); s != nil {
   299  			key := s.SmallestKey()
   300  			if err := ingestValidateKey(opts, &key); err != nil {
   301  				return nil, err
   302  			}
   303  			smallest = key.Clone()
   304  		}
   305  		if err := iter.Error(); err != nil {
   306  			return nil, err
   307  		}
   308  		if s := iter.Last(); s != nil {
   309  			k := s.SmallestKey()
   310  			if err := ingestValidateKey(opts, &k); err != nil {
   311  				return nil, err
   312  			}
   313  			largest := s.LargestKey().Clone()
   314  			meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallest, largest)
   315  		}
   316  	}
   317  
   318  	// Update the range-key bounds for the table.
   319  	{
   320  		iter, err := r.NewRawRangeKeyIter()
   321  		if err != nil {
   322  			return nil, err
   323  		}
   324  		if iter != nil {
   325  			defer iter.Close()
   326  			var smallest InternalKey
   327  			if s := iter.First(); s != nil {
   328  				key := s.SmallestKey()
   329  				if err := ingestValidateKey(opts, &key); err != nil {
   330  					return nil, err
   331  				}
   332  				smallest = key.Clone()
   333  			}
   334  			if err := iter.Error(); err != nil {
   335  				return nil, err
   336  			}
   337  			if s := iter.Last(); s != nil {
   338  				k := s.SmallestKey()
   339  				if err := ingestValidateKey(opts, &k); err != nil {
   340  					return nil, err
   341  				}
   342  				// As range keys are fragmented, the end key of the last range key in
   343  				// the table provides the upper bound for the table.
   344  				largest := s.LargestKey().Clone()
   345  				meta.ExtendRangeKeyBounds(opts.Comparer.Compare, smallest, largest)
   346  			}
   347  			if err := iter.Error(); err != nil {
   348  				return nil, err
   349  			}
   350  		}
   351  	}
   352  
   353  	if !meta.HasPointKeys && !meta.HasRangeKeys {
   354  		return nil, nil
   355  	}
   356  
   357  	// Sanity check that the various bounds on the file were set consistently.
   358  	if err := meta.Validate(opts.Comparer.Compare, opts.Comparer.FormatKey); err != nil {
   359  		return nil, err
   360  	}
   361  
   362  	return meta, nil
   363  }
   364  
   365  type ingestLoadResult struct {
   366  	localMeta, sharedMeta []*fileMetadata
   367  	externalMeta          []*fileMetadata
   368  	localPaths            []string
   369  	sharedLevels          []uint8
   370  	fileCount             int
   371  }
   372  
   373  func ingestLoad(
   374  	opts *Options,
   375  	fmv FormatMajorVersion,
   376  	paths []string,
   377  	shared []SharedSSTMeta,
   378  	external []ExternalFile,
   379  	cacheID uint64,
   380  	pending []base.DiskFileNum,
   381  	objProvider objstorage.Provider,
   382  	jobID int,
   383  ) (ingestLoadResult, error) {
   384  	meta := make([]*fileMetadata, 0, len(paths))
   385  	newPaths := make([]string, 0, len(paths))
   386  	for i := range paths {
   387  		f, err := opts.FS.Open(paths[i])
   388  		if err != nil {
   389  			return ingestLoadResult{}, err
   390  		}
   391  
   392  		readable, err := sstable.NewSimpleReadable(f)
   393  		if err != nil {
   394  			return ingestLoadResult{}, err
   395  		}
   396  		m, err := ingestLoad1(opts, fmv, readable, cacheID, pending[i])
   397  		if err != nil {
   398  			return ingestLoadResult{}, err
   399  		}
   400  		if m != nil {
   401  			meta = append(meta, m)
   402  			newPaths = append(newPaths, paths[i])
   403  		}
   404  	}
   405  	if len(shared) == 0 && len(external) == 0 {
   406  		return ingestLoadResult{localMeta: meta, localPaths: newPaths, fileCount: len(meta)}, nil
   407  	}
   408  
   409  	// Sort the shared files according to level.
   410  	sort.Sort(sharedByLevel(shared))
   411  
   412  	sharedMeta := make([]*fileMetadata, 0, len(shared))
   413  	levels := make([]uint8, 0, len(shared))
   414  	for i := range shared {
   415  		m, err := ingestSynthesizeShared(opts, shared[i], pending[len(paths)+i])
   416  		if err != nil {
   417  			return ingestLoadResult{}, err
   418  		}
   419  		if shared[i].Level < sharedLevelsStart {
   420  			return ingestLoadResult{}, errors.New("cannot ingest shared file in level below sharedLevelsStart")
   421  		}
   422  		sharedMeta = append(sharedMeta, m)
   423  		levels = append(levels, shared[i].Level)
   424  	}
   425  	externalMeta := make([]*fileMetadata, 0, len(external))
   426  	for i := range external {
   427  		m, err := ingestLoad1External(opts, external[i], pending[len(paths)+len(shared)+i], objProvider, jobID)
   428  		if err != nil {
   429  			return ingestLoadResult{}, err
   430  		}
   431  		externalMeta = append(externalMeta, m)
   432  	}
   433  	result := ingestLoadResult{
   434  		localMeta:    meta,
   435  		sharedMeta:   sharedMeta,
   436  		externalMeta: externalMeta,
   437  		localPaths:   newPaths,
   438  		sharedLevels: levels,
   439  		fileCount:    len(meta) + len(sharedMeta) + len(externalMeta),
   440  	}
   441  	return result, nil
   442  }
   443  
   444  // Struct for sorting metadatas by smallest user keys, while ensuring the
   445  // matching path also gets swapped to the same index. For use in
   446  // ingestSortAndVerify.
   447  type metaAndPaths struct {
   448  	meta  []*fileMetadata
   449  	paths []string
   450  	cmp   Compare
   451  }
   452  
   453  func (m metaAndPaths) Len() int {
   454  	return len(m.meta)
   455  }
   456  
   457  func (m metaAndPaths) Less(i, j int) bool {
   458  	return m.cmp(m.meta[i].Smallest.UserKey, m.meta[j].Smallest.UserKey) < 0
   459  }
   460  
   461  func (m metaAndPaths) Swap(i, j int) {
   462  	m.meta[i], m.meta[j] = m.meta[j], m.meta[i]
   463  	if m.paths != nil {
   464  		m.paths[i], m.paths[j] = m.paths[j], m.paths[i]
   465  	}
   466  }
   467  
   468  func ingestSortAndVerify(cmp Compare, lr ingestLoadResult, exciseSpan KeyRange) error {
   469  	// Verify that all the shared files (i.e. files in sharedMeta)
   470  	// fit within the exciseSpan.
   471  	for i := range lr.sharedMeta {
   472  		f := lr.sharedMeta[i]
   473  		if !exciseSpan.Contains(cmp, f.Smallest) || !exciseSpan.Contains(cmp, f.Largest) {
   474  			return errors.AssertionFailedf("pebble: shared file outside of excise span, span [%s-%s), file = %s", exciseSpan.Start, exciseSpan.End, f.String())
   475  		}
   476  	}
   477  	if len(lr.externalMeta) > 0 {
   478  		if len(lr.localMeta) > 0 || len(lr.sharedMeta) > 0 {
   479  			// Currently we only support external ingests on their own. If external
   480  			// files are present alongside local/shared files, return an error.
   481  			return errors.AssertionFailedf("pebble: external files cannot be ingested atomically alongside other types of files")
   482  		}
   483  		sort.Sort(&metaAndPaths{
   484  			meta: lr.externalMeta,
   485  			cmp:  cmp,
   486  		})
   487  		for i := 1; i < len(lr.externalMeta); i++ {
   488  			if sstableKeyCompare(cmp, lr.externalMeta[i-1].Largest, lr.externalMeta[i].Smallest) >= 0 {
   489  				return errors.AssertionFailedf("pebble: external sstables have overlapping ranges")
   490  			}
   491  		}
   492  		return nil
   493  	}
   494  	if len(lr.localMeta) <= 1 || len(lr.localPaths) <= 1 {
   495  		return nil
   496  	}
   497  
   498  	sort.Sort(&metaAndPaths{
   499  		meta:  lr.localMeta,
   500  		paths: lr.localPaths,
   501  		cmp:   cmp,
   502  	})
   503  
   504  	for i := 1; i < len(lr.localPaths); i++ {
   505  		if sstableKeyCompare(cmp, lr.localMeta[i-1].Largest, lr.localMeta[i].Smallest) >= 0 {
   506  			return errors.AssertionFailedf("pebble: local ingestion sstables have overlapping ranges")
   507  		}
   508  	}
   509  	if len(lr.sharedMeta) == 0 {
   510  		return nil
   511  	}
   512  	filesInLevel := make([]*fileMetadata, 0, len(lr.sharedMeta))
   513  	for l := sharedLevelsStart; l < numLevels; l++ {
   514  		filesInLevel = filesInLevel[:0]
   515  		for i := range lr.sharedMeta {
   516  			if lr.sharedLevels[i] == uint8(l) {
   517  				filesInLevel = append(filesInLevel, lr.sharedMeta[i])
   518  			}
   519  		}
   520  		sort.Slice(filesInLevel, func(i, j int) bool {
   521  			return cmp(filesInLevel[i].Smallest.UserKey, filesInLevel[j].Smallest.UserKey) < 0
   522  		})
   523  		for i := 1; i < len(filesInLevel); i++ {
   524  			if sstableKeyCompare(cmp, filesInLevel[i-1].Largest, filesInLevel[i].Smallest) >= 0 {
   525  				return errors.AssertionFailedf("pebble: external shared sstables have overlapping ranges")
   526  			}
   527  		}
   528  	}
   529  	return nil
   530  }
   531  
   532  func ingestCleanup(objProvider objstorage.Provider, meta []*fileMetadata) error {
   533  	var firstErr error
   534  	for i := range meta {
   535  		if err := objProvider.Remove(fileTypeTable, meta[i].FileBacking.DiskFileNum); err != nil {
   536  			firstErr = firstError(firstErr, err)
   537  		}
   538  	}
   539  	return firstErr
   540  }
   541  
   542  // ingestLink creates new objects which are backed by either hardlinks to or
   543  // copies of the ingested files. It also attaches shared objects to the provider.
   544  func ingestLink(
   545  	jobID int,
   546  	opts *Options,
   547  	objProvider objstorage.Provider,
   548  	lr ingestLoadResult,
   549  	shared []SharedSSTMeta,
   550  ) error {
   551  	for i := range lr.localPaths {
   552  		objMeta, err := objProvider.LinkOrCopyFromLocal(
   553  			context.TODO(), opts.FS, lr.localPaths[i], fileTypeTable, lr.localMeta[i].FileBacking.DiskFileNum,
   554  			objstorage.CreateOptions{PreferSharedStorage: true},
   555  		)
   556  		if err != nil {
   557  			if err2 := ingestCleanup(objProvider, lr.localMeta[:i]); err2 != nil {
   558  				opts.Logger.Infof("ingest cleanup failed: %v", err2)
   559  			}
   560  			return err
   561  		}
   562  		if opts.EventListener.TableCreated != nil {
   563  			opts.EventListener.TableCreated(TableCreateInfo{
   564  				JobID:   jobID,
   565  				Reason:  "ingesting",
   566  				Path:    objProvider.Path(objMeta),
   567  				FileNum: lr.localMeta[i].FileNum,
   568  			})
   569  		}
   570  	}
   571  	sharedObjs := make([]objstorage.RemoteObjectToAttach, 0, len(shared))
   572  	for i := range shared {
   573  		backing, err := shared[i].Backing.Get()
   574  		if err != nil {
   575  			return err
   576  		}
   577  		sharedObjs = append(sharedObjs, objstorage.RemoteObjectToAttach{
   578  			FileNum:  lr.sharedMeta[i].FileBacking.DiskFileNum,
   579  			FileType: fileTypeTable,
   580  			Backing:  backing,
   581  		})
   582  	}
   583  	sharedObjMetas, err := objProvider.AttachRemoteObjects(sharedObjs)
   584  	if err != nil {
   585  		return err
   586  	}
   587  	for i := range sharedObjMetas {
   588  		// One corner case around file sizes we need to be mindful of, is that
   589  		// if one of the shareObjs was initially created by us (and has boomeranged
   590  		// back from another node), we'll need to update the FileBacking's size
   591  		// to be the true underlying size. Otherwise, we could hit errors when we
   592  		// open the db again after a crash/restart (see checkConsistency in open.go),
   593  		// plus it more accurately allows us to prioritize compactions of files
   594  		// that were originally created by us.
   595  		if sharedObjMetas[i].IsShared() && !objProvider.IsSharedForeign(sharedObjMetas[i]) {
   596  			size, err := objProvider.Size(sharedObjMetas[i])
   597  			if err != nil {
   598  				return err
   599  			}
   600  			lr.sharedMeta[i].FileBacking.Size = uint64(size)
   601  		}
   602  		if opts.EventListener.TableCreated != nil {
   603  			opts.EventListener.TableCreated(TableCreateInfo{
   604  				JobID:   jobID,
   605  				Reason:  "ingesting",
   606  				Path:    objProvider.Path(sharedObjMetas[i]),
   607  				FileNum: lr.sharedMeta[i].FileNum,
   608  			})
   609  		}
   610  	}
   611  	// We do not need to do anything about lr.externalMetas. Those were already
   612  	// linked in ingestLoad.
   613  
   614  	return nil
   615  }
   616  
   617  func ingestMemtableOverlaps(cmp Compare, mem flushable, keyRanges []internalKeyRange) bool {
   618  	iter := mem.newIter(nil)
   619  	rangeDelIter := mem.newRangeDelIter(nil)
   620  	rkeyIter := mem.newRangeKeyIter(nil)
   621  
   622  	closeIters := func() error {
   623  		err := iter.Close()
   624  		if rangeDelIter != nil {
   625  			err = firstError(err, rangeDelIter.Close())
   626  		}
   627  		if rkeyIter != nil {
   628  			err = firstError(err, rkeyIter.Close())
   629  		}
   630  		return err
   631  	}
   632  
   633  	for _, kr := range keyRanges {
   634  		if overlapWithIterator(iter, &rangeDelIter, rkeyIter, kr, cmp) {
   635  			closeIters()
   636  			return true
   637  		}
   638  	}
   639  
   640  	// Assume overlap if any iterator errored out.
   641  	return closeIters() != nil
   642  }
   643  
   644  func ingestUpdateSeqNum(
   645  	cmp Compare, format base.FormatKey, seqNum uint64, loadResult ingestLoadResult,
   646  ) error {
   647  	setSeqFn := func(k base.InternalKey) base.InternalKey {
   648  		return base.MakeInternalKey(k.UserKey, seqNum, k.Kind())
   649  	}
   650  	updateMetadata := func(m *fileMetadata) error {
   651  		// NB: we set the fields directly here, rather than via their Extend*
   652  		// methods, as we are updating sequence numbers.
   653  		if m.HasPointKeys {
   654  			m.SmallestPointKey = setSeqFn(m.SmallestPointKey)
   655  		}
   656  		if m.HasRangeKeys {
   657  			m.SmallestRangeKey = setSeqFn(m.SmallestRangeKey)
   658  		}
   659  		m.Smallest = setSeqFn(m.Smallest)
   660  		// Only update the seqnum for the largest key if that key is not an
   661  		// "exclusive sentinel" (i.e. a range deletion sentinel or a range key
   662  		// boundary), as doing so effectively drops the exclusive sentinel (by
   663  		// lowering the seqnum from the max value), and extends the bounds of the
   664  		// table.
   665  		// NB: as the largest range key is always an exclusive sentinel, it is never
   666  		// updated.
   667  		if m.HasPointKeys && !m.LargestPointKey.IsExclusiveSentinel() {
   668  			m.LargestPointKey = setSeqFn(m.LargestPointKey)
   669  		}
   670  		if !m.Largest.IsExclusiveSentinel() {
   671  			m.Largest = setSeqFn(m.Largest)
   672  		}
   673  		// Setting smallestSeqNum == largestSeqNum triggers the setting of
   674  		// Properties.GlobalSeqNum when an sstable is loaded.
   675  		m.SmallestSeqNum = seqNum
   676  		m.LargestSeqNum = seqNum
   677  		// Ensure the new bounds are consistent.
   678  		if err := m.Validate(cmp, format); err != nil {
   679  			return err
   680  		}
   681  		seqNum++
   682  		return nil
   683  	}
   684  
   685  	// Shared sstables are required to be sorted by level ascending. We then
   686  	// iterate the shared sstables in reverse, assigning the lower sequence
   687  	// numbers to the shared sstables that will be ingested into the lower
   688  	// (larger numbered) levels first. This ensures sequence number shadowing is
   689  	// correct.
   690  	for i := len(loadResult.sharedMeta) - 1; i >= 0; i-- {
   691  		if i-1 >= 0 && loadResult.sharedLevels[i-1] > loadResult.sharedLevels[i] {
   692  			panic(errors.AssertionFailedf("shared files %s, %s out of order", loadResult.sharedMeta[i-1], loadResult.sharedMeta[i]))
   693  		}
   694  		if err := updateMetadata(loadResult.sharedMeta[i]); err != nil {
   695  			return err
   696  		}
   697  	}
   698  	for i := range loadResult.localMeta {
   699  		if err := updateMetadata(loadResult.localMeta[i]); err != nil {
   700  			return err
   701  		}
   702  	}
   703  	for i := range loadResult.externalMeta {
   704  		if err := updateMetadata(loadResult.externalMeta[i]); err != nil {
   705  			return err
   706  		}
   707  	}
   708  	return nil
   709  }
   710  
   711  // Denotes an internal key range. Smallest and largest are both inclusive.
   712  type internalKeyRange struct {
   713  	smallest, largest InternalKey
   714  }
   715  
// overlapWithIterator reports whether keyRange overlaps the data exposed by
// the given iterators. Three sources are checked in order, returning true as
// soon as one of them overlaps:
//
//  1. point keys from iter, by seeking to keyRange.smallest's user key and
//     comparing the key found against keyRange.largest;
//  2. range key spans from rkeyIter, if it is non-nil;
//  3. range deletion spans from *rangeDelIter, if both the pointer and the
//     iterator it points to are non-nil.
//
// An error reported by iter or by a span iterator is treated as an overlap,
// so the function errs on the side of returning true.
func overlapWithIterator(
	iter internalIterator,
	rangeDelIter *keyspan.FragmentIterator,
	rkeyIter keyspan.FragmentIterator,
	keyRange internalKeyRange,
	cmp Compare,
) bool {
	// Check overlap with point operations.
	//
	// When using levelIter, it seeks to the SST whose boundaries
	// contain keyRange.smallest.UserKey(S).
	// It then tries to find a point in that SST that is >= S.
	// If there's no such point it means the SST ends in a tombstone in which case
	// levelIter.SeekGE generates a boundary range del sentinel.
	// The comparison of this boundary with keyRange.largest(L) below
	// is subtle but maintains correctness.
	// 1) boundary < L,
	//    since boundary is also > S (initial seek),
	//    whatever the boundary's start key may be, we're always overlapping.
	// 2) boundary > L,
	//    overlap with boundary cannot be determined since we don't know boundary's start key.
	//    We require checking for overlap with rangeDelIter.
	// 3) boundary == L and L is not sentinel,
	//    means boundary < L and hence is similar to 1).
	// 4) boundary == L and L is sentinel,
	//    we'll always overlap since for any values of i,j ranges [i, k) and [j, k) always overlap.
	key, _ := iter.SeekGE(keyRange.smallest.UserKey, base.SeekGEFlagsNone)
	if key != nil {
		c := sstableKeyCompare(cmp, *key, keyRange.largest)
		if c <= 0 {
			return true
		}
	}
	// Assume overlap if iterator errored.
	if err := iter.Error(); err != nil {
		return true
	}

	// computeOverlapWithSpans reports whether any non-empty span surfaced by
	// rIter overlaps keyRange. It starts from the last span that begins
	// before keyRange.smallest's user key (or the first span if there is
	// none) and walks forward until a span starts after keyRange.largest.
	computeOverlapWithSpans := func(rIter keyspan.FragmentIterator) bool {
		// NB: The spans surfaced by the fragment iterator are non-overlapping.
		span := rIter.SeekLT(keyRange.smallest.UserKey)
		if span == nil {
			span = rIter.Next()
		}
		for ; span != nil; span = rIter.Next() {
			if span.Empty() {
				continue
			}
			key := span.SmallestKey()
			c := sstableKeyCompare(cmp, key, keyRange.largest)
			if c > 0 {
				// The start of the span is after the largest key in the
				// ingested table.
				return false
			}
			if cmp(span.End, keyRange.smallest.UserKey) > 0 {
				// The end of the span is greater than the smallest in the
				// table. Note that the span end key is exclusive, thus ">0"
				// instead of ">=0".
				return true
			}
		}
		// Assume overlap if iterator errored.
		if err := rIter.Error(); err != nil {
			return true
		}
		return false
	}

	// rkeyIter is either a range key level iter, or a range key iterator
	// over a single file.
	if rkeyIter != nil {
		if computeOverlapWithSpans(rkeyIter) {
			return true
		}
	}

	// Check overlap with range deletions.
	if rangeDelIter == nil || *rangeDelIter == nil {
		return false
	}
	return computeOverlapWithSpans(*rangeDelIter)
}
   799  
   800  // ingestTargetLevel returns the target level for a file being ingested.
   801  // If suggestSplit is true, it accounts for ingest-time splitting as part of
   802  // its target level calculation, and if a split candidate is found, that file
   803  // is returned as the splitFile.
   804  func ingestTargetLevel(
   805  	newIters tableNewIters,
   806  	newRangeKeyIter keyspan.TableNewSpanIter,
   807  	iterOps IterOptions,
   808  	comparer *Comparer,
   809  	v *version,
   810  	baseLevel int,
   811  	compactions map[*compaction]struct{},
   812  	meta *fileMetadata,
   813  	suggestSplit bool,
   814  ) (targetLevel int, splitFile *fileMetadata, err error) {
   815  	// Find the lowest level which does not have any files which overlap meta. We
   816  	// search from L0 to L6 looking for whether there are any files in the level
   817  	// which overlap meta. We want the "lowest" level (where lower means
   818  	// increasing level number) in order to reduce write amplification.
   819  	//
   820  	// There are 2 kinds of overlap we need to check for: file boundary overlap
   821  	// and data overlap. Data overlap implies file boundary overlap. Note that it
   822  	// is always possible to ingest into L0.
   823  	//
   824  	// To place meta at level i where i > 0:
   825  	// - there must not be any data overlap with levels <= i, since that will
   826  	//   violate the sequence number invariant.
   827  	// - no file boundary overlap with level i, since that will violate the
   828  	//   invariant that files do not overlap in levels i > 0.
   829  	//   - if there is only a file overlap at a given level, and no data overlap,
   830  	//     we can still slot a file at that level. We return the fileMetadata with
   831  	//     which we have file boundary overlap (must be only one file, as sstable
   832  	//     bounds are usually tight on user keys) and the caller is expected to split
   833  	//     that sstable into two virtual sstables, allowing this file to go into that
   834  	//     level. Note that if we have file boundary overlap with two files, which
   835  	//     should only happen on rare occasions, we treat it as data overlap and
   836  	//     don't use this optimization.
   837  	//
   838  	// The file boundary overlap check is simpler to conceptualize. Consider the
   839  	// following example, in which the ingested file lies completely before or
   840  	// after the file being considered.
   841  	//
   842  	//   |--|           |--|  ingested file: [a,b] or [f,g]
   843  	//         |-----|        existing file: [c,e]
   844  	//  _____________________
   845  	//   a  b  c  d  e  f  g
   846  	//
   847  	// In both cases the ingested file can move to considering the next level.
   848  	//
   849  	// File boundary overlap does not necessarily imply data overlap. The check
   850  	// for data overlap is a little more nuanced. Consider the following examples:
   851  	//
   852  	//  1. No data overlap:
   853  	//
   854  	//          |-|   |--|    ingested file: [cc-d] or [ee-ff]
   855  	//  |*--*--*----*------*| existing file: [a-g], points: [a, b, c, dd, g]
   856  	//  _____________________
   857  	//   a  b  c  d  e  f  g
   858  	//
   859  	// In this case the ingested files can "fall through" this level. The checks
   860  	// continue at the next level.
   861  	//
   862  	//  2. Data overlap:
   863  	//
   864  	//            |--|        ingested file: [d-e]
   865  	//  |*--*--*----*------*| existing file: [a-g], points: [a, b, c, dd, g]
   866  	//  _____________________
   867  	//   a  b  c  d  e  f  g
   868  	//
   869  	// In this case the file cannot be ingested into this level as the point 'dd'
   870  	// is in the way.
   871  	//
   872  	// It is worth noting that the check for data overlap is only approximate. In
   873  	// the previous example, the ingested table [d-e] could contain only the
   874  	// points 'd' and 'e', in which case the table would be eligible for
   875  	// considering lower levels. However, such a fine-grained check would need to
   876  	// be exhaustive (comparing points and ranges in both the ingested existing
   877  	// tables) and such a check is prohibitively expensive. Thus Pebble treats any
   878  	// existing point that falls within the ingested table bounds as being "data
   879  	// overlap".
   880  
   881  	// This assertion implicitly checks that we have the current version of
   882  	// the metadata.
   883  	if v.L0Sublevels == nil {
   884  		return 0, nil, errors.AssertionFailedf("could not read L0 sublevels")
   885  	}
   886  	// Check for overlap over the keys of L0 by iterating over the sublevels.
   887  	for subLevel := 0; subLevel < len(v.L0SublevelFiles); subLevel++ {
   888  		iter := newLevelIter(iterOps, comparer, newIters,
   889  			v.L0Sublevels.Levels[subLevel].Iter(), manifest.Level(0), internalIterOpts{})
   890  
   891  		var rangeDelIter keyspan.FragmentIterator
   892  		// Pass in a non-nil pointer to rangeDelIter so that levelIter.findFileGE
   893  		// sets it up for the target file.
   894  		iter.initRangeDel(&rangeDelIter)
   895  
   896  		levelIter := keyspan.LevelIter{}
   897  		levelIter.Init(
   898  			keyspan.SpanIterOptions{}, comparer.Compare, newRangeKeyIter,
   899  			v.L0Sublevels.Levels[subLevel].Iter(), manifest.Level(0), manifest.KeyTypeRange,
   900  		)
   901  
   902  		kr := internalKeyRange{
   903  			smallest: meta.Smallest,
   904  			largest:  meta.Largest,
   905  		}
   906  		overlap := overlapWithIterator(iter, &rangeDelIter, &levelIter, kr, comparer.Compare)
   907  		err := iter.Close() // Closes range del iter as well.
   908  		err = firstError(err, levelIter.Close())
   909  		if err != nil {
   910  			return 0, nil, err
   911  		}
   912  		if overlap {
   913  			return targetLevel, nil, nil
   914  		}
   915  	}
   916  
   917  	level := baseLevel
   918  	for ; level < numLevels; level++ {
   919  		levelIter := newLevelIter(iterOps, comparer, newIters,
   920  			v.Levels[level].Iter(), manifest.Level(level), internalIterOpts{})
   921  		var rangeDelIter keyspan.FragmentIterator
   922  		// Pass in a non-nil pointer to rangeDelIter so that levelIter.findFileGE
   923  		// sets it up for the target file.
   924  		levelIter.initRangeDel(&rangeDelIter)
   925  
   926  		rkeyLevelIter := &keyspan.LevelIter{}
   927  		rkeyLevelIter.Init(
   928  			keyspan.SpanIterOptions{}, comparer.Compare, newRangeKeyIter,
   929  			v.Levels[level].Iter(), manifest.Level(level), manifest.KeyTypeRange,
   930  		)
   931  
   932  		kr := internalKeyRange{
   933  			smallest: meta.Smallest,
   934  			largest:  meta.Largest,
   935  		}
   936  		overlap := overlapWithIterator(levelIter, &rangeDelIter, rkeyLevelIter, kr, comparer.Compare)
   937  		err := levelIter.Close() // Closes range del iter as well.
   938  		err = firstError(err, rkeyLevelIter.Close())
   939  		if err != nil {
   940  			return 0, nil, err
   941  		}
   942  		if overlap {
   943  			return targetLevel, splitFile, nil
   944  		}
   945  
   946  		// Check boundary overlap.
   947  		var candidateSplitFile *fileMetadata
   948  		boundaryOverlaps := v.Overlaps(level, comparer.Compare, meta.Smallest.UserKey,
   949  			meta.Largest.UserKey, meta.Largest.IsExclusiveSentinel())
   950  		if !boundaryOverlaps.Empty() {
   951  			// We are already guaranteed to not have any data overlaps with files
   952  			// in boundaryOverlaps, otherwise we'd have returned in the above if
   953  			// statements. Use this, plus boundaryOverlaps.Len() == 1 to detect for
   954  			// the case where we can slot this file into the current level despite
   955  			// a boundary overlap, by splitting one existing file into two virtual
   956  			// sstables.
   957  			if suggestSplit && boundaryOverlaps.Len() == 1 {
   958  				iter := boundaryOverlaps.Iter()
   959  				candidateSplitFile = iter.First()
   960  			} else {
   961  				// We either don't want to suggest ingest-time splits (i.e.
   962  				// !suggestSplit), or we boundary-overlapped with more than one file.
   963  				continue
   964  			}
   965  		}
   966  
   967  		// Check boundary overlap with any ongoing compactions. We consider an
   968  		// overlapping compaction that's writing files to an output level as
   969  		// equivalent to boundary overlap with files in that output level.
   970  		//
   971  		// We cannot check for data overlap with the new SSTs compaction will produce
   972  		// since compaction hasn't been done yet. However, there's no need to check
   973  		// since all keys in them will be from levels in [c.startLevel,
   974  		// c.outputLevel], and all those levels have already had their data overlap
   975  		// tested negative (else we'd have returned earlier).
   976  		//
   977  		// An alternative approach would be to cancel these compactions and proceed
   978  		// with an ingest-time split on this level if necessary. However, compaction
   979  		// cancellation can result in significant wasted effort and is best avoided
   980  		// unless necessary.
   981  		overlaps := false
   982  		for c := range compactions {
   983  			if c.outputLevel == nil || level != c.outputLevel.level {
   984  				continue
   985  			}
   986  			if comparer.Compare(meta.Smallest.UserKey, c.largest.UserKey) <= 0 &&
   987  				comparer.Compare(meta.Largest.UserKey, c.smallest.UserKey) >= 0 {
   988  				overlaps = true
   989  				break
   990  			}
   991  		}
   992  		if !overlaps {
   993  			targetLevel = level
   994  			splitFile = candidateSplitFile
   995  		}
   996  	}
   997  	return targetLevel, splitFile, nil
   998  }
   999  
  1000  // Ingest ingests a set of sstables into the DB. Ingestion of the files is
  1001  // atomic and semantically equivalent to creating a single batch containing all
  1002  // of the mutations in the sstables. Ingestion may require the memtable to be
  1003  // flushed. The ingested sstable files are moved into the DB and must reside on
  1004  // the same filesystem as the DB. Sstables can be created for ingestion using
  1005  // sstable.Writer. On success, Ingest removes the input paths.
  1006  //
  1007  // Two types of sstables are accepted for ingestion(s): one is sstables present
  1008  // in the instance's vfs.FS and can be referenced locally. The other is sstables
  1009  // present in remote.Storage, referred to as shared or foreign sstables. These
  1010  // shared sstables can be linked through objstorageprovider.Provider, and do not
  1011  // need to already be present on the local vfs.FS. Foreign sstables must all fit
  1012  // in an excise span, and are destined for a level specified in SharedSSTMeta.
  1013  //
  1014  // All sstables *must* be Sync()'d by the caller after all bytes are written
  1015  // and before its file handle is closed; failure to do so could violate
  1016  // durability or lead to corrupted on-disk state. This method cannot, in a
  1017  // platform-and-FS-agnostic way, ensure that all sstables in the input are
  1018  // properly synced to disk. Opening new file handles and Sync()-ing them
  1019  // does not always guarantee durability; see the discussion here on that:
  1020  // https://github.com/cockroachdb/pebble/pull/835#issuecomment-663075379
  1021  //
  1022  // Ingestion loads each sstable into the lowest level of the LSM which it
  1023  // doesn't overlap (see ingestTargetLevel). If an sstable overlaps a memtable,
  1024  // ingestion forces the memtable to flush, and then waits for the flush to
  1025  // occur. In some cases, such as with no foreign sstables and no excise span,
  1026  // ingestion that gets blocked on a memtable can join the flushable queue and
  1027  // finish even before the memtable has been flushed.
  1028  //
  1029  // The steps for ingestion are:
  1030  //
  1031  //  1. Allocate file numbers for every sstable being ingested.
  1032  //  2. Load the metadata for all sstables being ingested.
  1033  //  3. Sort the sstables by smallest key, verifying non overlap (for local
  1034  //     sstables).
  1035  //  4. Hard link (or copy) the local sstables into the DB directory.
  1036  //  5. Allocate a sequence number to use for all of the entries in the
  1037  //     local sstables. This is the step where overlap with memtables is
  1038  //     determined. If there is overlap, we remember the most recent memtable
  1039  //     that overlaps.
  1040  //  6. Update the sequence number in the ingested local sstables. (Remote
  1041  //     sstables get fixed sequence numbers that were determined at load time.)
  1042  //  7. Wait for the most recent memtable that overlaps to flush (if any).
  1043  //  8. Add the ingested sstables to the version (DB.ingestApply).
  1044  //     8.1.  If an excise span was specified, figure out what sstables in the
  1045  //     current version overlap with the excise span, and create new virtual
  1046  //     sstables out of those sstables that exclude the excised span (DB.excise).
  1047  //  9. Publish the ingestion sequence number.
  1048  //
  1049  // Note that if the mutable memtable overlaps with ingestion, a flush of the
  1050  // memtable is forced equivalent to DB.Flush. Additionally, subsequent
  1051  // mutations that get sequence numbers larger than the ingestion sequence
  1052  // number get queued up behind the ingestion waiting for it to complete. This
  1053  // can produce a noticeable hiccup in performance. See
  1054  // https://github.com/cockroachdb/pebble/issues/25 for an idea for how to fix
  1055  // this hiccup.
  1056  func (d *DB) Ingest(paths []string) error {
  1057  	if err := d.closed.Load(); err != nil {
  1058  		panic(err)
  1059  	}
  1060  	if d.opts.ReadOnly {
  1061  		return ErrReadOnly
  1062  	}
  1063  	_, err := d.ingest(paths, ingestTargetLevel, nil /* shared */, KeyRange{}, nil /* external */)
  1064  	return err
  1065  }
  1066  
// IngestOperationStats provides some information about where in the LSM the
// bytes were ingested. It is the stats value returned by IngestWithStats,
// IngestExternalFiles and IngestAndExcise.
type IngestOperationStats struct {
	// Bytes is the total number of bytes in the ingested sstables.
	Bytes uint64
	// ApproxIngestedIntoL0Bytes is the approximate number of bytes ingested
	// into L0. The value is approximate when flushable ingests are active and
	// an ingest overlaps an entry in the flushable queue. Currently, this
	// approximation is very rough, only including tables that overlapped the
	// memtable. This estimate may be improved with #2112.
	ApproxIngestedIntoL0Bytes uint64
	// MemtableOverlappingFiles is the count of ingested sstables
	// that overlapped keys in the memtables.
	MemtableOverlappingFiles int
}
  1082  
  1083  // ExternalFile are external sstables that can be referenced through
  1084  // objprovider and ingested as remote files that will not be refcounted or
  1085  // cleaned up. For use with online restore. Note that the underlying sstable
  1086  // could contain keys outside the [Smallest,Largest) bounds; however Pebble
  1087  // is expected to only read the keys within those bounds.
  1088  type ExternalFile struct {
  1089  	// Locator is the shared.Locator that can be used with objProvider to
  1090  	// resolve a reference to this external sstable.
  1091  	Locator remote.Locator
  1092  	// ObjName is the unique name of this sstable on Locator.
  1093  	ObjName string
  1094  	// Size of the referenced proportion of the virtualized sstable. An estimate
  1095  	// is acceptable in lieu of the backing file size.
  1096  	Size uint64
  1097  	// SmallestUserKey and LargestUserKey are the [smallest,largest) user key
  1098  	// bounds of the sstable. Both these bounds are loose i.e. it's possible for
  1099  	// the sstable to not span the entirety of this range. However, multiple
  1100  	// ExternalFiles in one ingestion must all have non-overlapping
  1101  	// [smallest, largest) spans. Note that this Largest bound is exclusive.
  1102  	SmallestUserKey, LargestUserKey []byte
  1103  	// HasPointKey and HasRangeKey denote whether this file contains point keys
  1104  	// or range keys. If both structs are false, an error is returned during
  1105  	// ingestion.
  1106  	HasPointKey, HasRangeKey bool
  1107  }
  1108  
  1109  // IngestWithStats does the same as Ingest, and additionally returns
  1110  // IngestOperationStats.
  1111  func (d *DB) IngestWithStats(paths []string) (IngestOperationStats, error) {
  1112  	if err := d.closed.Load(); err != nil {
  1113  		panic(err)
  1114  	}
  1115  	if d.opts.ReadOnly {
  1116  		return IngestOperationStats{}, ErrReadOnly
  1117  	}
  1118  	return d.ingest(paths, ingestTargetLevel, nil /* shared */, KeyRange{}, nil /* external */)
  1119  }
  1120  
  1121  // IngestExternalFiles does the same as IngestWithStats, and additionally
  1122  // accepts external files (with locator info that can be resolved using
  1123  // d.opts.SharedStorage). These files must also be non-overlapping with
  1124  // each other, and must be resolvable through d.objProvider.
  1125  func (d *DB) IngestExternalFiles(external []ExternalFile) (IngestOperationStats, error) {
  1126  	if err := d.closed.Load(); err != nil {
  1127  		panic(err)
  1128  	}
  1129  
  1130  	if d.opts.ReadOnly {
  1131  		return IngestOperationStats{}, ErrReadOnly
  1132  	}
  1133  	if d.opts.Experimental.RemoteStorage == nil {
  1134  		return IngestOperationStats{}, errors.New("pebble: cannot ingest external files without shared storage configured")
  1135  	}
  1136  	return d.ingest(nil, ingestTargetLevel, nil /* shared */, KeyRange{}, external)
  1137  }
  1138  
  1139  // IngestAndExcise does the same as IngestWithStats, and additionally accepts a
  1140  // list of shared files to ingest that can be read from a remote.Storage through
  1141  // a Provider. All the shared files must live within exciseSpan, and any existing
  1142  // keys in exciseSpan are deleted by turning existing sstables into virtual
  1143  // sstables (if not virtual already) and shrinking their spans to exclude
  1144  // exciseSpan. See the comment at Ingest for a more complete picture of the
  1145  // ingestion process.
  1146  //
  1147  // Panics if this DB instance was not instantiated with a remote.Storage and
  1148  // shared sstables are present.
  1149  func (d *DB) IngestAndExcise(
  1150  	paths []string, shared []SharedSSTMeta, exciseSpan KeyRange,
  1151  ) (IngestOperationStats, error) {
  1152  	if err := d.closed.Load(); err != nil {
  1153  		panic(err)
  1154  	}
  1155  	if d.opts.ReadOnly {
  1156  		return IngestOperationStats{}, ErrReadOnly
  1157  	}
  1158  	return d.ingest(paths, ingestTargetLevel, shared, exciseSpan, nil /* external */)
  1159  }
  1160  
  1161  // Both DB.mu and commitPipeline.mu must be held while this is called.
  1162  func (d *DB) newIngestedFlushableEntry(
  1163  	meta []*fileMetadata, seqNum uint64, logNum FileNum,
  1164  ) (*flushableEntry, error) {
  1165  	// Update the sequence number for all of the sstables in the
  1166  	// metadata. Writing the metadata to the manifest when the
  1167  	// version edit is applied is the mechanism that persists the
  1168  	// sequence number. The sstables themselves are left unmodified.
  1169  	// In this case, a version edit will only be written to the manifest
  1170  	// when the flushable is eventually flushed. If Pebble restarts in that
  1171  	// time, then we'll lose the ingest sequence number information. But this
  1172  	// information will also be reconstructed on node restart.
  1173  	if err := ingestUpdateSeqNum(
  1174  		d.cmp, d.opts.Comparer.FormatKey, seqNum, ingestLoadResult{localMeta: meta},
  1175  	); err != nil {
  1176  		return nil, err
  1177  	}
  1178  
  1179  	f := newIngestedFlushable(meta, d.opts.Comparer, d.newIters, d.tableNewRangeKeyIter)
  1180  
  1181  	// NB: The logNum/seqNum are the WAL number which we're writing this entry
  1182  	// to and the sequence number within the WAL which we'll write this entry
  1183  	// to.
  1184  	entry := d.newFlushableEntry(f, logNum, seqNum)
  1185  	// The flushable entry starts off with a single reader ref, so increment
  1186  	// the FileMetadata.Refs.
  1187  	for _, file := range f.files {
  1188  		file.Ref()
  1189  	}
  1190  	entry.unrefFiles = func() []*fileBacking {
  1191  		var obsolete []*fileBacking
  1192  		for _, file := range f.files {
  1193  			if file.Unref() == 0 {
  1194  				obsolete = append(obsolete, file.FileMetadata.FileBacking)
  1195  			}
  1196  		}
  1197  		return obsolete
  1198  	}
  1199  
  1200  	entry.flushForced = true
  1201  	entry.releaseMemAccounting = func() {}
  1202  	return entry, nil
  1203  }
  1204  
  1205  // Both DB.mu and commitPipeline.mu must be held while this is called. Since
  1206  // we're holding both locks, the order in which we rotate the memtable or
  1207  // recycle the WAL in this function is irrelevant as long as the correct log
  1208  // numbers are assigned to the appropriate flushable.
  1209  func (d *DB) handleIngestAsFlushable(meta []*fileMetadata, seqNum uint64) error {
  1210  	b := d.NewBatch()
  1211  	for _, m := range meta {
  1212  		b.ingestSST(m.FileNum)
  1213  	}
  1214  	b.setSeqNum(seqNum)
  1215  
  1216  	// If the WAL is disabled, then the logNum used to create the flushable
  1217  	// entry doesn't matter. We just use the logNum assigned to the current
  1218  	// mutable memtable. If the WAL is enabled, then this logNum will be
  1219  	// overwritten by the logNum of the log which will contain the log entry
  1220  	// for the ingestedFlushable.
  1221  	logNum := d.mu.mem.queue[len(d.mu.mem.queue)-1].logNum
  1222  	if !d.opts.DisableWAL {
  1223  		// We create a new WAL for the flushable instead of reusing the end of
  1224  		// the previous WAL. This simplifies the increment of the minimum
  1225  		// unflushed log number, and also simplifies WAL replay.
  1226  		var prevLogSize uint64
  1227  		logNum, prevLogSize = d.rotateWAL()
  1228  		// As the rotator of the WAL, we're responsible for updating the
  1229  		// previous flushable queue tail's log size.
  1230  		d.mu.mem.queue[len(d.mu.mem.queue)-1].logSize = prevLogSize
  1231  
  1232  		d.mu.Unlock()
  1233  		err := d.commit.directWrite(b)
  1234  		if err != nil {
  1235  			d.opts.Logger.Fatalf("%v", err)
  1236  		}
  1237  		d.mu.Lock()
  1238  	}
  1239  
  1240  	entry, err := d.newIngestedFlushableEntry(meta, seqNum, logNum)
  1241  	if err != nil {
  1242  		return err
  1243  	}
  1244  	nextSeqNum := seqNum + uint64(b.Count())
  1245  
  1246  	// Set newLogNum to the logNum of the previous flushable. This value is
  1247  	// irrelevant if the WAL is disabled. If the WAL is enabled, then we set
  1248  	// the appropriate value below.
  1249  	newLogNum := d.mu.mem.queue[len(d.mu.mem.queue)-1].logNum
  1250  	if !d.opts.DisableWAL {
  1251  		// newLogNum will be the WAL num of the next mutable memtable which
  1252  		// comes after the ingestedFlushable in the flushable queue. The mutable
  1253  		// memtable will be created below.
  1254  		//
  1255  		// The prevLogSize returned by rotateWAL is the WAL to which the
  1256  		// flushable ingest keys were appended. This intermediary WAL is only
  1257  		// used to record the flushable ingest and nothing else.
  1258  		newLogNum, entry.logSize = d.rotateWAL()
  1259  	}
  1260  
  1261  	currMem := d.mu.mem.mutable
  1262  	// NB: Placing ingested sstables above the current memtables
  1263  	// requires rotating of the existing memtables/WAL. There is
  1264  	// some concern of churning through tiny memtables due to
  1265  	// ingested sstables being placed on top of them, but those
  1266  	// memtables would have to be flushed anyways.
  1267  	d.mu.mem.queue = append(d.mu.mem.queue, entry)
  1268  	d.rotateMemtable(newLogNum, nextSeqNum, currMem)
  1269  	d.updateReadStateLocked(d.opts.DebugCheck)
  1270  	d.maybeScheduleFlush()
  1271  	return nil
  1272  }
  1273  
  1274  // See comment at Ingest() for details on how this works.
  1275  func (d *DB) ingest(
  1276  	paths []string,
  1277  	targetLevelFunc ingestTargetLevelFunc,
  1278  	shared []SharedSSTMeta,
  1279  	exciseSpan KeyRange,
  1280  	external []ExternalFile,
  1281  ) (IngestOperationStats, error) {
  1282  	if len(shared) > 0 && d.opts.Experimental.RemoteStorage == nil {
  1283  		panic("cannot ingest shared sstables with nil SharedStorage")
  1284  	}
  1285  	if (exciseSpan.Valid() || len(shared) > 0 || len(external) > 0) && d.FormatMajorVersion() < FormatVirtualSSTables {
  1286  		return IngestOperationStats{}, errors.New("pebble: format major version too old for excise, shared or external sstable ingestion")
  1287  	}
  1288  	// Allocate file numbers for all of the files being ingested and mark them as
  1289  	// pending in order to prevent them from being deleted. Note that this causes
  1290  	// the file number ordering to be out of alignment with sequence number
  1291  	// ordering. The sorting of L0 tables by sequence number avoids relying on
  1292  	// that (busted) invariant.
  1293  	d.mu.Lock()
  1294  	pendingOutputs := make([]base.DiskFileNum, len(paths)+len(shared)+len(external))
  1295  	for i := 0; i < len(paths)+len(shared)+len(external); i++ {
  1296  		pendingOutputs[i] = d.mu.versions.getNextFileNum().DiskFileNum()
  1297  	}
  1298  
  1299  	jobID := d.mu.nextJobID
  1300  	d.mu.nextJobID++
  1301  	d.mu.Unlock()
  1302  
  1303  	// Load the metadata for all the files being ingested. This step detects
  1304  	// and elides empty sstables.
  1305  	loadResult, err := ingestLoad(d.opts, d.FormatMajorVersion(), paths, shared, external, d.cacheID, pendingOutputs, d.objProvider, jobID)
  1306  	if err != nil {
  1307  		return IngestOperationStats{}, err
  1308  	}
  1309  
  1310  	if loadResult.fileCount == 0 {
  1311  		// All of the sstables to be ingested were empty. Nothing to do.
  1312  		return IngestOperationStats{}, nil
  1313  	}
  1314  
  1315  	// Verify the sstables do not overlap.
  1316  	if err := ingestSortAndVerify(d.cmp, loadResult, exciseSpan); err != nil {
  1317  		return IngestOperationStats{}, err
  1318  	}
  1319  
  1320  	// Hard link the sstables into the DB directory. Since the sstables aren't
  1321  	// referenced by a version, they won't be used. If the hard linking fails
  1322  	// (e.g. because the files reside on a different filesystem), ingestLink will
  1323  	// fall back to copying, and if that fails we undo our work and return an
  1324  	// error.
  1325  	if err := ingestLink(jobID, d.opts, d.objProvider, loadResult, shared); err != nil {
  1326  		return IngestOperationStats{}, err
  1327  	}
  1328  
  1329  	// Make the new tables durable. We need to do this at some point before we
  1330  	// update the MANIFEST (via logAndApply), otherwise a crash can have the
  1331  	// tables referenced in the MANIFEST, but not present in the provider.
  1332  	if err := d.objProvider.Sync(); err != nil {
  1333  		return IngestOperationStats{}, err
  1334  	}
  1335  
  1336  	// metaFlushableOverlaps is a slice parallel to meta indicating which of the
  1337  	// ingested sstables overlap some table in the flushable queue. It's used to
  1338  	// approximate ingest-into-L0 stats when using flushable ingests.
  1339  	metaFlushableOverlaps := make([]bool, loadResult.fileCount)
  1340  	var mem *flushableEntry
  1341  	var mut *memTable
  1342  	// asFlushable indicates whether the sstable was ingested as a flushable.
  1343  	var asFlushable bool
  1344  	prepare := func(seqNum uint64) {
  1345  		// Note that d.commit.mu is held by commitPipeline when calling prepare.
  1346  
  1347  		d.mu.Lock()
  1348  		defer d.mu.Unlock()
  1349  
  1350  		// Check to see if any files overlap with any of the memtables. The queue
  1351  		// is ordered from oldest to newest with the mutable memtable being the
  1352  		// last element in the slice. We want to wait for the newest table that
  1353  		// overlaps.
  1354  
  1355  		for i := len(d.mu.mem.queue) - 1; i >= 0; i-- {
  1356  			m := d.mu.mem.queue[i]
  1357  			iter := m.newIter(nil)
  1358  			rangeDelIter := m.newRangeDelIter(nil)
  1359  			rkeyIter := m.newRangeKeyIter(nil)
  1360  
  1361  			checkForOverlap := func(i int, meta *fileMetadata) {
  1362  				if metaFlushableOverlaps[i] {
  1363  					// This table already overlapped a more recent flushable.
  1364  					return
  1365  				}
  1366  				kr := internalKeyRange{
  1367  					smallest: meta.Smallest,
  1368  					largest:  meta.Largest,
  1369  				}
  1370  				if overlapWithIterator(iter, &rangeDelIter, rkeyIter, kr, d.cmp) {
  1371  					// If this is the first table to overlap a flushable, save
  1372  					// the flushable. This ingest must be ingested or flushed
  1373  					// after it.
  1374  					if mem == nil {
  1375  						mem = m
  1376  					}
  1377  					metaFlushableOverlaps[i] = true
  1378  				}
  1379  			}
  1380  			for i := range loadResult.localMeta {
  1381  				checkForOverlap(i, loadResult.localMeta[i])
  1382  			}
  1383  			for i := range loadResult.sharedMeta {
  1384  				checkForOverlap(len(loadResult.localMeta)+i, loadResult.sharedMeta[i])
  1385  			}
  1386  			for i := range loadResult.externalMeta {
  1387  				checkForOverlap(len(loadResult.localMeta)+len(loadResult.sharedMeta)+i, loadResult.externalMeta[i])
  1388  			}
  1389  			if exciseSpan.Valid() {
  1390  				kr := internalKeyRange{
  1391  					smallest: base.MakeInternalKey(exciseSpan.Start, InternalKeySeqNumMax, InternalKeyKindMax),
  1392  					largest:  base.MakeExclusiveSentinelKey(InternalKeyKindRangeDelete, exciseSpan.End),
  1393  				}
  1394  				if overlapWithIterator(iter, &rangeDelIter, rkeyIter, kr, d.cmp) {
  1395  					if mem == nil {
  1396  						mem = m
  1397  					}
  1398  				}
  1399  			}
  1400  			err := iter.Close()
  1401  			if rangeDelIter != nil {
  1402  				err = firstError(err, rangeDelIter.Close())
  1403  			}
  1404  			if rkeyIter != nil {
  1405  				err = firstError(err, rkeyIter.Close())
  1406  			}
  1407  			if err != nil {
  1408  				d.opts.Logger.Infof("ingest error reading flushable for log %s: %s", m.logNum, err)
  1409  			}
  1410  		}
  1411  
  1412  		if mem == nil {
  1413  			// No overlap with any of the queued flushables, so no need to queue
  1414  			// after them.
  1415  
  1416  			// New writes with higher sequence numbers may be concurrently
  1417  			// committed. We must ensure they don't flush before this ingest
  1418  			// completes. To do that, we ref the mutable memtable as a writer,
  1419  			// preventing its flushing (and the flushing of all subsequent
  1420  			// flushables in the queue). Once we've acquired the manifest lock
  1421  			// to add the ingested sstables to the LSM, we can unref as we're
  1422  			// guaranteed that the flush won't edit the LSM before this ingest.
  1423  			mut = d.mu.mem.mutable
  1424  			mut.writerRef()
  1425  			return
  1426  		}
  1427  		// The ingestion overlaps with some entry in the flushable queue.
  1428  		if d.FormatMajorVersion() < FormatFlushableIngest ||
  1429  			d.opts.Experimental.DisableIngestAsFlushable() ||
  1430  			len(shared) > 0 || exciseSpan.Valid() || len(external) > 0 ||
  1431  			(len(d.mu.mem.queue) > d.opts.MemTableStopWritesThreshold-1) {
  1432  			// We're not able to ingest as a flushable,
  1433  			// so we must synchronously flush.
  1434  			//
  1435  			// TODO(bilal): Currently, if any of the files being ingested are shared or
  1436  			// there's an excise span present, we cannot use flushable ingests and need
  1437  			// to wait synchronously. Either remove this caveat by fleshing out
  1438  			// flushable ingest logic to also account for these cases, or remove this
  1439  			// comment. Tracking issue: https://github.com/cockroachdb/pebble/issues/2676
  1440  			if mem.flushable == d.mu.mem.mutable {
  1441  				err = d.makeRoomForWrite(nil)
  1442  			}
  1443  			// New writes with higher sequence numbers may be concurrently
  1444  			// committed. We must ensure they don't flush before this ingest
  1445  			// completes. To do that, we ref the mutable memtable as a writer,
  1446  			// preventing its flushing (and the flushing of all subsequent
  1447  			// flushables in the queue). Once we've acquired the manifest lock
  1448  			// to add the ingested sstables to the LSM, we can unref as we're
  1449  			// guaranteed that the flush won't edit the LSM before this ingest.
  1450  			mut = d.mu.mem.mutable
  1451  			mut.writerRef()
  1452  			mem.flushForced = true
  1453  			d.maybeScheduleFlush()
  1454  			return
  1455  		}
  1456  		// Since there aren't too many memtables already queued up, we can
  1457  		// slide the ingested sstables on top of the existing memtables.
  1458  		asFlushable = true
  1459  		err = d.handleIngestAsFlushable(loadResult.localMeta, seqNum)
  1460  	}
  1461  
  1462  	var ve *versionEdit
  1463  	apply := func(seqNum uint64) {
  1464  		if err != nil || asFlushable {
  1465  			// An error occurred during prepare.
  1466  			if mut != nil {
  1467  				if mut.writerUnref() {
  1468  					d.mu.Lock()
  1469  					d.maybeScheduleFlush()
  1470  					d.mu.Unlock()
  1471  				}
  1472  			}
  1473  			return
  1474  		}
  1475  
  1476  		// Update the sequence numbers for all ingested sstables'
  1477  		// metadata. When the version edit is applied, the metadata is
  1478  		// written to the manifest, persisting the sequence number.
  1479  		// The sstables themselves are left unmodified.
  1480  		if err = ingestUpdateSeqNum(
  1481  			d.cmp, d.opts.Comparer.FormatKey, seqNum, loadResult,
  1482  		); err != nil {
  1483  			if mut != nil {
  1484  				if mut.writerUnref() {
  1485  					d.mu.Lock()
  1486  					d.maybeScheduleFlush()
  1487  					d.mu.Unlock()
  1488  				}
  1489  			}
  1490  			return
  1491  		}
  1492  
  1493  		// If we overlapped with a memtable in prepare wait for the flush to
  1494  		// finish.
  1495  		if mem != nil {
  1496  			<-mem.flushed
  1497  		}
  1498  
  1499  		// Assign the sstables to the correct level in the LSM and apply the
  1500  		// version edit.
  1501  		ve, err = d.ingestApply(jobID, loadResult, targetLevelFunc, mut, exciseSpan)
  1502  	}
  1503  
  1504  	// Only one ingest can occur at a time because if not, one would block waiting
  1505  	// for the other to finish applying. This blocking would happen while holding
  1506  	// the commit mutex which would prevent unrelated batches from writing their
  1507  	// changes to the WAL and memtable. This will cause a bigger commit hiccup
  1508  	// during ingestion.
  1509  	d.commit.ingestSem <- struct{}{}
  1510  	d.commit.AllocateSeqNum(loadResult.fileCount, prepare, apply)
  1511  	<-d.commit.ingestSem
  1512  
  1513  	if err != nil {
  1514  		if err2 := ingestCleanup(d.objProvider, loadResult.localMeta); err2 != nil {
  1515  			d.opts.Logger.Infof("ingest cleanup failed: %v", err2)
  1516  		}
  1517  	} else {
  1518  		// Since we either created a hard link to the ingesting files, or copied
  1519  		// them over, it is safe to remove the originals paths.
  1520  		for _, path := range loadResult.localPaths {
  1521  			if err2 := d.opts.FS.Remove(path); err2 != nil {
  1522  				d.opts.Logger.Infof("ingest failed to remove original file: %s", err2)
  1523  			}
  1524  		}
  1525  	}
  1526  
  1527  	if invariants.Enabled {
  1528  		for _, sharedMeta := range loadResult.sharedMeta {
  1529  			d.checkVirtualBounds(sharedMeta)
  1530  		}
  1531  	}
  1532  
  1533  	info := TableIngestInfo{
  1534  		JobID:     jobID,
  1535  		Err:       err,
  1536  		flushable: asFlushable,
  1537  	}
  1538  	if len(loadResult.localMeta) > 0 {
  1539  		info.GlobalSeqNum = loadResult.localMeta[0].SmallestSeqNum
  1540  	} else if len(loadResult.sharedMeta) > 0 {
  1541  		info.GlobalSeqNum = loadResult.sharedMeta[0].SmallestSeqNum
  1542  	} else {
  1543  		info.GlobalSeqNum = loadResult.externalMeta[0].SmallestSeqNum
  1544  	}
  1545  	var stats IngestOperationStats
  1546  	if ve != nil {
  1547  		info.Tables = make([]struct {
  1548  			TableInfo
  1549  			Level int
  1550  		}, len(ve.NewFiles))
  1551  		for i := range ve.NewFiles {
  1552  			e := &ve.NewFiles[i]
  1553  			info.Tables[i].Level = e.Level
  1554  			info.Tables[i].TableInfo = e.Meta.TableInfo()
  1555  			stats.Bytes += e.Meta.Size
  1556  			if e.Level == 0 {
  1557  				stats.ApproxIngestedIntoL0Bytes += e.Meta.Size
  1558  			}
  1559  			if i < len(metaFlushableOverlaps) && metaFlushableOverlaps[i] {
  1560  				stats.MemtableOverlappingFiles++
  1561  			}
  1562  		}
  1563  	} else if asFlushable {
  1564  		// NB: If asFlushable == true, there are no shared sstables.
  1565  		info.Tables = make([]struct {
  1566  			TableInfo
  1567  			Level int
  1568  		}, len(loadResult.localMeta))
  1569  		for i, f := range loadResult.localMeta {
  1570  			info.Tables[i].Level = -1
  1571  			info.Tables[i].TableInfo = f.TableInfo()
  1572  			stats.Bytes += f.Size
  1573  			// We don't have exact stats on which files will be ingested into
  1574  			// L0, because actual ingestion into the LSM has been deferred until
  1575  			// flush time. Instead, we infer based on memtable overlap.
  1576  			//
  1577  			// TODO(jackson): If we optimistically compute data overlap (#2112)
  1578  			// before entering the commit pipeline, we can use that overlap to
  1579  			// improve our approximation by incorporating overlap with L0, not
  1580  			// just memtables.
  1581  			if metaFlushableOverlaps[i] {
  1582  				stats.ApproxIngestedIntoL0Bytes += f.Size
  1583  				stats.MemtableOverlappingFiles++
  1584  			}
  1585  		}
  1586  	}
  1587  	d.opts.EventListener.TableIngested(info)
  1588  
  1589  	return stats, err
  1590  }
  1591  
  1592  // excise updates ve to include a replacement of the file m with new virtual
  1593  // sstables that exclude exciseSpan, returning a slice of newly-created files if
  1594  // any. If the entirety of m is deleted by exciseSpan, no new sstables are added
  1595  // and m is deleted. Note that ve is updated in-place.
  1596  //
  1597  // The manifest lock must be held when calling this method.
  1598  func (d *DB) excise(
  1599  	exciseSpan KeyRange, m *fileMetadata, ve *versionEdit, level int,
  1600  ) ([]manifest.NewFileEntry, error) {
  1601  	numCreatedFiles := 0
  1602  	// Check if there's actually an overlap between m and exciseSpan.
  1603  	if !exciseSpan.Overlaps(d.cmp, m) {
  1604  		return nil, nil
  1605  	}
  1606  	ve.DeletedFiles[deletedFileEntry{
  1607  		Level:   level,
  1608  		FileNum: m.FileNum,
  1609  	}] = m
  1610  	// Fast path: m sits entirely within the exciseSpan, so just delete it.
  1611  	if exciseSpan.Contains(d.cmp, m.Smallest) && exciseSpan.Contains(d.cmp, m.Largest) {
  1612  		return nil, nil
  1613  	}
  1614  	var iter internalIterator
  1615  	var rangeDelIter keyspan.FragmentIterator
  1616  	var rangeKeyIter keyspan.FragmentIterator
  1617  	needsBacking := false
  1618  	// Create a file to the left of the excise span, if necessary.
  1619  	// The bounds of this file will be [m.Smallest, lastKeyBefore(exciseSpan.Start)].
  1620  	//
  1621  	// We create bounds that are tight on user keys, and we make the effort to find
  1622  	// the last key in the original sstable that's smaller than exciseSpan.Start
  1623  	// even though it requires some sstable reads. We could choose to create
  1624  	// virtual sstables on loose userKey bounds, in which case we could just set
  1625  	// leftFile.Largest to an exclusive sentinel at exciseSpan.Start. The biggest
  1626  	// issue with that approach would be that it'd lead to lots of small virtual
  1627  	// sstables in the LSM that have no guarantee on containing even a single user
  1628  	// key within the file bounds. This has the potential to increase both read and
  1629  	// write-amp as we will be opening up these sstables only to find no relevant
  1630  	// keys in the read path, and compacting sstables on top of them instead of
  1631  	// directly into the space occupied by them. We choose to incur the cost of
  1632  	// calculating tight bounds at this time instead of creating more work in the
  1633  	// future.
  1634  	//
  1635  	// TODO(bilal): Some of this work can happen without grabbing the manifest
  1636  	// lock; we could grab one currentVersion, release the lock, calculate excised
  1637  	// files, then grab the lock again and recalculate for just the files that
  1638  	// have changed since our previous calculation. Do this optimiaztino as part of
  1639  	// https://github.com/cockroachdb/pebble/issues/2112 .
  1640  	if d.cmp(m.Smallest.UserKey, exciseSpan.Start) < 0 {
  1641  		leftFile := &fileMetadata{
  1642  			Virtual:     true,
  1643  			FileBacking: m.FileBacking,
  1644  			FileNum:     d.mu.versions.getNextFileNum(),
  1645  			// Note that these are loose bounds for smallest/largest seqnums, but they're
  1646  			// sufficient for maintaining correctness.
  1647  			SmallestSeqNum: m.SmallestSeqNum,
  1648  			LargestSeqNum:  m.LargestSeqNum,
  1649  		}
  1650  		if m.HasPointKeys && !exciseSpan.Contains(d.cmp, m.SmallestPointKey) {
  1651  			// This file will contain point keys
  1652  			smallestPointKey := m.SmallestPointKey
  1653  			var err error
  1654  			iter, rangeDelIter, err = d.newIters(context.TODO(), m, &IterOptions{level: manifest.Level(level)}, internalIterOpts{})
  1655  			if err != nil {
  1656  				return nil, err
  1657  			}
  1658  			var key *InternalKey
  1659  			if iter != nil {
  1660  				defer iter.Close()
  1661  				key, _ = iter.SeekLT(exciseSpan.Start, base.SeekLTFlagsNone)
  1662  			} else {
  1663  				iter = emptyIter
  1664  			}
  1665  			if key != nil {
  1666  				leftFile.ExtendPointKeyBounds(d.cmp, smallestPointKey, key.Clone())
  1667  			}
  1668  			// Store the min of (exciseSpan.Start, rdel.End) in lastRangeDel. This
  1669  			// needs to be a copy if the key is owned by the range del iter.
  1670  			var lastRangeDel []byte
  1671  			if rangeDelIter != nil {
  1672  				defer rangeDelIter.Close()
  1673  				rdel := rangeDelIter.SeekLT(exciseSpan.Start)
  1674  				if rdel != nil {
  1675  					lastRangeDel = append(lastRangeDel[:0], rdel.End...)
  1676  					if d.cmp(lastRangeDel, exciseSpan.Start) > 0 {
  1677  						lastRangeDel = exciseSpan.Start
  1678  					}
  1679  				}
  1680  			} else {
  1681  				rangeDelIter = emptyKeyspanIter
  1682  			}
  1683  			if lastRangeDel != nil {
  1684  				leftFile.ExtendPointKeyBounds(d.cmp, smallestPointKey, base.MakeExclusiveSentinelKey(InternalKeyKindRangeDelete, lastRangeDel))
  1685  			}
  1686  		}
  1687  		if m.HasRangeKeys && !exciseSpan.Contains(d.cmp, m.SmallestRangeKey) {
  1688  			// This file will contain range keys
  1689  			var err error
  1690  			smallestRangeKey := m.SmallestRangeKey
  1691  			rangeKeyIter, err = d.tableNewRangeKeyIter(m, keyspan.SpanIterOptions{})
  1692  			if err != nil {
  1693  				return nil, err
  1694  			}
  1695  			// Store the min of (exciseSpan.Start, rkey.End) in lastRangeKey. This
  1696  			// needs to be a copy if the key is owned by the range key iter.
  1697  			var lastRangeKey []byte
  1698  			var lastRangeKeyKind InternalKeyKind
  1699  			defer rangeKeyIter.Close()
  1700  			rkey := rangeKeyIter.SeekLT(exciseSpan.Start)
  1701  			if rkey != nil {
  1702  				lastRangeKey = append(lastRangeKey[:0], rkey.End...)
  1703  				if d.cmp(lastRangeKey, exciseSpan.Start) > 0 {
  1704  					lastRangeKey = exciseSpan.Start
  1705  				}
  1706  				lastRangeKeyKind = rkey.Keys[0].Kind()
  1707  			}
  1708  			if lastRangeKey != nil {
  1709  				leftFile.ExtendRangeKeyBounds(d.cmp, smallestRangeKey, base.MakeExclusiveSentinelKey(lastRangeKeyKind, lastRangeKey))
  1710  			}
  1711  		}
  1712  		if leftFile.HasRangeKeys || leftFile.HasPointKeys {
  1713  			var err error
  1714  			leftFile.Size, err = d.tableCache.estimateSize(m, leftFile.Smallest.UserKey, leftFile.Largest.UserKey)
  1715  			if err != nil {
  1716  				return nil, err
  1717  			}
  1718  			if leftFile.Size == 0 {
  1719  				// On occasion, estimateSize gives us a low estimate, i.e. a 0 file size,
  1720  				// such as if the excised file only has range keys/dels and no point
  1721  				// keys. This can cause panics in places where we divide by file sizes.
  1722  				// Correct for it here.
  1723  				leftFile.Size = 1
  1724  			}
  1725  			if err := leftFile.Validate(d.cmp, d.opts.Comparer.FormatKey); err != nil {
  1726  				return nil, err
  1727  			}
  1728  			leftFile.ValidateVirtual(m)
  1729  			d.checkVirtualBounds(leftFile)
  1730  			ve.NewFiles = append(ve.NewFiles, newFileEntry{Level: level, Meta: leftFile})
  1731  			needsBacking = true
  1732  			numCreatedFiles++
  1733  		}
  1734  	}
  1735  	// Create a file to the right, if necessary.
  1736  	if exciseSpan.Contains(d.cmp, m.Largest) {
  1737  		// No key exists to the right of the excise span in this file.
  1738  		if needsBacking && !m.Virtual {
  1739  			// If m is virtual, then its file backing is already known to the manifest.
  1740  			// We don't need to create another file backing. Note that there must be
  1741  			// only one CreatedBackingTables entry per backing sstable. This is
  1742  			// indicated by the VersionEdit.CreatedBackingTables invariant.
  1743  			ve.CreatedBackingTables = append(ve.CreatedBackingTables, m.FileBacking)
  1744  		}
  1745  		return ve.NewFiles[len(ve.NewFiles)-numCreatedFiles:], nil
  1746  	}
  1747  	// Create a new file, rightFile, between [firstKeyAfter(exciseSpan.End), m.Largest].
  1748  	//
  1749  	// See comment before the definition of leftFile for the motivation behind
  1750  	// calculating tight user-key bounds.
  1751  	rightFile := &fileMetadata{
  1752  		Virtual:     true,
  1753  		FileBacking: m.FileBacking,
  1754  		FileNum:     d.mu.versions.getNextFileNum(),
  1755  		// Note that these are loose bounds for smallest/largest seqnums, but they're
  1756  		// sufficient for maintaining correctness.
  1757  		SmallestSeqNum: m.SmallestSeqNum,
  1758  		LargestSeqNum:  m.LargestSeqNum,
  1759  	}
  1760  	if m.HasPointKeys && !exciseSpan.Contains(d.cmp, m.LargestPointKey) {
  1761  		// This file will contain point keys
  1762  		largestPointKey := m.LargestPointKey
  1763  		var err error
  1764  		if iter == nil && rangeDelIter == nil {
  1765  			iter, rangeDelIter, err = d.newIters(context.TODO(), m, &IterOptions{level: manifest.Level(level)}, internalIterOpts{})
  1766  			if err != nil {
  1767  				return nil, err
  1768  			}
  1769  			if iter != nil {
  1770  				defer iter.Close()
  1771  			} else {
  1772  				iter = emptyIter
  1773  			}
  1774  			if rangeDelIter != nil {
  1775  				defer rangeDelIter.Close()
  1776  			} else {
  1777  				rangeDelIter = emptyKeyspanIter
  1778  			}
  1779  		}
  1780  		key, _ := iter.SeekGE(exciseSpan.End, base.SeekGEFlagsNone)
  1781  		if key != nil {
  1782  			rightFile.ExtendPointKeyBounds(d.cmp, key.Clone(), largestPointKey)
  1783  		}
  1784  		// Store the max of (exciseSpan.End, rdel.Start) in firstRangeDel. This
  1785  		// needs to be a copy if the key is owned by the range del iter.
  1786  		var firstRangeDel []byte
  1787  		rdel := rangeDelIter.SeekGE(exciseSpan.End)
  1788  		if rdel != nil {
  1789  			firstRangeDel = append(firstRangeDel[:0], rdel.Start...)
  1790  			if d.cmp(firstRangeDel, exciseSpan.End) < 0 {
  1791  				firstRangeDel = exciseSpan.End
  1792  			}
  1793  		}
  1794  		if firstRangeDel != nil {
  1795  			smallestPointKey := rdel.SmallestKey()
  1796  			smallestPointKey.UserKey = firstRangeDel
  1797  			rightFile.ExtendPointKeyBounds(d.cmp, smallestPointKey, largestPointKey)
  1798  		}
  1799  	}
  1800  	if m.HasRangeKeys && !exciseSpan.Contains(d.cmp, m.LargestRangeKey) {
  1801  		// This file will contain range keys.
  1802  		largestRangeKey := m.LargestRangeKey
  1803  		if rangeKeyIter == nil {
  1804  			var err error
  1805  			rangeKeyIter, err = d.tableNewRangeKeyIter(m, keyspan.SpanIterOptions{})
  1806  			if err != nil {
  1807  				return nil, err
  1808  			}
  1809  			defer rangeKeyIter.Close()
  1810  		}
  1811  		// Store the max of (exciseSpan.End, rkey.Start) in firstRangeKey. This
  1812  		// needs to be a copy if the key is owned by the range key iter.
  1813  		var firstRangeKey []byte
  1814  		rkey := rangeKeyIter.SeekGE(exciseSpan.End)
  1815  		if rkey != nil {
  1816  			firstRangeKey = append(firstRangeKey[:0], rkey.Start...)
  1817  			if d.cmp(firstRangeKey, exciseSpan.End) < 0 {
  1818  				firstRangeKey = exciseSpan.End
  1819  			}
  1820  		}
  1821  		if firstRangeKey != nil {
  1822  			smallestRangeKey := rkey.SmallestKey()
  1823  			smallestRangeKey.UserKey = firstRangeKey
  1824  			// We call ExtendRangeKeyBounds so any internal boundType fields are
  1825  			// set correctly. Note that this is mildly wasteful as we'll be comparing
  1826  			// rightFile.{Smallest,Largest}RangeKey with themselves, which can be
  1827  			// avoided if we exported ExtendOverallKeyBounds or so.
  1828  			rightFile.ExtendRangeKeyBounds(d.cmp, smallestRangeKey, largestRangeKey)
  1829  		}
  1830  	}
  1831  	if rightFile.HasRangeKeys || rightFile.HasPointKeys {
  1832  		var err error
  1833  		rightFile.Size, err = d.tableCache.estimateSize(m, rightFile.Smallest.UserKey, rightFile.Largest.UserKey)
  1834  		if err != nil {
  1835  			return nil, err
  1836  		}
  1837  		if rightFile.Size == 0 {
  1838  			// On occasion, estimateSize gives us a low estimate, i.e. a 0 file size,
  1839  			// such as if the excised file only has range keys/dels and no point keys.
  1840  			// This can cause panics in places where we divide by file sizes. Correct
  1841  			// for it here.
  1842  			rightFile.Size = 1
  1843  		}
  1844  		rightFile.ValidateVirtual(m)
  1845  		d.checkVirtualBounds(rightFile)
  1846  		ve.NewFiles = append(ve.NewFiles, newFileEntry{Level: level, Meta: rightFile})
  1847  		needsBacking = true
  1848  		numCreatedFiles++
  1849  	}
  1850  
  1851  	if needsBacking && !m.Virtual {
  1852  		// If m is virtual, then its file backing is already known to the manifest.
  1853  		// We don't need to create another file backing. Note that there must be
  1854  		// only one CreatedBackingTables entry per backing sstable. This is
  1855  		// indicated by the VersionEdit.CreatedBackingTables invariant.
  1856  		ve.CreatedBackingTables = append(ve.CreatedBackingTables, m.FileBacking)
  1857  	}
  1858  
  1859  	if err := rightFile.Validate(d.cmp, d.opts.Comparer.FormatKey); err != nil {
  1860  		return nil, err
  1861  	}
  1862  	return ve.NewFiles[len(ve.NewFiles)-numCreatedFiles:], nil
  1863  }
  1864  
// ingestTargetLevelFunc is the signature of the function that ingestApply
// invokes once per non-shared ingested sstable (meta) to decide which level of
// the version v the sstable should be placed in.
//
// It returns the chosen level, an optional existing file in that level that
// should be split to make room for meta (ingestApply queues a split only when
// this is non-nil), and an error. ingestApply passes the DB's in-progress
// compactions as compactions and whether ingest-time splitting is enabled as
// suggestSplit.
//
// NOTE(review): presumably a non-nil split file is only ever returned when
// suggestSplit is true — confirm against the implementation.
type ingestTargetLevelFunc func(
	newIters tableNewIters,
	newRangeKeyIter keyspan.TableNewSpanIter,
	iterOps IterOptions,
	comparer *Comparer,
	v *version,
	baseLevel int,
	compactions map[*compaction]struct{},
	meta *fileMetadata,
	suggestSplit bool,
) (int, *fileMetadata, error)
  1876  
// ingestSplitFile pairs a file being ingested with the existing file that has
// to be split so that the ingested file can be placed in the same level. A
// slice of these is built up by ingestApply and consumed by ingestSplit.
type ingestSplitFile struct {
	// ingestFile is the file being ingested.
	ingestFile *fileMetadata
	// splitFile is the file that needs to be split to allow ingestFile to slot
	// into `level` level.
	splitFile *fileMetadata
	// The level where ingestFile will go (and where splitFile already is).
	level int
}
  1886  
  1887  // ingestSplit splits files specified in `files` and updates ve in-place to
  1888  // account for existing files getting split into two virtual sstables. The map
  1889  // `replacedFiles` contains an in-progress map of all files that have been
  1890  // replaced with new virtual sstables in this version edit so far, which is also
  1891  // updated in-place.
  1892  //
  1893  // d.mu as well as the manifest lock must be held when calling this method.
  1894  func (d *DB) ingestSplit(
  1895  	ve *versionEdit,
  1896  	updateMetrics func(*fileMetadata, int, []newFileEntry),
  1897  	files []ingestSplitFile,
  1898  	replacedFiles map[base.FileNum][]newFileEntry,
  1899  ) error {
  1900  	for _, s := range files {
  1901  		// replacedFiles can be thought of as a tree, where we start iterating with
  1902  		// s.splitFile and run its fileNum through replacedFiles, then find which of
  1903  		// the replaced files overlaps with s.ingestFile, which becomes the new
  1904  		// splitFile, then we check splitFile's replacements in replacedFiles again
  1905  		// for overlap with s.ingestFile, and so on until we either can't find the
  1906  		// current splitFile in replacedFiles (i.e. that's the file that now needs to
  1907  		// be split), or we don't find a file that overlaps with s.ingestFile, which
  1908  		// means a prior ingest split already produced enough room for s.ingestFile
  1909  		// to go into this level without necessitating another ingest split.
  1910  		splitFile := s.splitFile
  1911  		for splitFile != nil {
  1912  			replaced, ok := replacedFiles[splitFile.FileNum]
  1913  			if !ok {
  1914  				break
  1915  			}
  1916  			updatedSplitFile := false
  1917  			for i := range replaced {
  1918  				if replaced[i].Meta.Overlaps(d.cmp, s.ingestFile.Smallest.UserKey, s.ingestFile.Largest.UserKey, s.ingestFile.Largest.IsExclusiveSentinel()) {
  1919  					if updatedSplitFile {
  1920  						// This should never happen because the earlier ingestTargetLevel
  1921  						// function only finds split file candidates that are guaranteed to
  1922  						// have no data overlap, only boundary overlap. See the comments
  1923  						// in that method to see the definitions of data vs boundary
  1924  						// overlap. That, plus the fact that files in `replaced` are
  1925  						// guaranteed to have file bounds that are tight on user keys
  1926  						// (as that's what `d.excise` produces), means that the only case
  1927  						// where we overlap with two or more files in `replaced` is if we
  1928  						// actually had data overlap all along, or if the ingestion files
  1929  						// were overlapping, either of which is an invariant violation.
  1930  						panic("updated with two files in ingestSplit")
  1931  					}
  1932  					splitFile = replaced[i].Meta
  1933  					updatedSplitFile = true
  1934  				}
  1935  			}
  1936  			if !updatedSplitFile {
  1937  				// None of the replaced files overlapped with the file being ingested.
  1938  				// This can happen if we've already excised a span overlapping with
  1939  				// this file, or if we have consecutive ingested files that can slide
  1940  				// within the same gap between keys in an existing file. For instance,
  1941  				// if an existing file has keys a and g and we're ingesting b-c, d-e,
  1942  				// the first loop iteration will split the existing file into one that
  1943  				// ends in a and another that starts at g, and the second iteration will
  1944  				// fall into this case and require no splitting.
  1945  				//
  1946  				// No splitting necessary.
  1947  				splitFile = nil
  1948  			}
  1949  		}
  1950  		if splitFile == nil {
  1951  			continue
  1952  		}
  1953  		// NB: excise operates on [start, end). We're splitting at [start, end]
  1954  		// (assuming !s.ingestFile.Largest.IsExclusiveSentinel()). The conflation
  1955  		// of exclusive vs inclusive end bounds should not make a difference here
  1956  		// as we're guaranteed to not have any data overlap between splitFile and
  1957  		// s.ingestFile, so panic if we do see a newly added file with an endKey
  1958  		// equalling s.ingestFile.Largest, and !s.ingestFile.Largest.IsExclusiveSentinel()
  1959  		added, err := d.excise(KeyRange{Start: s.ingestFile.Smallest.UserKey, End: s.ingestFile.Largest.UserKey}, splitFile, ve, s.level)
  1960  		if err != nil {
  1961  			return err
  1962  		}
  1963  		if _, ok := ve.DeletedFiles[deletedFileEntry{
  1964  			Level:   s.level,
  1965  			FileNum: splitFile.FileNum,
  1966  		}]; !ok {
  1967  			panic("did not split file that was expected to be split")
  1968  		}
  1969  		replacedFiles[splitFile.FileNum] = added
  1970  		for i := range added {
  1971  			if s.ingestFile.Overlaps(d.cmp, added[i].Meta.Smallest.UserKey, added[i].Meta.Largest.UserKey, added[i].Meta.Largest.IsExclusiveSentinel()) {
  1972  				panic("ingest-time split produced a file that overlaps with ingested file")
  1973  			}
  1974  		}
  1975  		updateMetrics(splitFile, s.level, added)
  1976  	}
  1977  	// Flatten the version edit by removing any entries from ve.NewFiles that
  1978  	// are also in ve.DeletedFiles.
  1979  	newNewFiles := ve.NewFiles[:0]
  1980  	for i := range ve.NewFiles {
  1981  		fn := ve.NewFiles[i].Meta.FileNum
  1982  		deEntry := deletedFileEntry{Level: ve.NewFiles[i].Level, FileNum: fn}
  1983  		if _, ok := ve.DeletedFiles[deEntry]; ok {
  1984  			delete(ve.DeletedFiles, deEntry)
  1985  		} else {
  1986  			newNewFiles = append(newNewFiles, ve.NewFiles[i])
  1987  		}
  1988  	}
  1989  	ve.NewFiles = newNewFiles
  1990  	return nil
  1991  }
  1992  
  1993  func (d *DB) ingestApply(
  1994  	jobID int,
  1995  	lr ingestLoadResult,
  1996  	findTargetLevel ingestTargetLevelFunc,
  1997  	mut *memTable,
  1998  	exciseSpan KeyRange,
  1999  ) (*versionEdit, error) {
  2000  	d.mu.Lock()
  2001  	defer d.mu.Unlock()
  2002  
  2003  	ve := &versionEdit{
  2004  		NewFiles: make([]newFileEntry, lr.fileCount),
  2005  	}
  2006  	if exciseSpan.Valid() || (d.opts.Experimental.IngestSplit != nil && d.opts.Experimental.IngestSplit()) {
  2007  		ve.DeletedFiles = map[manifest.DeletedFileEntry]*manifest.FileMetadata{}
  2008  	}
  2009  	metrics := make(map[int]*LevelMetrics)
  2010  
  2011  	// Lock the manifest for writing before we use the current version to
  2012  	// determine the target level. This prevents two concurrent ingestion jobs
  2013  	// from using the same version to determine the target level, and also
  2014  	// provides serialization with concurrent compaction and flush jobs.
  2015  	// logAndApply unconditionally releases the manifest lock, but any earlier
  2016  	// returns must unlock the manifest.
  2017  	d.mu.versions.logLock()
  2018  
  2019  	if mut != nil {
  2020  		// Unref the mutable memtable to allows its flush to proceed. Now that we've
  2021  		// acquired the manifest lock, we can be certain that if the mutable
  2022  		// memtable has received more recent conflicting writes, the flush won't
  2023  		// beat us to applying to the manifest resulting in sequence number
  2024  		// inversion. Even though we call maybeScheduleFlush right now, this flush
  2025  		// will apply after our ingestion.
  2026  		if mut.writerUnref() {
  2027  			d.maybeScheduleFlush()
  2028  		}
  2029  	}
  2030  
  2031  	shouldIngestSplit := d.opts.Experimental.IngestSplit != nil &&
  2032  		d.opts.Experimental.IngestSplit() && d.FormatMajorVersion() >= FormatVirtualSSTables
  2033  	current := d.mu.versions.currentVersion()
  2034  	baseLevel := d.mu.versions.picker.getBaseLevel()
  2035  	iterOps := IterOptions{logger: d.opts.Logger}
  2036  	// filesToSplit is a list where each element is a pair consisting of a file
  2037  	// being ingested and a file being split to make room for an ingestion into
  2038  	// that level. Each ingested file will appear at most once in this list. It
  2039  	// is possible for split files to appear twice in this list.
  2040  	filesToSplit := make([]ingestSplitFile, 0)
  2041  	checkCompactions := false
  2042  	for i := 0; i < lr.fileCount; i++ {
  2043  		// Determine the lowest level in the LSM for which the sstable doesn't
  2044  		// overlap any existing files in the level.
  2045  		var m *fileMetadata
  2046  		sharedIdx := -1
  2047  		sharedLevel := -1
  2048  		externalFile := false
  2049  		if i < len(lr.localMeta) {
  2050  			// local file.
  2051  			m = lr.localMeta[i]
  2052  		} else if (i - len(lr.localMeta)) < len(lr.sharedMeta) {
  2053  			// shared file.
  2054  			sharedIdx = i - len(lr.localMeta)
  2055  			m = lr.sharedMeta[sharedIdx]
  2056  			sharedLevel = int(lr.sharedLevels[sharedIdx])
  2057  		} else {
  2058  			// external file.
  2059  			externalFile = true
  2060  			m = lr.externalMeta[i-(len(lr.localMeta)+len(lr.sharedMeta))]
  2061  		}
  2062  		f := &ve.NewFiles[i]
  2063  		var err error
  2064  		if sharedIdx >= 0 {
  2065  			f.Level = sharedLevel
  2066  			if f.Level < sharedLevelsStart {
  2067  				panic("cannot slot a shared file higher than the highest shared level")
  2068  			}
  2069  			ve.CreatedBackingTables = append(ve.CreatedBackingTables, m.FileBacking)
  2070  		} else {
  2071  			if externalFile {
  2072  				ve.CreatedBackingTables = append(ve.CreatedBackingTables, m.FileBacking)
  2073  			}
  2074  			var splitFile *fileMetadata
  2075  			if exciseSpan.Valid() && exciseSpan.Contains(d.cmp, m.Smallest) && exciseSpan.Contains(d.cmp, m.Largest) {
  2076  				// This file fits perfectly within the excise span. We can slot it at
  2077  				// L6, or sharedLevelsStart - 1 if we have shared files.
  2078  				if len(lr.sharedMeta) > 0 {
  2079  					f.Level = sharedLevelsStart - 1
  2080  					if baseLevel > f.Level {
  2081  						f.Level = 0
  2082  					}
  2083  				} else {
  2084  					f.Level = 6
  2085  				}
  2086  			} else {
  2087  				// TODO(bilal): findTargetLevel does disk IO (reading files for data
  2088  				// overlap) even though we're holding onto d.mu. Consider unlocking
  2089  				// d.mu while we do this. We already hold versions.logLock so we should
  2090  				// not see any version applications while we're at this. The one
  2091  				// complication here would be pulling out the mu.compact.inProgress
  2092  				// check from findTargetLevel, as that requires d.mu to be held.
  2093  				f.Level, splitFile, err = findTargetLevel(
  2094  					d.newIters, d.tableNewRangeKeyIter, iterOps, d.opts.Comparer, current, baseLevel, d.mu.compact.inProgress, m, shouldIngestSplit)
  2095  			}
  2096  
  2097  			if splitFile != nil {
  2098  				if invariants.Enabled {
  2099  					if lf := current.Levels[f.Level].Find(d.cmp, splitFile); lf == nil {
  2100  						panic("splitFile returned is not in level it should be")
  2101  					}
  2102  				}
  2103  				// We take advantage of the fact that we won't drop the db mutex
  2104  				// between now and the call to logAndApply. So, no files should
  2105  				// get added to a new in-progress compaction at this point. We can
  2106  				// avoid having to iterate on in-progress compactions to cancel them
  2107  				// if none of the files being split have a compacting state.
  2108  				if splitFile.IsCompacting() {
  2109  					checkCompactions = true
  2110  				}
  2111  				filesToSplit = append(filesToSplit, ingestSplitFile{ingestFile: m, splitFile: splitFile, level: f.Level})
  2112  			}
  2113  		}
  2114  		if err != nil {
  2115  			d.mu.versions.logUnlock()
  2116  			return nil, err
  2117  		}
  2118  		f.Meta = m
  2119  		levelMetrics := metrics[f.Level]
  2120  		if levelMetrics == nil {
  2121  			levelMetrics = &LevelMetrics{}
  2122  			metrics[f.Level] = levelMetrics
  2123  		}
  2124  		levelMetrics.NumFiles++
  2125  		levelMetrics.Size += int64(m.Size)
  2126  		levelMetrics.BytesIngested += m.Size
  2127  		levelMetrics.TablesIngested++
  2128  	}
  2129  	// replacedFiles maps files excised due to exciseSpan (or splitFiles returned
  2130  	// by ingestTargetLevel), to files that were created to replace it. This map
  2131  	// is used to resolve references to split files in filesToSplit, as it is
  2132  	// possible for a file that we want to split to no longer exist or have a
  2133  	// newer fileMetadata due to a split induced by another ingestion file, or an
  2134  	// excise.
  2135  	replacedFiles := make(map[base.FileNum][]newFileEntry)
  2136  	updateLevelMetricsOnExcise := func(m *fileMetadata, level int, added []newFileEntry) {
  2137  		levelMetrics := metrics[level]
  2138  		if levelMetrics == nil {
  2139  			levelMetrics = &LevelMetrics{}
  2140  			metrics[level] = levelMetrics
  2141  		}
  2142  		levelMetrics.NumFiles--
  2143  		levelMetrics.Size -= int64(m.Size)
  2144  		for i := range added {
  2145  			levelMetrics.NumFiles++
  2146  			levelMetrics.Size += int64(added[i].Meta.Size)
  2147  		}
  2148  	}
  2149  	if exciseSpan.Valid() {
  2150  		// Iterate through all levels and find files that intersect with exciseSpan.
  2151  		//
  2152  		// TODO(bilal): We could drop the DB mutex here as we don't need it for
  2153  		// excises; we only need to hold the version lock which we already are
  2154  		// holding. However releasing the DB mutex could mess with the
  2155  		// ingestTargetLevel calculation that happened above, as it assumed that it
  2156  		// had a complete view of in-progress compactions that wouldn't change
  2157  		// until logAndApply is called. If we were to drop the mutex now, we could
  2158  		// schedule another in-progress compaction that would go into the chosen target
  2159  		// level and lead to file overlap within level (which would panic in
  2160  		// logAndApply). We should drop the db mutex here, do the excise, then
  2161  		// re-grab the DB mutex and rerun just the in-progress compaction check to
  2162  		// see if any new compactions are conflicting with our chosen target levels
  2163  		// for files, and if they are, we should signal those compactions to error
  2164  		// out.
  2165  		for level := range current.Levels {
  2166  			overlaps := current.Overlaps(level, d.cmp, exciseSpan.Start, exciseSpan.End, true /* exclusiveEnd */)
  2167  			iter := overlaps.Iter()
  2168  
  2169  			for m := iter.First(); m != nil; m = iter.Next() {
  2170  				newFiles, err := d.excise(exciseSpan, m, ve, level)
  2171  				if err != nil {
  2172  					return nil, err
  2173  				}
  2174  
  2175  				if _, ok := ve.DeletedFiles[deletedFileEntry{
  2176  					Level:   level,
  2177  					FileNum: m.FileNum,
  2178  				}]; !ok {
  2179  					// We did not excise this file.
  2180  					continue
  2181  				}
  2182  				replacedFiles[m.FileNum] = newFiles
  2183  				updateLevelMetricsOnExcise(m, level, newFiles)
  2184  			}
  2185  		}
  2186  	}
  2187  	if len(filesToSplit) > 0 {
  2188  		// For the same reasons as the above call to excise, we hold the db mutex
  2189  		// while calling this method.
  2190  		if err := d.ingestSplit(ve, updateLevelMetricsOnExcise, filesToSplit, replacedFiles); err != nil {
  2191  			return nil, err
  2192  		}
  2193  	}
  2194  	if len(filesToSplit) > 0 || exciseSpan.Valid() {
  2195  		for c := range d.mu.compact.inProgress {
  2196  			if c.versionEditApplied {
  2197  				continue
  2198  			}
  2199  			// Check if this compaction overlaps with the excise span. Note that just
  2200  			// checking if the inputs individually overlap with the excise span
  2201  			// isn't sufficient; for instance, a compaction could have [a,b] and [e,f]
  2202  			// as inputs and write it all out as [a,b,e,f] in one sstable. If we're
  2203  			// doing a [c,d) excise at the same time as this compaction, we will have
  2204  			// to error out the whole compaction as we can't guarantee it hasn't/won't
  2205  			// write a file overlapping with the excise span.
  2206  			if exciseSpan.OverlapsInternalKeyRange(d.cmp, c.smallest, c.largest) {
  2207  				c.cancel.Store(true)
  2208  			}
  2209  			// Check if this compaction's inputs have been replaced due to an
  2210  			// ingest-time split. In that case, cancel the compaction as a newly picked
  2211  			// compaction would need to include any new files that slid in between
  2212  			// previously-existing files. Note that we cancel any compaction that has a
  2213  			// file that was ingest-split as an input, even if it started before this
  2214  			// ingestion.
  2215  			if checkCompactions {
  2216  				for i := range c.inputs {
  2217  					iter := c.inputs[i].files.Iter()
  2218  					for f := iter.First(); f != nil; f = iter.Next() {
  2219  						if _, ok := replacedFiles[f.FileNum]; ok {
  2220  							c.cancel.Store(true)
  2221  							break
  2222  						}
  2223  					}
  2224  				}
  2225  			}
  2226  		}
  2227  		// Check for any EventuallyFileOnlySnapshots that could be watching for
  2228  		// an excise on this span.
  2229  		if exciseSpan.Valid() {
  2230  			for s := d.mu.snapshots.root.next; s != &d.mu.snapshots.root; s = s.next {
  2231  				if s.efos == nil {
  2232  					continue
  2233  				}
  2234  				efos := s.efos
  2235  				// TODO(bilal): We can make this faster by taking advantage of the sorted
  2236  				// nature of protectedRanges to do a sort.Search, or even maintaining a
  2237  				// global list of all protected ranges instead of having to peer into every
  2238  				// snapshot.
  2239  				for i := range efos.protectedRanges {
  2240  					if efos.protectedRanges[i].OverlapsKeyRange(d.cmp, exciseSpan) {
  2241  						efos.excised.Store(true)
  2242  						break
  2243  					}
  2244  				}
  2245  			}
  2246  		}
  2247  	}
  2248  	if err := d.mu.versions.logAndApply(jobID, ve, metrics, false /* forceRotation */, func() []compactionInfo {
  2249  		return d.getInProgressCompactionInfoLocked(nil)
  2250  	}); err != nil {
  2251  		return nil, err
  2252  	}
  2253  
  2254  	d.mu.versions.metrics.Ingest.Count++
  2255  
  2256  	d.updateReadStateLocked(d.opts.DebugCheck)
  2257  	// updateReadStateLocked could have generated obsolete tables, schedule a
  2258  	// cleanup job if necessary.
  2259  	d.deleteObsoleteFiles(jobID)
  2260  	d.updateTableStatsLocked(ve.NewFiles)
  2261  	// The ingestion may have pushed a level over the threshold for compaction,
  2262  	// so check to see if one is necessary and schedule it.
  2263  	d.maybeScheduleCompaction()
  2264  	var toValidate []manifest.NewFileEntry
  2265  	dedup := make(map[base.DiskFileNum]struct{})
  2266  	for _, entry := range ve.NewFiles {
  2267  		if _, ok := dedup[entry.Meta.FileBacking.DiskFileNum]; !ok {
  2268  			toValidate = append(toValidate, entry)
  2269  			dedup[entry.Meta.FileBacking.DiskFileNum] = struct{}{}
  2270  		}
  2271  	}
  2272  	d.maybeValidateSSTablesLocked(toValidate)
  2273  	return ve, nil
  2274  }
  2275  
  2276  // maybeValidateSSTablesLocked adds the slice of newFileEntrys to the pending
  2277  // queue of files to be validated, when the feature is enabled.
  2278  //
  2279  // Note that if two entries with the same backing file are added twice, then the
  2280  // block checksums for the backing file will be validated twice.
  2281  //
  2282  // DB.mu must be locked when calling.
  2283  func (d *DB) maybeValidateSSTablesLocked(newFiles []newFileEntry) {
  2284  	// Only add to the validation queue when the feature is enabled.
  2285  	if !d.opts.Experimental.ValidateOnIngest {
  2286  		return
  2287  	}
  2288  
  2289  	d.mu.tableValidation.pending = append(d.mu.tableValidation.pending, newFiles...)
  2290  	if d.shouldValidateSSTablesLocked() {
  2291  		go d.validateSSTables()
  2292  	}
  2293  }
  2294  
  2295  // shouldValidateSSTablesLocked returns true if SSTable validation should run.
  2296  // DB.mu must be locked when calling.
  2297  func (d *DB) shouldValidateSSTablesLocked() bool {
  2298  	return !d.mu.tableValidation.validating &&
  2299  		d.closed.Load() == nil &&
  2300  		d.opts.Experimental.ValidateOnIngest &&
  2301  		len(d.mu.tableValidation.pending) > 0
  2302  }
  2303  
  2304  // validateSSTables runs a round of validation on the tables in the pending
  2305  // queue.
  2306  func (d *DB) validateSSTables() {
  2307  	d.mu.Lock()
  2308  	if !d.shouldValidateSSTablesLocked() {
  2309  		d.mu.Unlock()
  2310  		return
  2311  	}
  2312  
  2313  	pending := d.mu.tableValidation.pending
  2314  	d.mu.tableValidation.pending = nil
  2315  	d.mu.tableValidation.validating = true
  2316  	jobID := d.mu.nextJobID
  2317  	d.mu.nextJobID++
  2318  	rs := d.loadReadState()
  2319  
  2320  	// Drop DB.mu before performing IO.
  2321  	d.mu.Unlock()
  2322  
  2323  	// Validate all tables in the pending queue. This could lead to a situation
  2324  	// where we are starving IO from other tasks due to having to page through
  2325  	// all the blocks in all the sstables in the queue.
  2326  	// TODO(travers): Add some form of pacing to avoid IO starvation.
  2327  	for _, f := range pending {
  2328  		// The file may have been moved or deleted since it was ingested, in
  2329  		// which case we skip.
  2330  		if !rs.current.Contains(f.Level, d.cmp, f.Meta) {
  2331  			// Assume the file was moved to a lower level. It is rare enough
  2332  			// that a table is moved or deleted between the time it was ingested
  2333  			// and the time the validation routine runs that the overall cost of
  2334  			// this inner loop is tolerably low, when amortized over all
  2335  			// ingested tables.
  2336  			found := false
  2337  			for i := f.Level + 1; i < numLevels; i++ {
  2338  				if rs.current.Contains(i, d.cmp, f.Meta) {
  2339  					found = true
  2340  					break
  2341  				}
  2342  			}
  2343  			if !found {
  2344  				continue
  2345  			}
  2346  		}
  2347  
  2348  		var err error
  2349  		if f.Meta.Virtual {
  2350  			err = d.tableCache.withVirtualReader(
  2351  				f.Meta.VirtualMeta(), func(v sstable.VirtualReader) error {
  2352  					return v.ValidateBlockChecksumsOnBacking()
  2353  				})
  2354  		} else {
  2355  			err = d.tableCache.withReader(
  2356  				f.Meta.PhysicalMeta(), func(r *sstable.Reader) error {
  2357  					return r.ValidateBlockChecksums()
  2358  				})
  2359  		}
  2360  
  2361  		if err != nil {
  2362  			// TODO(travers): Hook into the corruption reporting pipeline, once
  2363  			// available. See pebble#1192.
  2364  			d.opts.Logger.Fatalf("pebble: encountered corruption during ingestion: %s", err)
  2365  		}
  2366  
  2367  		d.opts.EventListener.TableValidated(TableValidatedInfo{
  2368  			JobID: jobID,
  2369  			Meta:  f.Meta,
  2370  		})
  2371  	}
  2372  	rs.unref()
  2373  
  2374  	d.mu.Lock()
  2375  	defer d.mu.Unlock()
  2376  	d.mu.tableValidation.validating = false
  2377  	d.mu.tableValidation.cond.Broadcast()
  2378  	if d.shouldValidateSSTablesLocked() {
  2379  		go d.validateSSTables()
  2380  	}
  2381  }