github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/ingest.go

     1  // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package pebble
     6  
     7  import (
     8  	"context"
     9  	"slices"
    10  	"sort"
    11  	"time"
    12  
    13  	"github.com/cockroachdb/errors"
    14  	"github.com/cockroachdb/pebble/internal/base"
    15  	"github.com/cockroachdb/pebble/internal/invariants"
    16  	"github.com/cockroachdb/pebble/internal/keyspan"
    17  	"github.com/cockroachdb/pebble/internal/manifest"
    18  	"github.com/cockroachdb/pebble/internal/private"
    19  	"github.com/cockroachdb/pebble/objstorage"
    20  	"github.com/cockroachdb/pebble/objstorage/remote"
    21  	"github.com/cockroachdb/pebble/sstable"
    22  )
    23  
    24  func sstableKeyCompare(userCmp Compare, a, b InternalKey) int {
    25  	c := userCmp(a.UserKey, b.UserKey)
    26  	if c != 0 {
    27  		return c
    28  	}
    29  	if a.IsExclusiveSentinel() {
    30  		if !b.IsExclusiveSentinel() {
    31  			return -1
    32  		}
    33  	} else if b.IsExclusiveSentinel() {
    34  		return +1
    35  	}
    36  	return 0
    37  }
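        // For illustration, a minimal sketch of the ordering this induces
        // (assuming cmp is base.DefaultComparer.Compare): an exclusive sentinel
        // at a user key sorts strictly before an inclusive key at the same user
        // key, so a table whose largest bound is a range-deletion sentinel at
        // "k" does not overlap a table whose smallest bound is a point key at
        // "k".
        //
        //	sentinel := base.MakeRangeDeleteSentinelKey([]byte("k"))
        //	point := base.MakeInternalKey([]byte("k"), 0, base.InternalKeyKindSet)
        //	_ = sstableKeyCompare(cmp, sentinel, point) // -1: sentinel sorts first
        //	_ = sstableKeyCompare(cmp, point, sentinel) // +1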
    38  
    39  // KeyRange encodes a key range in user key space. A KeyRange's Start is
    40  // inclusive while its End is exclusive.
    41  type KeyRange struct {
    42  	Start, End []byte
    43  }
    44  
    45  // Valid returns true if the KeyRange is defined.
    46  func (k *KeyRange) Valid() bool {
    47  	return k.Start != nil && k.End != nil
    48  }
    49  
    50  // Contains returns whether the specified key exists in the KeyRange.
    51  func (k *KeyRange) Contains(cmp base.Compare, key InternalKey) bool {
    52  	v := cmp(key.UserKey, k.End)
    53  	return (v < 0 || (v == 0 && key.IsExclusiveSentinel())) && cmp(k.Start, key.UserKey) <= 0
    54  }
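        // For example (a sketch; cmp is the DB's comparer): Start is inclusive
        // and End is exclusive, except that an exclusive sentinel key at End
        // (e.g. a range-deletion bound) is considered contained.
        //
        //	kr := KeyRange{Start: []byte("b"), End: []byte("d")}
        //	_ = kr.Contains(cmp, base.MakeInternalKey([]byte("b"), 1, base.InternalKeyKindSet)) // true
        //	_ = kr.Contains(cmp, base.MakeInternalKey([]byte("d"), 1, base.InternalKeyKindSet)) // false: End is exclusive
        //	_ = kr.Contains(cmp, base.MakeRangeDeleteSentinelKey([]byte("d")))                  // true: sentinel at End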
    55  
    56  // OverlapsInternalKeyRange checks if the specified internal key range has an
    57  // overlap with the KeyRange. Note that we aren't checking for full containment
    58  // of smallest-largest within k, rather just that there's some intersection
    59  // between the two ranges.
    60  func (k *KeyRange) OverlapsInternalKeyRange(cmp base.Compare, smallest, largest InternalKey) bool {
    61  	v := cmp(k.Start, largest.UserKey)
    62  	return v <= 0 && !(largest.IsExclusiveSentinel() && v == 0) &&
    63  		cmp(k.End, smallest.UserKey) > 0
    64  }
    65  
    66  // Overlaps checks if the specified file has an overlap with the KeyRange.
    67  // Note that we aren't checking for full containment of m within k, rather just
    68  // that there's some intersection between m and k's bounds.
    69  func (k *KeyRange) Overlaps(cmp base.Compare, m *fileMetadata) bool {
    70  	return k.OverlapsInternalKeyRange(cmp, m.Smallest, m.Largest)
    71  }
    72  
    73  // OverlapsKeyRange checks if this span overlaps with the provided KeyRange.
    74  // Note that we aren't checking for full containment of either span in the other,
    75  // just that there's a key x that is in both key ranges.
    76  func (k *KeyRange) OverlapsKeyRange(cmp Compare, span KeyRange) bool {
    77  	return cmp(k.Start, span.End) < 0 && cmp(k.End, span.Start) > 0
    78  }
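        // For example (a sketch; cmp is the DB's comparer), two adjacent
        // half-open spans do not overlap:
        //
        //	a := KeyRange{Start: []byte("a"), End: []byte("c")}
        //	b := KeyRange{Start: []byte("c"), End: []byte("e")}
        //	_ = a.OverlapsKeyRange(cmp, b)                                               // false: [a,c) and [c,e) share no key
        //	_ = a.OverlapsKeyRange(cmp, KeyRange{Start: []byte("b"), End: []byte("d")}) // true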
    79  
    80  func ingestValidateKey(opts *Options, key *InternalKey) error {
    81  	if key.Kind() == InternalKeyKindInvalid {
    82  		return base.CorruptionErrorf("pebble: external sstable has corrupted key: %s",
    83  			key.Pretty(opts.Comparer.FormatKey))
    84  	}
    85  	if key.SeqNum() != 0 {
    86  		return base.CorruptionErrorf("pebble: external sstable has non-zero seqnum: %s",
    87  			key.Pretty(opts.Comparer.FormatKey))
    88  	}
    89  	return nil
    90  }
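        // For example (a sketch; opts is assumed to hold the DB's Comparer), a
        // key carrying a nonzero sequence number is rejected, since sequence
        // numbers are assigned at ingestion time:
        //
        //	k := base.MakeInternalKey([]byte("a"), 5, base.InternalKeyKindSet)
        //	err := ingestValidateKey(opts, &k) // corruption error: non-zero seqnum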
    91  
    92  // ingestSynthesizeShared constructs a fileMetadata for one shared sstable
    93  // owned by, or shared by, another node.
    94  func ingestSynthesizeShared(
    95  	opts *Options, sm SharedSSTMeta, fileNum base.DiskFileNum,
    96  ) (*fileMetadata, error) {
    97  	if sm.Size == 0 {
    98  		// Disallow 0 file sizes
    99  		return nil, errors.New("pebble: cannot ingest shared file with size 0")
   100  	}
   101  	// Don't load table stats. Doing a round trip to shared storage, one SST
   102  	// at a time, is not worth it, as it slows down ingestion.
   103  	meta := &fileMetadata{
   104  		FileNum:      fileNum.FileNum(),
   105  		CreationTime: time.Now().Unix(),
   106  		Virtual:      true,
   107  		Size:         sm.Size,
   108  	}
   109  	meta.InitProviderBacking(fileNum)
   110  	// Set the underlying FileBacking's size to the same size as the virtualized
   111  	// view of the sstable. This ensures that we don't over-prioritize this
   112  	// sstable for compaction just yet, as we do not have a clear sense of what
   113  	// parts of this sstable are referenced by other nodes.
   114  	meta.FileBacking.Size = sm.Size
   115  	if sm.LargestRangeKey.Valid() && sm.LargestRangeKey.UserKey != nil {
   116  		// Initialize meta.{HasRangeKeys,Smallest,Largest}, etc.
   117  		//
   118  		// NB: We create new internal keys and pass them into ExtendRangeKeyBounds
   119  		// so that we can substitute a zero sequence number into the bounds. We can set
   120  		// the sequence number to anything here; it'll be reset in ingestUpdateSeqNum
   121  		// anyway. However we do need to use the same sequence number across all
   122  		// bound keys at this step so that we end up with bounds that are consistent
   123  		// across point/range keys.
   124  		smallestRangeKey := base.MakeInternalKey(sm.SmallestRangeKey.UserKey, 0, sm.SmallestRangeKey.Kind())
   125  		largestRangeKey := base.MakeExclusiveSentinelKey(sm.LargestRangeKey.Kind(), sm.LargestRangeKey.UserKey)
   126  		meta.ExtendRangeKeyBounds(opts.Comparer.Compare, smallestRangeKey, largestRangeKey)
   127  	}
   128  	if sm.LargestPointKey.Valid() && sm.LargestPointKey.UserKey != nil {
   129  		// Initialize meta.{HasPointKeys,Smallest,Largest}, etc.
   130  		//
   131  		// See the comment above the ExtendRangeKeyBounds call for why we use a zero
   132  		// sequence number here.
   133  		smallestPointKey := base.MakeInternalKey(sm.SmallestPointKey.UserKey, 0, sm.SmallestPointKey.Kind())
   134  		largestPointKey := base.MakeInternalKey(sm.LargestPointKey.UserKey, 0, sm.LargestPointKey.Kind())
   135  		if sm.LargestPointKey.IsExclusiveSentinel() {
   136  			largestPointKey = base.MakeRangeDeleteSentinelKey(sm.LargestPointKey.UserKey)
   137  		}
   138  		meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallestPointKey, largestPointKey)
   139  	}
   140  	if err := meta.Validate(opts.Comparer.Compare, opts.Comparer.FormatKey); err != nil {
   141  		return nil, err
   142  	}
   143  	return meta, nil
   144  }
   145  
   146  // ingestLoad1External loads the fileMetadata for one external sstable.
   147  // Sequence number and target level calculation happens during prepare/apply.
   148  func ingestLoad1External(
   149  	opts *Options,
   150  	e ExternalFile,
   151  	fileNum base.DiskFileNum,
   152  	objprovider objstorage.Provider,
   153  	jobID int,
   154  ) (*fileMetadata, error) {
   155  	if e.Size == 0 {
   156  		// Disallow 0 file sizes
   157  		return nil, errors.New("pebble: cannot ingest external file with size 0")
   158  	}
   159  	if !e.HasRangeKey && !e.HasPointKey {
   160  		return nil, errors.New("pebble: cannot ingest external file with no point or range keys")
   161  	}
   162  	// Don't load table stats. Doing a round trip to shared storage, one SST
   163  	// at a time, is not worth it, as it slows down ingestion.
   164  	meta := &fileMetadata{}
   165  	meta.FileNum = fileNum.FileNum()
   166  	meta.CreationTime = time.Now().Unix()
   167  	meta.Virtual = true
   168  	meta.Size = e.Size
   169  	meta.InitProviderBacking(fileNum)
   170  
   171  	// Try to resolve a reference to the external file.
   172  	backing, err := objprovider.CreateExternalObjectBacking(e.Locator, e.ObjName)
   173  	if err != nil {
   174  		return nil, err
   175  	}
   176  	metas, err := objprovider.AttachRemoteObjects([]objstorage.RemoteObjectToAttach{{
   177  		FileNum:  fileNum,
   178  		FileType: fileTypeTable,
   179  		Backing:  backing,
   180  	}})
   181  	if err != nil {
   182  		return nil, err
   183  	}
   184  	if opts.EventListener.TableCreated != nil {
   185  		opts.EventListener.TableCreated(TableCreateInfo{
   186  			JobID:   jobID,
   187  			Reason:  "ingesting",
   188  			Path:    objprovider.Path(metas[0]),
   189  			FileNum: fileNum.FileNum(),
   190  		})
   191  	}
   192  	// In the name of keeping this ingestion as fast as possible, we avoid
   193  	// *all* existence checks and synthesize a file metadata with smallest/largest
   194  	// keys that overlap whatever the passed-in span was.
   195  	smallestCopy := make([]byte, len(e.SmallestUserKey))
   196  	copy(smallestCopy, e.SmallestUserKey)
   197  	largestCopy := make([]byte, len(e.LargestUserKey))
   198  	copy(largestCopy, e.LargestUserKey)
   199  	if e.HasPointKey {
   200  		meta.ExtendPointKeyBounds(opts.Comparer.Compare, base.MakeInternalKey(smallestCopy, 0, InternalKeyKindMax),
   201  			base.MakeRangeDeleteSentinelKey(largestCopy))
   202  	}
   203  	if e.HasRangeKey {
   204  		meta.ExtendRangeKeyBounds(opts.Comparer.Compare, base.MakeInternalKey(smallestCopy, 0, InternalKeyKindRangeKeySet),
   205  			base.MakeExclusiveSentinelKey(InternalKeyKindRangeKeyDelete, largestCopy))
   206  	}
   207  
   208  	// Set the underlying FileBacking's size to the same size as the virtualized
   209  	// view of the sstable. This ensures that we don't over-prioritize this
   210  	// sstable for compaction just yet, as we do not have a clear sense of
   211  	// what parts of this sstable are referenced by other nodes.
   212  	meta.FileBacking.Size = e.Size
   213  
   214  	if err := meta.Validate(opts.Comparer.Compare, opts.Comparer.FormatKey); err != nil {
   215  		return nil, err
   216  	}
   217  	return meta, nil
   218  }
   219  
   220  // ingestLoad1 creates the FileMetadata for one file. This file will be owned
   221  // by this store.
   222  func ingestLoad1(
   223  	opts *Options,
   224  	fmv FormatMajorVersion,
   225  	readable objstorage.Readable,
   226  	cacheID uint64,
   227  	fileNum base.DiskFileNum,
   228  ) (*fileMetadata, error) {
   229  	cacheOpts := private.SSTableCacheOpts(cacheID, fileNum).(sstable.ReaderOption)
   230  	r, err := sstable.NewReader(readable, opts.MakeReaderOptions(), cacheOpts)
   231  	if err != nil {
   232  		return nil, err
   233  	}
   234  	defer r.Close()
   235  
   236  	// Avoid ingesting tables with format versions this DB doesn't support.
   237  	tf, err := r.TableFormat()
   238  	if err != nil {
   239  		return nil, err
   240  	}
   241  	if tf < fmv.MinTableFormat() || tf > fmv.MaxTableFormat() {
   242  		return nil, errors.Newf(
   243  			"pebble: table format %s is not within range supported at DB format major version %d, (%s,%s)",
   244  			tf, fmv, fmv.MinTableFormat(), fmv.MaxTableFormat(),
   245  		)
   246  	}
   247  
   248  	meta := &fileMetadata{}
   249  	meta.FileNum = fileNum.FileNum()
   250  	meta.Size = uint64(readable.Size())
   251  	meta.CreationTime = time.Now().Unix()
   252  	meta.InitPhysicalBacking()
   253  
   254  	// Avoid loading into the table cache for collecting stats if we
   255  	// don't need to. If there are no range deletions, we have all the
   256  	// information to compute the stats here.
   257  	//
   258  	// This is helpful in tests for avoiding awkwardness around deletion of
   259  	// ingested files from MemFS. MemFS implements the Windows semantics of
   260  	// disallowing removal of an open file. Under MemFS, if we don't populate
   261  	// meta.Stats here, the file will be loaded into the table cache for
   262  	// calculating stats before we can remove the original link.
   263  	maybeSetStatsFromProperties(meta.PhysicalMeta(), &r.Properties)
   264  
   265  	{
   266  		iter, err := r.NewIter(nil /* lower */, nil /* upper */)
   267  		if err != nil {
   268  			return nil, err
   269  		}
   270  		defer iter.Close()
   271  		var smallest InternalKey
   272  		if key, _ := iter.First(); key != nil {
   273  			if err := ingestValidateKey(opts, key); err != nil {
   274  				return nil, err
   275  			}
   276  			smallest = (*key).Clone()
   277  		}
   278  		if err := iter.Error(); err != nil {
   279  			return nil, err
   280  		}
   281  		if key, _ := iter.Last(); key != nil {
   282  			if err := ingestValidateKey(opts, key); err != nil {
   283  				return nil, err
   284  			}
   285  			meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallest, key.Clone())
   286  		}
   287  		if err := iter.Error(); err != nil {
   288  			return nil, err
   289  		}
   290  	}
   291  
   292  	iter, err := r.NewRawRangeDelIter()
   293  	if err != nil {
   294  		return nil, err
   295  	}
   296  	if iter != nil {
   297  		defer iter.Close()
   298  		var smallest InternalKey
   299  		if s := iter.First(); s != nil {
   300  			key := s.SmallestKey()
   301  			if err := ingestValidateKey(opts, &key); err != nil {
   302  				return nil, err
   303  			}
   304  			smallest = key.Clone()
   305  		}
   306  		if err := iter.Error(); err != nil {
   307  			return nil, err
   308  		}
   309  		if s := iter.Last(); s != nil {
   310  			k := s.SmallestKey()
   311  			if err := ingestValidateKey(opts, &k); err != nil {
   312  				return nil, err
   313  			}
   314  			largest := s.LargestKey().Clone()
   315  			meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallest, largest)
   316  		}
   317  	}
   318  
   319  	// Update the range-key bounds for the table.
   320  	{
   321  		iter, err := r.NewRawRangeKeyIter()
   322  		if err != nil {
   323  			return nil, err
   324  		}
   325  		if iter != nil {
   326  			defer iter.Close()
   327  			var smallest InternalKey
   328  			if s := iter.First(); s != nil {
   329  				key := s.SmallestKey()
   330  				if err := ingestValidateKey(opts, &key); err != nil {
   331  					return nil, err
   332  				}
   333  				smallest = key.Clone()
   334  			}
   335  			if err := iter.Error(); err != nil {
   336  				return nil, err
   337  			}
   338  			if s := iter.Last(); s != nil {
   339  				k := s.SmallestKey()
   340  				if err := ingestValidateKey(opts, &k); err != nil {
   341  					return nil, err
   342  				}
   343  				// As range keys are fragmented, the end key of the last range key in
   344  				// the table provides the upper bound for the table.
   345  				largest := s.LargestKey().Clone()
   346  				meta.ExtendRangeKeyBounds(opts.Comparer.Compare, smallest, largest)
   347  			}
   348  			if err := iter.Error(); err != nil {
   349  				return nil, err
   350  			}
   351  		}
   352  	}
   353  
   354  	if !meta.HasPointKeys && !meta.HasRangeKeys {
   355  		return nil, nil
   356  	}
   357  
   358  	// Sanity check that the various bounds on the file were set consistently.
   359  	if err := meta.Validate(opts.Comparer.Compare, opts.Comparer.FormatKey); err != nil {
   360  		return nil, err
   361  	}
   362  
   363  	return meta, nil
   364  }
   365  
   366  type ingestLoadResult struct {
   367  	localMeta, sharedMeta []*fileMetadata
   368  	externalMeta          []*fileMetadata
   369  	localPaths            []string
   370  	sharedLevels          []uint8
   371  	fileCount             int
   372  }
   373  
   374  func ingestLoad(
   375  	opts *Options,
   376  	fmv FormatMajorVersion,
   377  	paths []string,
   378  	shared []SharedSSTMeta,
   379  	external []ExternalFile,
   380  	cacheID uint64,
   381  	pending []base.DiskFileNum,
   382  	objProvider objstorage.Provider,
   383  	jobID int,
   384  ) (ingestLoadResult, error) {
   385  	meta := make([]*fileMetadata, 0, len(paths))
   386  	newPaths := make([]string, 0, len(paths))
   387  	for i := range paths {
   388  		f, err := opts.FS.Open(paths[i])
   389  		if err != nil {
   390  			return ingestLoadResult{}, err
   391  		}
   392  
   393  		readable, err := sstable.NewSimpleReadable(f)
   394  		if err != nil {
   395  			return ingestLoadResult{}, err
   396  		}
   397  		m, err := ingestLoad1(opts, fmv, readable, cacheID, pending[i])
   398  		if err != nil {
   399  			return ingestLoadResult{}, err
   400  		}
   401  		if m != nil {
   402  			meta = append(meta, m)
   403  			newPaths = append(newPaths, paths[i])
   404  		}
   405  	}
   406  	if len(shared) == 0 && len(external) == 0 {
   407  		return ingestLoadResult{localMeta: meta, localPaths: newPaths, fileCount: len(meta)}, nil
   408  	}
   409  
   410  	// Sort the shared files according to level.
   411  	sort.Sort(sharedByLevel(shared))
   412  
   413  	sharedMeta := make([]*fileMetadata, 0, len(shared))
   414  	levels := make([]uint8, 0, len(shared))
   415  	for i := range shared {
   416  		m, err := ingestSynthesizeShared(opts, shared[i], pending[len(paths)+i])
   417  		if err != nil {
   418  			return ingestLoadResult{}, err
   419  		}
   420  		if shared[i].Level < sharedLevelsStart {
   421  			return ingestLoadResult{}, errors.New("cannot ingest shared file in level below sharedLevelsStart")
   422  		}
   423  		sharedMeta = append(sharedMeta, m)
   424  		levels = append(levels, shared[i].Level)
   425  	}
   426  	externalMeta := make([]*fileMetadata, 0, len(external))
   427  	for i := range external {
   428  		m, err := ingestLoad1External(opts, external[i], pending[len(paths)+len(shared)+i], objProvider, jobID)
   429  		if err != nil {
   430  			return ingestLoadResult{}, err
   431  		}
   432  		externalMeta = append(externalMeta, m)
   433  	}
   434  	result := ingestLoadResult{
   435  		localMeta:    meta,
   436  		sharedMeta:   sharedMeta,
   437  		externalMeta: externalMeta,
   438  		localPaths:   newPaths,
   439  		sharedLevels: levels,
   440  		fileCount:    len(meta) + len(sharedMeta) + len(externalMeta),
   441  	}
   442  	return result, nil
   443  }
   444  
   445  // metaAndPaths sorts file metadata by smallest user key, while ensuring the
   446  // matching path also gets swapped to the same index. For use in
   447  // ingestSortAndVerify.
   448  type metaAndPaths struct {
   449  	meta  []*fileMetadata
   450  	paths []string
   451  	cmp   Compare
   452  }
   453  
   454  func (m metaAndPaths) Len() int {
   455  	return len(m.meta)
   456  }
   457  
   458  func (m metaAndPaths) Less(i, j int) bool {
   459  	return m.cmp(m.meta[i].Smallest.UserKey, m.meta[j].Smallest.UserKey) < 0
   460  }
   461  
   462  func (m metaAndPaths) Swap(i, j int) {
   463  	m.meta[i], m.meta[j] = m.meta[j], m.meta[i]
   464  	if m.paths != nil {
   465  		m.paths[i], m.paths[j] = m.paths[j], m.paths[i]
   466  	}
   467  }
   468  
   469  func ingestSortAndVerify(cmp Compare, lr ingestLoadResult, exciseSpan KeyRange) error {
   470  	// Verify that all the shared files (i.e. files in sharedMeta)
   471  	// fit within the exciseSpan.
   472  	for i := range lr.sharedMeta {
   473  		f := lr.sharedMeta[i]
   474  		if !exciseSpan.Contains(cmp, f.Smallest) || !exciseSpan.Contains(cmp, f.Largest) {
   475  			return errors.AssertionFailedf("pebble: shared file outside of excise span, span [%s-%s), file = %s", exciseSpan.Start, exciseSpan.End, f.String())
   476  		}
   477  	}
   478  	if len(lr.externalMeta) > 0 {
   479  		if len(lr.localMeta) > 0 || len(lr.sharedMeta) > 0 {
   480  			// Currently we only support external ingests on their own. If external
   481  			// files are present alongside local/shared files, return an error.
   482  			return errors.AssertionFailedf("pebble: external files cannot be ingested atomically alongside other types of files")
   483  		}
   484  		sort.Sort(&metaAndPaths{
   485  			meta: lr.externalMeta,
   486  			cmp:  cmp,
   487  		})
   488  		for i := 1; i < len(lr.externalMeta); i++ {
   489  			if sstableKeyCompare(cmp, lr.externalMeta[i-1].Largest, lr.externalMeta[i].Smallest) >= 0 {
   490  				return errors.AssertionFailedf("pebble: external sstables have overlapping ranges")
   491  			}
   492  		}
   493  		return nil
   494  	}
   495  	if len(lr.localMeta) <= 1 || len(lr.localPaths) <= 1 {
   496  		return nil
   497  	}
   498  
   499  	sort.Sort(&metaAndPaths{
   500  		meta:  lr.localMeta,
   501  		paths: lr.localPaths,
   502  		cmp:   cmp,
   503  	})
   504  
   505  	for i := 1; i < len(lr.localPaths); i++ {
   506  		if sstableKeyCompare(cmp, lr.localMeta[i-1].Largest, lr.localMeta[i].Smallest) >= 0 {
   507  			return errors.AssertionFailedf("pebble: local ingestion sstables have overlapping ranges")
   508  		}
   509  	}
   510  	if len(lr.sharedMeta) == 0 {
   511  		return nil
   512  	}
   513  	filesInLevel := make([]*fileMetadata, 0, len(lr.sharedMeta))
   514  	for l := sharedLevelsStart; l < numLevels; l++ {
   515  		filesInLevel = filesInLevel[:0]
   516  		for i := range lr.sharedMeta {
   517  			if lr.sharedLevels[i] == uint8(l) {
   518  				filesInLevel = append(filesInLevel, lr.sharedMeta[i])
   519  			}
   520  		}
   521  		slices.SortFunc(filesInLevel, func(a, b *fileMetadata) int {
   522  			return cmp(a.Smallest.UserKey, b.Smallest.UserKey)
   523  		})
   524  		for i := 1; i < len(filesInLevel); i++ {
   525  			if sstableKeyCompare(cmp, filesInLevel[i-1].Largest, filesInLevel[i].Smallest) >= 0 {
   526  				return errors.AssertionFailedf("pebble: external shared sstables have overlapping ranges")
   527  			}
   528  		}
   529  	}
   530  	return nil
   531  }
   532  
   533  func ingestCleanup(objProvider objstorage.Provider, meta []*fileMetadata) error {
   534  	var firstErr error
   535  	for i := range meta {
   536  		if err := objProvider.Remove(fileTypeTable, meta[i].FileBacking.DiskFileNum); err != nil {
   537  			firstErr = firstError(firstErr, err)
   538  		}
   539  	}
   540  	return firstErr
   541  }
   542  
   543  // ingestLink creates new objects which are backed by either hardlinks to or
   544  // copies of the ingested files. It also attaches shared objects to the provider.
   545  func ingestLink(
   546  	jobID int,
   547  	opts *Options,
   548  	objProvider objstorage.Provider,
   549  	lr ingestLoadResult,
   550  	shared []SharedSSTMeta,
   551  ) error {
   552  	for i := range lr.localPaths {
   553  		objMeta, err := objProvider.LinkOrCopyFromLocal(
   554  			context.TODO(), opts.FS, lr.localPaths[i], fileTypeTable, lr.localMeta[i].FileBacking.DiskFileNum,
   555  			objstorage.CreateOptions{PreferSharedStorage: true},
   556  		)
   557  		if err != nil {
   558  			if err2 := ingestCleanup(objProvider, lr.localMeta[:i]); err2 != nil {
   559  				opts.Logger.Errorf("ingest cleanup failed: %v", err2)
   560  			}
   561  			return err
   562  		}
   563  		if opts.EventListener.TableCreated != nil {
   564  			opts.EventListener.TableCreated(TableCreateInfo{
   565  				JobID:   jobID,
   566  				Reason:  "ingesting",
   567  				Path:    objProvider.Path(objMeta),
   568  				FileNum: lr.localMeta[i].FileNum,
   569  			})
   570  		}
   571  	}
   572  	sharedObjs := make([]objstorage.RemoteObjectToAttach, 0, len(shared))
   573  	for i := range shared {
   574  		backing, err := shared[i].Backing.Get()
   575  		if err != nil {
   576  			return err
   577  		}
   578  		sharedObjs = append(sharedObjs, objstorage.RemoteObjectToAttach{
   579  			FileNum:  lr.sharedMeta[i].FileBacking.DiskFileNum,
   580  			FileType: fileTypeTable,
   581  			Backing:  backing,
   582  		})
   583  	}
   584  	sharedObjMetas, err := objProvider.AttachRemoteObjects(sharedObjs)
   585  	if err != nil {
   586  		return err
   587  	}
   588  	for i := range sharedObjMetas {
   589  		// One corner case around file sizes we need to be mindful of is that
   590  		// if one of the sharedObjs was initially created by us (and has boomeranged
   591  		// back from another node), we'll need to update the FileBacking's size
   592  		// to be the true underlying size. Otherwise, we could hit errors when we
   593  		// open the db again after a crash/restart (see checkConsistency in open.go).
   594  		// It also allows us to more accurately prioritize compactions of files
   595  		// that were originally created by us.
   596  		if sharedObjMetas[i].IsShared() && !objProvider.IsSharedForeign(sharedObjMetas[i]) {
   597  			size, err := objProvider.Size(sharedObjMetas[i])
   598  			if err != nil {
   599  				return err
   600  			}
   601  			lr.sharedMeta[i].FileBacking.Size = uint64(size)
   602  		}
   603  		if opts.EventListener.TableCreated != nil {
   604  			opts.EventListener.TableCreated(TableCreateInfo{
   605  				JobID:   jobID,
   606  				Reason:  "ingesting",
   607  				Path:    objProvider.Path(sharedObjMetas[i]),
   608  				FileNum: lr.sharedMeta[i].FileNum,
   609  			})
   610  		}
   611  	}
   612  	// We do not need to do anything about lr.externalMeta. Those were already
   613  	// linked in ingestLoad.
   614  
   615  	return nil
   616  }
   617  
   618  func ingestMemtableOverlaps(cmp Compare, mem flushable, keyRanges []internalKeyRange) bool {
   619  	iter := mem.newIter(nil)
   620  	rangeDelIter := mem.newRangeDelIter(nil)
   621  	rkeyIter := mem.newRangeKeyIter(nil)
   622  
   623  	closeIters := func() error {
   624  		err := iter.Close()
   625  		if rangeDelIter != nil {
   626  			err = firstError(err, rangeDelIter.Close())
   627  		}
   628  		if rkeyIter != nil {
   629  			err = firstError(err, rkeyIter.Close())
   630  		}
   631  		return err
   632  	}
   633  
   634  	for _, kr := range keyRanges {
   635  		if overlapWithIterator(iter, &rangeDelIter, rkeyIter, kr, cmp) {
   636  			closeIters()
   637  			return true
   638  		}
   639  	}
   640  
   641  	// Assume overlap if any iterator errored out.
   642  	return closeIters() != nil
   643  }
   644  
   645  func ingestUpdateSeqNum(
   646  	cmp Compare, format base.FormatKey, seqNum uint64, loadResult ingestLoadResult,
   647  ) error {
   648  	setSeqFn := func(k base.InternalKey) base.InternalKey {
   649  		return base.MakeInternalKey(k.UserKey, seqNum, k.Kind())
   650  	}
   651  	updateMetadata := func(m *fileMetadata) error {
   652  		// NB: we set the fields directly here, rather than via their Extend*
   653  		// methods, as we are updating sequence numbers.
   654  		if m.HasPointKeys {
   655  			m.SmallestPointKey = setSeqFn(m.SmallestPointKey)
   656  		}
   657  		if m.HasRangeKeys {
   658  			m.SmallestRangeKey = setSeqFn(m.SmallestRangeKey)
   659  		}
   660  		m.Smallest = setSeqFn(m.Smallest)
   661  		// Only update the seqnum for the largest key if that key is not an
   662  		// "exclusive sentinel" (i.e. a range deletion sentinel or a range key
   663  		// boundary), as doing so effectively drops the exclusive sentinel (by
   664  		// lowering the seqnum from the max value), and extends the bounds of the
   665  		// table.
   666  		// NB: as the largest range key is always an exclusive sentinel, it is never
   667  		// updated.
   668  		if m.HasPointKeys && !m.LargestPointKey.IsExclusiveSentinel() {
   669  			m.LargestPointKey = setSeqFn(m.LargestPointKey)
   670  		}
   671  		if !m.Largest.IsExclusiveSentinel() {
   672  			m.Largest = setSeqFn(m.Largest)
   673  		}
   674  		// Setting smallestSeqNum == largestSeqNum triggers the setting of
   675  		// Properties.GlobalSeqNum when an sstable is loaded.
   676  		m.SmallestSeqNum = seqNum
   677  		m.LargestSeqNum = seqNum
   678  		// Ensure the new bounds are consistent.
   679  		if err := m.Validate(cmp, format); err != nil {
   680  			return err
   681  		}
   682  		seqNum++
   683  		return nil
   684  	}
   685  
   686  	// Shared sstables are required to be sorted by level ascending. We then
   687  	// iterate the shared sstables in reverse, assigning the lower sequence
   688  	// numbers to the shared sstables that will be ingested into the lower
   689  	// (larger numbered) levels first. This ensures sequence number shadowing is
   690  	// correct.
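        	// For example (hypothetical numbers): with shared files destined for
        	// L5 and L6 plus one local file, and seqNum = 100, the L6 file is
        	// assigned seqnum 100, the L5 file 101, and the local file 102, so
        	// entries ingested higher in the LSM shadow those below them.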
   691  	for i := len(loadResult.sharedMeta) - 1; i >= 0; i-- {
   692  		if i-1 >= 0 && loadResult.sharedLevels[i-1] > loadResult.sharedLevels[i] {
   693  			panic(errors.AssertionFailedf("shared files %s, %s out of order", loadResult.sharedMeta[i-1], loadResult.sharedMeta[i]))
   694  		}
   695  		if err := updateMetadata(loadResult.sharedMeta[i]); err != nil {
   696  			return err
   697  		}
   698  	}
   699  	for i := range loadResult.localMeta {
   700  		if err := updateMetadata(loadResult.localMeta[i]); err != nil {
   701  			return err
   702  		}
   703  	}
   704  	for i := range loadResult.externalMeta {
   705  		if err := updateMetadata(loadResult.externalMeta[i]); err != nil {
   706  			return err
   707  		}
   708  	}
   709  	return nil
   710  }
   711  
   712  // Denotes an internal key range. Smallest and largest are both inclusive.
   713  type internalKeyRange struct {
   714  	smallest, largest InternalKey
   715  }
   716  
   717  func overlapWithIterator(
   718  	iter internalIterator,
   719  	rangeDelIter *keyspan.FragmentIterator,
   720  	rkeyIter keyspan.FragmentIterator,
   721  	keyRange internalKeyRange,
   722  	cmp Compare,
   723  ) bool {
   724  	// Check overlap with point operations.
   725  	//
   726  	// When using levelIter, it seeks to the SST whose boundaries
   727  	// contain keyRange.smallest.UserKey(S).
   728  	// It then tries to find a point in that SST that is >= S.
   729  	// If there's no such point it means the SST ends in a tombstone in which case
   730  	// levelIter.SeekGE generates a boundary range del sentinel.
   731  	// The comparison of this boundary with keyRange.largest(L) below
   732  	// is subtle but maintains correctness.
   733  	// 1) boundary < L,
   734  	//    since boundary is also > S (initial seek),
   735  	//    whatever the boundary's start key may be, we're always overlapping.
   736  	// 2) boundary > L,
   737  	//    overlap with boundary cannot be determined since we don't know boundary's start key.
   738  	//    We require checking for overlap with rangeDelIter.
   739  	// 3) boundary == L (i.e. the user keys are equal) and L is not a sentinel,
   740  	//    the boundary, being an exclusive sentinel, sorts before L; similar to 1).
   741  	// 4) boundary == L and L is sentinel,
   742  	//    we'll always overlap since for any values of i,j ranges [i, k) and [j, k) always overlap.
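        	// For example (hypothetical bounds): with S = "c" and L = "f" (not a
        	// sentinel), a boundary sentinel at "d" falls under 1) and overlaps; a
        	// boundary sentinel at "g" falls under 2), so range deletions must be
        	// checked; and a boundary sentinel at "f" falls under 3) and overlaps.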
   743  	key, _ := iter.SeekGE(keyRange.smallest.UserKey, base.SeekGEFlagsNone)
   744  	if key != nil {
   745  		c := sstableKeyCompare(cmp, *key, keyRange.largest)
   746  		if c <= 0 {
   747  			return true
   748  		}
   749  	}
   750  	// Assume overlap if iterator errored.
   751  	if err := iter.Error(); err != nil {
   752  		return true
   753  	}
   754  
   755  	computeOverlapWithSpans := func(rIter keyspan.FragmentIterator) bool {
   756  		// NB: The spans surfaced by the fragment iterator are non-overlapping.
   757  		span := rIter.SeekLT(keyRange.smallest.UserKey)
   758  		if span == nil {
   759  			span = rIter.Next()
   760  		}
   761  		for ; span != nil; span = rIter.Next() {
   762  			if span.Empty() {
   763  				continue
   764  			}
   765  			key := span.SmallestKey()
   766  			c := sstableKeyCompare(cmp, key, keyRange.largest)
   767  			if c > 0 {
   768  				// The start of the span is after the largest key in the
   769  				// ingested table.
   770  				return false
   771  			}
   772  			if cmp(span.End, keyRange.smallest.UserKey) > 0 {
   773  				// The end of the span is greater than the smallest in the
   774  				// table. Note that the span end key is exclusive, thus ">0"
   775  				// instead of ">=0".
   776  				return true
   777  			}
   778  		}
   779  		// Assume overlap if iterator errored.
   780  		if err := rIter.Error(); err != nil {
   781  			return true
   782  		}
   783  		return false
   784  	}
   785  
   786  	// rkeyIter is either a range key level iter, or a range key iterator
   787  	// over a single file.
   788  	if rkeyIter != nil {
   789  		if computeOverlapWithSpans(rkeyIter) {
   790  			return true
   791  		}
   792  	}
   793  
   794  	// Check overlap with range deletions.
   795  	if rangeDelIter == nil || *rangeDelIter == nil {
   796  		return false
   797  	}
   798  	return computeOverlapWithSpans(*rangeDelIter)
   799  }
   800  
   801  // ingestTargetLevel returns the target level for a file being ingested.
   802  // If suggestSplit is true, it accounts for ingest-time splitting as part of
   803  // its target level calculation, and if a split candidate is found, that file
   804  // is returned as the splitFile.
   805  func ingestTargetLevel(
   806  	newIters tableNewIters,
   807  	newRangeKeyIter keyspan.TableNewSpanIter,
   808  	iterOps IterOptions,
   809  	comparer *Comparer,
   810  	v *version,
   811  	baseLevel int,
   812  	compactions map[*compaction]struct{},
   813  	meta *fileMetadata,
   814  	suggestSplit bool,
   815  ) (targetLevel int, splitFile *fileMetadata, err error) {
   816  	// Find the lowest level which does not have any files which overlap meta. We
   817  	// search from L0 to L6 looking for whether there are any files in the level
   818  	// which overlap meta. We want the "lowest" level (where lower means
   819  	// increasing level number) in order to reduce write amplification.
   820  	//
   821  	// There are 2 kinds of overlap we need to check for: file boundary overlap
   822  	// and data overlap. Data overlap implies file boundary overlap. Note that it
   823  	// is always possible to ingest into L0.
   824  	//
   825  	// To place meta at level i where i > 0:
   826  	// - there must not be any data overlap with levels <= i, since that will
   827  	//   violate the sequence number invariant.
   828  	// - no file boundary overlap with level i, since that will violate the
   829  	//   invariant that files do not overlap in levels i > 0.
   830  	//   - if there is only a file overlap at a given level, and no data overlap,
   831  	//     we can still slot a file at that level. We return the fileMetadata with
   832  	//     which we have file boundary overlap (must be only one file, as sstable
   833  	//     bounds are usually tight on user keys) and the caller is expected to split
   834  	//     that sstable into two virtual sstables, allowing this file to go into that
   835  	//     level. Note that if we have file boundary overlap with two files, which
   836  	//     should only happen on rare occasions, we treat it as data overlap and
   837  	//     don't use this optimization.
   838  	//
   839  	// The file boundary overlap check is simpler to conceptualize. Consider the
   840  	// following example, in which the ingested file lies completely before or
   841  	// after the file being considered.
   842  	//
   843  	//   |--|           |--|  ingested file: [a,b] or [f,g]
   844  	//         |-----|        existing file: [c,e]
   845  	//  _____________________
   846  	//   a  b  c  d  e  f  g
   847  	//
   848  	// In both cases the ingested file can move to considering the next level.
   849  	//
   850  	// File boundary overlap does not necessarily imply data overlap. The check
   851  	// for data overlap is a little more nuanced. Consider the following examples:
   852  	//
   853  	//  1. No data overlap:
   854  	//
   855  	//          |-|   |--|    ingested file: [cc-d] or [ee-ff]
   856  	//  |*--*--*----*------*| existing file: [a-g], points: [a, b, c, dd, g]
   857  	//  _____________________
   858  	//   a  b  c  d  e  f  g
   859  	//
   860  	// In this case the ingested files can "fall through" this level. The checks
   861  	// continue at the next level.
   862  	//
   863  	//  2. Data overlap:
   864  	//
   865  	//            |--|        ingested file: [d-e]
   866  	//  |*--*--*----*------*| existing file: [a-g], points: [a, b, c, dd, g]
   867  	//  _____________________
   868  	//   a  b  c  d  e  f  g
   869  	//
   870  	// In this case the file cannot be ingested into this level as the point 'dd'
   871  	// is in the way.
   872  	//
   873  	// It is worth noting that the check for data overlap is only approximate. In
   874  	// the previous example, the ingested table [d-e] could contain only the
   875  	// points 'd' and 'e', in which case the table would be eligible for
   876  	// considering lower levels. However, such a fine-grained check would need to
   877  	// be exhaustive (comparing points and ranges in both the ingested and existing
   878  	// tables) and such a check is prohibitively expensive. Thus Pebble treats any
   879  	// existing point that falls within the ingested table bounds as being "data
   880  	// overlap".
   881  
   882  	// This assertion implicitly checks that we have the current version of
   883  	// the metadata.
   884  	if v.L0Sublevels == nil {
   885  		return 0, nil, errors.AssertionFailedf("could not read L0 sublevels")
   886  	}
   887  	iterOps.CategoryAndQoS = sstable.CategoryAndQoS{
   888  		Category: "pebble-ingest",
   889  		QoSLevel: sstable.LatencySensitiveQoSLevel,
   890  	}
   891  	// Check for overlap over the keys of L0 by iterating over the sublevels.
   892  	for subLevel := 0; subLevel < len(v.L0SublevelFiles); subLevel++ {
   893  		iter := newLevelIter(context.Background(),
   894  			iterOps, comparer, newIters, v.L0Sublevels.Levels[subLevel].Iter(), manifest.Level(0), internalIterOpts{})
   895  
   896  		var rangeDelIter keyspan.FragmentIterator
   897  		// Pass in a non-nil pointer to rangeDelIter so that levelIter.findFileGE
   898  		// sets it up for the target file.
   899  		iter.initRangeDel(&rangeDelIter)
   900  
   901  		levelIter := keyspan.LevelIter{}
   902  		levelIter.Init(
   903  			keyspan.SpanIterOptions{}, comparer.Compare, newRangeKeyIter,
   904  			v.L0Sublevels.Levels[subLevel].Iter(), manifest.Level(0), manifest.KeyTypeRange,
   905  		)
   906  
   907  		kr := internalKeyRange{
   908  			smallest: meta.Smallest,
   909  			largest:  meta.Largest,
   910  		}
   911  		overlap := overlapWithIterator(iter, &rangeDelIter, &levelIter, kr, comparer.Compare)
   912  		err := iter.Close() // Closes range del iter as well.
   913  		err = firstError(err, levelIter.Close())
   914  		if err != nil {
   915  			return 0, nil, err
   916  		}
   917  		if overlap {
   918  			return targetLevel, nil, nil
   919  		}
   920  	}
   921  
   922  	level := baseLevel
   923  	for ; level < numLevels; level++ {
   924  		levelIter := newLevelIter(context.Background(),
   925  			iterOps, comparer, newIters, v.Levels[level].Iter(), manifest.Level(level), internalIterOpts{})
   926  		var rangeDelIter keyspan.FragmentIterator
   927  		// Pass in a non-nil pointer to rangeDelIter so that levelIter.findFileGE
   928  		// sets it up for the target file.
   929  		levelIter.initRangeDel(&rangeDelIter)
   930  
   931  		rkeyLevelIter := &keyspan.LevelIter{}
   932  		rkeyLevelIter.Init(
   933  			keyspan.SpanIterOptions{}, comparer.Compare, newRangeKeyIter,
   934  			v.Levels[level].Iter(), manifest.Level(level), manifest.KeyTypeRange,
   935  		)
   936  
   937  		kr := internalKeyRange{
   938  			smallest: meta.Smallest,
   939  			largest:  meta.Largest,
   940  		}
   941  		overlap := overlapWithIterator(levelIter, &rangeDelIter, rkeyLevelIter, kr, comparer.Compare)
   942  		err := levelIter.Close() // Closes range del iter as well.
   943  		err = firstError(err, rkeyLevelIter.Close())
   944  		if err != nil {
   945  			return 0, nil, err
   946  		}
   947  		if overlap {
   948  			return targetLevel, splitFile, nil
   949  		}
   950  
   951  		// Check boundary overlap.
   952  		var candidateSplitFile *fileMetadata
   953  		boundaryOverlaps := v.Overlaps(level, comparer.Compare, meta.Smallest.UserKey,
   954  			meta.Largest.UserKey, meta.Largest.IsExclusiveSentinel())
   955  		if !boundaryOverlaps.Empty() {
   956  			// We are already guaranteed to not have any data overlaps with files
   957  			// in boundaryOverlaps, otherwise we'd have returned in the above if
   958  			// statements. Use this, plus boundaryOverlaps.Len() == 1 to detect for
   959  			// the case where we can slot this file into the current level despite
   960  			// a boundary overlap, by splitting one existing file into two virtual
   961  			// sstables.
   962  			if suggestSplit && boundaryOverlaps.Len() == 1 {
   963  				iter := boundaryOverlaps.Iter()
   964  				candidateSplitFile = iter.First()
   965  			} else {
   966  				// We either don't want to suggest ingest-time splits (i.e.
   967  				// !suggestSplit), or we boundary-overlapped with more than one file.
   968  				continue
   969  			}
   970  		}
   971  
   972  		// Check boundary overlap with any ongoing compactions. We consider an
   973  		// overlapping compaction that's writing files to an output level as
   974  		// equivalent to boundary overlap with files in that output level.
   975  		//
   976  		// We cannot check for data overlap with the new SSTs compaction will produce
   977  		// since compaction hasn't been done yet. However, there's no need to check
   978  		// since all keys in them will be from levels in [c.startLevel,
   979  		// c.outputLevel], and all those levels have already had their data overlap
   980  		// tested negative (else we'd have returned earlier).
   981  		//
   982  		// An alternative approach would be to cancel these compactions and proceed
   983  		// with an ingest-time split on this level if necessary. However, compaction
   984  		// cancellation can result in significant wasted effort and is best avoided
   985  		// unless necessary.
   986  		overlaps := false
   987  		for c := range compactions {
   988  			if c.outputLevel == nil || level != c.outputLevel.level {
   989  				continue
   990  			}
   991  			if comparer.Compare(meta.Smallest.UserKey, c.largest.UserKey) <= 0 &&
   992  				comparer.Compare(meta.Largest.UserKey, c.smallest.UserKey) >= 0 {
   993  				overlaps = true
   994  				break
   995  			}
   996  		}
   997  		if !overlaps {
   998  			targetLevel = level
   999  			splitFile = candidateSplitFile
  1000  		}
  1001  	}
  1002  	return targetLevel, splitFile, nil
  1003  }
  1004  
  1005  // Ingest ingests a set of sstables into the DB. Ingestion of the files is
  1006  // atomic and semantically equivalent to creating a single batch containing all
  1007  // of the mutations in the sstables. Ingestion may require the memtable to be
  1008  // flushed. The ingested sstable files are moved into the DB and must reside on
  1009  // the same filesystem as the DB. Sstables can be created for ingestion using
  1010  // sstable.Writer. On success, Ingest removes the input paths.
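        //
        // For example, a minimal caller-side sketch (error handling elided;
        // assumes objstorageprovider.NewFileWritable, which wraps a vfs.File as
        // an objstorage.Writable, and a db opened with default options):
        //
        //	f, _ := vfs.Default.Create("/tmp/ex.sst")
        //	w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
        //	_ = w.Set([]byte("a"), []byte("v")) // point keys, added in sorted order, written at seqnum 0
        //	_ = w.Close()
        //	_ = db.Ingest([]string{"/tmp/ex.sst"})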
  1011  //
  1012  // Two types of sstables are accepted for ingestion: sstables present in the
  1013  // instance's vfs.FS, which can be referenced locally, and sstables present in
  1014  // remote.Storage, referred to as shared or foreign sstables. These shared
  1015  // sstables can be linked through an objstorage.Provider, and do not need to
  1016  // already be present on the local vfs.FS. Foreign sstables must all fit in an
  1017  // excise span, and are destined for a level specified in SharedSSTMeta.
  1018  //
  1019  // All sstables *must* be Sync()'d by the caller after all bytes are written
  1020  // and before their file handles are closed; failure to do so could violate
  1021  // durability or lead to corrupted on-disk state. This method cannot, in a
  1022  // platform-and-FS-agnostic way, ensure that all sstables in the input are
  1023  // properly synced to disk. Opening new file handles and Sync()-ing them
  1024  // does not always guarantee durability; see the discussion here on that:
  1025  // https://github.com/cockroachdb/pebble/pull/835#issuecomment-663075379
  1026  //
  1027  // Ingestion loads each sstable into the lowest level of the LSM which it
  1028  // doesn't overlap (see ingestTargetLevel). If an sstable overlaps a memtable,
  1029  // ingestion forces the memtable to flush, and then waits for the flush to
  1030  // occur. In some cases, such as with no foreign sstables and no excise span,
  1031  // ingestion that gets blocked on a memtable can join the flushable queue and
  1032  // finish even before the memtable has been flushed.
  1033  //
  1034  // The steps for ingestion are:
  1035  //
  1036  //  1. Allocate file numbers for every sstable being ingested.
  1037  //  2. Load the metadata for all sstables being ingested.
  1038  //  3. Sort the sstables by smallest key, verifying non overlap (for local
  1039  //     sstables).
  1040  //  4. Hard link (or copy) the local sstables into the DB directory.
  1041  //  5. Allocate a sequence number to use for all of the entries in the
  1042  //     local sstables. This is the step where overlap with memtables is
  1043  //     determined. If there is overlap, we remember the most recent memtable
  1044  //     that overlaps.
  1045  //  6. Update the sequence number in the ingested local sstables. (Remote
  1046  //     sstables get fixed sequence numbers that were determined at load time.)
  1047  //  7. Wait for the most recent memtable that overlaps to flush (if any).
  1048  //  8. Add the ingested sstables to the version (DB.ingestApply).
  1049  //     8.1.  If an excise span was specified, figure out what sstables in the
  1050  //     current version overlap with the excise span, and create new virtual
  1051  //     sstables out of those sstables that exclude the excised span (DB.excise).
  1052  //  9. Publish the ingestion sequence number.
  1053  //
  1054  // Note that if the mutable memtable overlaps with ingestion, a flush of the
  1055  // memtable is forced, equivalent to DB.Flush. Additionally, subsequent
  1056  // mutations that get sequence numbers larger than the ingestion sequence
  1057  // number get queued up behind the ingestion waiting for it to complete. This
  1058  // can produce a noticeable hiccup in performance. See
  1059  // https://github.com/cockroachdb/pebble/issues/25 for an idea for how to fix
  1060  // this hiccup.
  1061  func (d *DB) Ingest(paths []string) error {
  1062  	if err := d.closed.Load(); err != nil {
  1063  		panic(err)
  1064  	}
  1065  	if d.opts.ReadOnly {
  1066  		return ErrReadOnly
  1067  	}
  1068  	_, err := d.ingest(paths, ingestTargetLevel, nil /* shared */, KeyRange{}, nil /* external */)
  1069  	return err
  1070  }
  1071  
  1072  // IngestOperationStats provides some information about where in the LSM the
  1073  // bytes were ingested.
  1074  type IngestOperationStats struct {
  1075  	// Bytes is the total bytes in the ingested sstables.
  1076  	Bytes uint64
  1077  	// ApproxIngestedIntoL0Bytes is the approximate number of bytes ingested
  1078  	// into L0. This value is approximate when flushable ingests are active and
  1079  	// an ingest overlaps an entry in the flushable queue. Currently, this
  1080  	// approximation is very rough, only including tables that overlapped the
  1081  	// memtable. This estimate may be improved with #2112.
  1082  	ApproxIngestedIntoL0Bytes uint64
  1083  	// MemtableOverlappingFiles is the count of ingested sstables
  1084  	// that overlapped keys in the memtables.
  1085  	MemtableOverlappingFiles int
  1086  }
  1087  
  1088  // ExternalFile describes an external sstable that can be referenced through
  1089  // an objstorage.Provider and ingested as a remote file that will not be
  1090  // refcounted or cleaned up. For use with online restore. Note that the
  1091  // underlying sstable could contain keys outside the [Smallest,Largest)
  1092  // bounds; however Pebble is expected to only read the keys within those bounds.
  1093  type ExternalFile struct {
  1094  	// Locator is the shared.Locator that can be used with objProvider to
  1095  	// resolve a reference to this external sstable.
  1096  	Locator remote.Locator
  1097  	// ObjName is the unique name of this sstable on Locator.
  1098  	ObjName string
  1099  	// Size of the referenced portion of the virtualized sstable. An estimate
  1100  	// is acceptable in lieu of the backing file size.
  1101  	Size uint64
  1102  	// SmallestUserKey and LargestUserKey are the [smallest,largest) user key
  1103  	// bounds of the sstable. Both these bounds are loose i.e. it's possible for
  1104  	// the sstable to not span the entirety of this range. However, multiple
  1105  	// ExternalFiles in one ingestion must all have non-overlapping
  1106  	// [smallest, largest) spans. Note that this Largest bound is exclusive.
  1107  	SmallestUserKey, LargestUserKey []byte
  1108  	// HasPointKey and HasRangeKey denote whether this file contains point keys
  1109  	// or range keys. If both fields are false, an error is returned during
  1110  	// ingestion.
  1111  	HasPointKey, HasRangeKey bool
  1112  }
  1113  
  1114  // IngestWithStats does the same as Ingest, and additionally returns
  1115  // IngestOperationStats.
  1116  func (d *DB) IngestWithStats(paths []string) (IngestOperationStats, error) {
  1117  	if err := d.closed.Load(); err != nil {
  1118  		panic(err)
  1119  	}
  1120  	if d.opts.ReadOnly {
  1121  		return IngestOperationStats{}, ErrReadOnly
  1122  	}
  1123  	return d.ingest(paths, ingestTargetLevel, nil /* shared */, KeyRange{}, nil /* external */)
  1124  }
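        // For example, a sketch of reading the returned stats (error handling
        // elided; fmt is assumed to be imported by the caller):
        //
        //	stats, _ := db.IngestWithStats([]string{"/tmp/ex.sst"})
        //	fmt.Printf("ingested %d bytes, ~%d bytes into L0\n",
        //		stats.Bytes, stats.ApproxIngestedIntoL0Bytes)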
  1125  
  1126  // IngestExternalFiles does the same as IngestWithStats, and additionally
  1127  // accepts external files (with locator info that can be resolved using
  1128  // d.opts.Experimental.RemoteStorage). These files must also be non-overlapping with
  1129  // each other, and must be resolvable through d.objProvider.
  1130  func (d *DB) IngestExternalFiles(external []ExternalFile) (IngestOperationStats, error) {
  1131  	if err := d.closed.Load(); err != nil {
  1132  		panic(err)
  1133  	}
  1134  
  1135  	if d.opts.ReadOnly {
  1136  		return IngestOperationStats{}, ErrReadOnly
  1137  	}
  1138  	if d.opts.Experimental.RemoteStorage == nil {
  1139  		return IngestOperationStats{}, errors.New("pebble: cannot ingest external files without shared storage configured")
  1140  	}
  1141  	return d.ingest(nil, ingestTargetLevel, nil /* shared */, KeyRange{}, external)
  1142  }
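        // For example, a hypothetical caller-side sketch (the locator and object
        // name are placeholders that must resolve via the configured remote
        // storage):
        //
        //	ef := ExternalFile{
        //		Locator:         remote.Locator("bucket"),
        //		ObjName:         "000123.sst",
        //		Size:            1 << 20, // an estimate is acceptable
        //		SmallestUserKey: []byte("a"),
        //		LargestUserKey:  []byte("z"),
        //		HasPointKey:     true,
        //	}
        //	_, err := db.IngestExternalFiles([]ExternalFile{ef})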
  1143  
  1144  // IngestAndExcise does the same as IngestWithStats, and additionally accepts a
  1145  // list of shared files to ingest that can be read from a remote.Storage through
  1146  // a Provider. All the shared files must live within exciseSpan, and any existing
  1147  // keys in exciseSpan are deleted by turning existing sstables into virtual
  1148  // sstables (if not virtual already) and shrinking their spans to exclude
  1149  // exciseSpan. See the comment at Ingest for a more complete picture of the
  1150  // ingestion process.
  1151  //
  1152  // Panics if this DB instance was not instantiated with a remote.Storage and
  1153  // shared sstables are present.
  1154  func (d *DB) IngestAndExcise(
  1155  	paths []string, shared []SharedSSTMeta, exciseSpan KeyRange,
  1156  ) (IngestOperationStats, error) {
  1157  	if err := d.closed.Load(); err != nil {
  1158  		panic(err)
  1159  	}
  1160  	if d.opts.ReadOnly {
  1161  		return IngestOperationStats{}, ErrReadOnly
  1162  	}
  1163  	if invariants.Enabled && d.opts.Comparer.Split != nil {
  1164  		// Excise is only supported on prefix keys.
  1165  		if d.opts.Comparer.Split(exciseSpan.Start) != len(exciseSpan.Start) {
  1166  			panic("IngestAndExcise called with suffixed start key")
  1167  		}
  1168  		if d.opts.Comparer.Split(exciseSpan.End) != len(exciseSpan.End) {
  1169  			panic("IngestAndExcise called with suffixed end key")
  1170  		}
  1171  	}
  1172  	return d.ingest(paths, ingestTargetLevel, shared, exciseSpan, nil /* external */)
  1173  }
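        // For example, a hypothetical caller-side sketch (the shared metadata,
        // elided here, is typically obtained from another node, e.g. via
        // db.ScanInternal's shared-file visitor):
        //
        //	span := KeyRange{Start: []byte("a"), End: []byte("m")}
        //	_, err := db.IngestAndExcise(nil /* paths */, sharedMetas, span)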
  1174  
  1175  // Both DB.mu and commitPipeline.mu must be held while this is called.
  1176  func (d *DB) newIngestedFlushableEntry(
  1177  	meta []*fileMetadata, seqNum uint64, logNum base.DiskFileNum,
  1178  ) (*flushableEntry, error) {
  1179  	// Update the sequence number for all of the sstables in the
  1180  	// metadata. Writing the metadata to the manifest when the
  1181  	// version edit is applied is the mechanism that persists the
  1182  	// sequence number. The sstables themselves are left unmodified.
  1183  	// In this case, a version edit will only be written to the manifest
  1184  	// when the flushable is eventually flushed. If Pebble restarts in that
  1185  	// time, then we'll lose the ingest sequence number information. But this
  1186  	// information will also be reconstructed on node restart.
  1187  	if err := ingestUpdateSeqNum(
  1188  		d.cmp, d.opts.Comparer.FormatKey, seqNum, ingestLoadResult{localMeta: meta},
  1189  	); err != nil {
  1190  		return nil, err
  1191  	}
  1192  
  1193  	f := newIngestedFlushable(meta, d.opts.Comparer, d.newIters, d.tableNewRangeKeyIter)
  1194  
  1195  	// NB: logNum is the WAL number to which we're writing this entry, and
  1196  	// seqNum is the sequence number within that WAL at which we'll write this
  1197  	// entry.
  1198  	entry := d.newFlushableEntry(f, logNum, seqNum)
  1199  	// The flushable entry starts off with a single reader ref, so increment
  1200  	// the FileMetadata.Refs.
  1201  	for _, file := range f.files {
  1202  		file.Ref()
  1203  	}
  1204  	entry.unrefFiles = func() []*fileBacking {
  1205  		var obsolete []*fileBacking
  1206  		for _, file := range f.files {
  1207  			if file.Unref() == 0 {
  1208  				obsolete = append(obsolete, file.FileMetadata.FileBacking)
  1209  			}
  1210  		}
  1211  		return obsolete
  1212  	}
  1213  
  1214  	entry.flushForced = true
  1215  	entry.releaseMemAccounting = func() {}
  1216  	return entry, nil
  1217  }
  1218  
  1219  // Both DB.mu and commitPipeline.mu must be held while this is called. Since
  1220  // we're holding both locks, the order in which we rotate the memtable or
  1221  // recycle the WAL in this function is irrelevant as long as the correct log
  1222  // numbers are assigned to the appropriate flushable.
  1223  func (d *DB) handleIngestAsFlushable(meta []*fileMetadata, seqNum uint64) error {
  1224  	b := d.NewBatch()
  1225  	for _, m := range meta {
  1226  		b.ingestSST(m.FileNum)
  1227  	}
  1228  	b.setSeqNum(seqNum)
  1229  
  1230  	// If the WAL is disabled, then the logNum used to create the flushable
  1231  	// entry doesn't matter. We just use the logNum assigned to the current
  1232  	// mutable memtable. If the WAL is enabled, then this logNum will be
  1233  	// overwritten by the logNum of the log which will contain the log entry
  1234  	// for the ingestedFlushable.
  1235  	logNum := d.mu.mem.queue[len(d.mu.mem.queue)-1].logNum
  1236  	if !d.opts.DisableWAL {
  1237  		// We create a new WAL for the flushable instead of reusing the end of
  1238  		// the previous WAL. This simplifies the increment of the minimum
  1239  		// unflushed log number, and also simplifies WAL replay.
  1240  		logNum, _ = d.recycleWAL()
  1241  		d.mu.Unlock()
  1242  		err := d.commit.directWrite(b)
  1243  		if err != nil {
  1244  			d.opts.Logger.Fatalf("%v", err)
  1245  		}
  1246  		d.mu.Lock()
  1247  	}
  1248  
  1249  	entry, err := d.newIngestedFlushableEntry(meta, seqNum, logNum)
  1250  	if err != nil {
  1251  		return err
  1252  	}
  1253  	nextSeqNum := seqNum + uint64(b.Count())
  1254  
  1255  	// Set newLogNum to the logNum of the previous flushable. This value is
  1256  	// irrelevant if the WAL is disabled. If the WAL is enabled, then we set
  1257  	// the appropriate value below.
  1258  	newLogNum := d.mu.mem.queue[len(d.mu.mem.queue)-1].logNum
  1259  	if !d.opts.DisableWAL {
  1260  		// This is the WAL num of the next mutable memtable, which comes after the
  1261  		// ingestedFlushable in the flushable queue. The mutable memtable
  1262  		// will be created below.
  1263  		newLogNum, _ = d.recycleWAL()
  1267  	}
  1268  
  1269  	currMem := d.mu.mem.mutable
  1270  	// NB: Placing ingested sstables above the current memtables
  1271  	// requires rotating the existing memtables/WAL. There is some
  1272  	// concern about churning through tiny memtables due to ingested
  1273  	// sstables being placed on top of them, but those memtables
  1274  	// would have to be flushed anyway.
  1275  	d.mu.mem.queue = append(d.mu.mem.queue, entry)
  1276  	d.rotateMemtable(newLogNum, nextSeqNum, currMem)
  1277  	d.updateReadStateLocked(d.opts.DebugCheck)
  1278  	d.maybeScheduleFlush()
  1279  	return nil
  1280  }
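
// To make the WAL bookkeeping above concrete, a sketch of how the flushable
// queue and log numbers evolve with the WAL enabled (log numbers are
// illustrative only):
//
//	before: queue = [mutable memtable (log=7)]
//	        recycleWAL -> log 8; the ingestSST batch is written to log 8
//	after:  queue = [memtable (log=7), ingestedFlushable (log=8),
//	                 new mutable memtable (log=9)]
//
// On replay, log 8 contains only the ingestSST records, so the ingested
// sstables can be reconstructed as a flushable in the same queue position.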
  1281  
  1282  // See comment at Ingest() for details on how this works.
  1283  func (d *DB) ingest(
  1284  	paths []string,
  1285  	targetLevelFunc ingestTargetLevelFunc,
  1286  	shared []SharedSSTMeta,
  1287  	exciseSpan KeyRange,
  1288  	external []ExternalFile,
  1289  ) (IngestOperationStats, error) {
  1290  	if len(shared) > 0 && d.opts.Experimental.RemoteStorage == nil {
  1291  		panic("cannot ingest shared sstables with nil RemoteStorage")
  1292  	}
  1293  	if (exciseSpan.Valid() || len(shared) > 0 || len(external) > 0) && d.FormatMajorVersion() < FormatVirtualSSTables {
  1294  		return IngestOperationStats{}, errors.New("pebble: format major version too old for excise, shared or external sstable ingestion")
  1295  	}
  1296  	// Allocate file numbers for all of the files being ingested and mark them as
  1297  	// pending in order to prevent them from being deleted. Note that this causes
  1298  	// the file number ordering to be out of alignment with sequence number
  1299  	// ordering. The sorting of L0 tables by sequence number avoids relying on
  1300  	// that (busted) invariant.
  1301  	d.mu.Lock()
  1302  	pendingOutputs := make([]base.DiskFileNum, len(paths)+len(shared)+len(external))
  1303  	for i := 0; i < len(paths)+len(shared)+len(external); i++ {
  1304  		pendingOutputs[i] = d.mu.versions.getNextDiskFileNum()
  1305  	}
  1306  
  1307  	jobID := d.mu.nextJobID
  1308  	d.mu.nextJobID++
  1309  	d.mu.Unlock()
  1310  
  1311  	// Load the metadata for all the files being ingested. This step detects
  1312  	// and elides empty sstables.
  1313  	loadResult, err := ingestLoad(d.opts, d.FormatMajorVersion(), paths, shared, external, d.cacheID, pendingOutputs, d.objProvider, jobID)
  1314  	if err != nil {
  1315  		return IngestOperationStats{}, err
  1316  	}
  1317  
  1318  	if loadResult.fileCount == 0 {
  1319  		// All of the sstables to be ingested were empty. Nothing to do.
  1320  		return IngestOperationStats{}, nil
  1321  	}
  1322  
  1323  	// Verify the sstables do not overlap.
  1324  	if err := ingestSortAndVerify(d.cmp, loadResult, exciseSpan); err != nil {
  1325  		return IngestOperationStats{}, err
  1326  	}
  1327  
  1328  	// Hard link the sstables into the DB directory. Since the sstables aren't
  1329  	// referenced by a version, they won't be used. If the hard linking fails
  1330  	// (e.g. because the files reside on a different filesystem), ingestLink will
  1331  	// fall back to copying, and if that fails we undo our work and return an
  1332  	// error.
  1333  	if err := ingestLink(jobID, d.opts, d.objProvider, loadResult, shared); err != nil {
  1334  		return IngestOperationStats{}, err
  1335  	}
  1336  
  1337  	// Make the new tables durable. We need to do this at some point before we
  1338  	// update the MANIFEST (via logAndApply), otherwise a crash can have the
  1339  	// tables referenced in the MANIFEST, but not present in the provider.
  1340  	if err := d.objProvider.Sync(); err != nil {
  1341  		return IngestOperationStats{}, err
  1342  	}
  1343  
  1344  	// metaFlushableOverlaps is a slice parallel to the ingested sstables,
  1345  	// indicating which of them overlap some table in the flushable queue. It's
  1346  	// used to approximate ingest-into-L0 stats when using flushable ingests.
  1347  	metaFlushableOverlaps := make([]bool, loadResult.fileCount)
  1348  	var mem *flushableEntry
  1349  	var mut *memTable
  1350  	// asFlushable indicates whether the sstable was ingested as a flushable.
  1351  	var asFlushable bool
  1352  	iterOps := IterOptions{
  1353  		CategoryAndQoS: sstable.CategoryAndQoS{
  1354  			Category: "pebble-ingest",
  1355  			QoSLevel: sstable.LatencySensitiveQoSLevel,
  1356  		},
  1357  	}
  1358  	prepare := func(seqNum uint64) {
  1359  		// Note that d.commit.mu is held by commitPipeline when calling prepare.
  1360  
  1361  		d.mu.Lock()
  1362  		defer d.mu.Unlock()
  1363  
  1364  		// Check to see if any files overlap with any of the memtables. The queue
  1365  		// is ordered from oldest to newest with the mutable memtable being the
  1366  		// last element in the slice. We want to wait for the newest table that
  1367  		// overlaps.
  1368  
  1369  		for i := len(d.mu.mem.queue) - 1; i >= 0; i-- {
  1370  			m := d.mu.mem.queue[i]
  1371  			iter := m.newIter(&iterOps)
  1372  			rangeDelIter := m.newRangeDelIter(&iterOps)
  1373  			rkeyIter := m.newRangeKeyIter(&iterOps)
  1374  
  1375  			checkForOverlap := func(i int, meta *fileMetadata) {
  1376  				if metaFlushableOverlaps[i] {
  1377  					// This table already overlapped a more recent flushable.
  1378  					return
  1379  				}
  1380  				kr := internalKeyRange{
  1381  					smallest: meta.Smallest,
  1382  					largest:  meta.Largest,
  1383  				}
  1384  				if overlapWithIterator(iter, &rangeDelIter, rkeyIter, kr, d.cmp) {
  1385  					// If this is the first table to overlap a flushable, save
  1386  					// the flushable. This ingest must be ingested or flushed
  1387  					// after it.
  1388  					if mem == nil {
  1389  						mem = m
  1390  					}
  1391  					metaFlushableOverlaps[i] = true
  1392  				}
  1393  			}
  1394  			for i := range loadResult.localMeta {
  1395  				checkForOverlap(i, loadResult.localMeta[i])
  1396  			}
  1397  			for i := range loadResult.sharedMeta {
  1398  				checkForOverlap(len(loadResult.localMeta)+i, loadResult.sharedMeta[i])
  1399  			}
  1400  			for i := range loadResult.externalMeta {
  1401  				checkForOverlap(len(loadResult.localMeta)+len(loadResult.sharedMeta)+i, loadResult.externalMeta[i])
  1402  			}
  1403  			if exciseSpan.Valid() {
  1404  				kr := internalKeyRange{
  1405  					smallest: base.MakeInternalKey(exciseSpan.Start, InternalKeySeqNumMax, InternalKeyKindMax),
  1406  					largest:  base.MakeExclusiveSentinelKey(InternalKeyKindRangeDelete, exciseSpan.End),
  1407  				}
  1408  				if overlapWithIterator(iter, &rangeDelIter, rkeyIter, kr, d.cmp) {
  1409  					if mem == nil {
  1410  						mem = m
  1411  					}
  1412  				}
  1413  			}
  1414  			err := iter.Close()
  1415  			if rangeDelIter != nil {
  1416  				err = firstError(err, rangeDelIter.Close())
  1417  			}
  1418  			if rkeyIter != nil {
  1419  				err = firstError(err, rkeyIter.Close())
  1420  			}
  1421  			if err != nil {
  1422  				d.opts.Logger.Errorf("ingest error reading flushable for log %s: %s", m.logNum, err)
  1423  			}
  1424  		}
  1425  
  1426  		if mem == nil {
  1427  			// No overlap with any of the queued flushables, so no need to queue
  1428  			// after them.
  1429  
  1430  			// New writes with higher sequence numbers may be concurrently
  1431  			// committed. We must ensure they don't flush before this ingest
  1432  			// completes. To do that, we ref the mutable memtable as a writer,
  1433  			// preventing its flushing (and the flushing of all subsequent
  1434  			// flushables in the queue). Once we've acquired the manifest lock
  1435  			// to add the ingested sstables to the LSM, we can unref as we're
  1436  			// guaranteed that the flush won't edit the LSM before this ingest.
  1437  			mut = d.mu.mem.mutable
  1438  			mut.writerRef()
  1439  			return
  1440  		}
  1441  		// The ingestion overlaps with some entry in the flushable queue.
  1442  		if d.FormatMajorVersion() < FormatFlushableIngest ||
  1443  			d.opts.Experimental.DisableIngestAsFlushable() ||
  1444  			len(shared) > 0 || exciseSpan.Valid() || len(external) > 0 ||
  1445  			(len(d.mu.mem.queue) > d.opts.MemTableStopWritesThreshold-1) {
  1446  			// We're not able to ingest as a flushable,
  1447  			// so we must synchronously flush.
  1448  			//
  1449  			// TODO(bilal): Currently, if any of the files being ingested are shared or
  1450  			// there's an excise span present, we cannot use flushable ingests and need
  1451  			// to wait synchronously. Either remove this caveat by fleshing out
  1452  			// flushable ingest logic to also account for these cases, or remove this
  1453  			// comment. Tracking issue: https://github.com/cockroachdb/pebble/issues/2676
  1454  			if mem.flushable == d.mu.mem.mutable {
  1455  				err = d.makeRoomForWrite(nil)
  1456  			}
  1457  			// New writes with higher sequence numbers may be concurrently
  1458  			// committed. We must ensure they don't flush before this ingest
  1459  			// completes. To do that, we ref the mutable memtable as a writer,
  1460  			// preventing its flushing (and the flushing of all subsequent
  1461  			// flushables in the queue). Once we've acquired the manifest lock
  1462  			// to add the ingested sstables to the LSM, we can unref as we're
  1463  			// guaranteed that the flush won't edit the LSM before this ingest.
  1464  			mut = d.mu.mem.mutable
  1465  			mut.writerRef()
  1466  			mem.flushForced = true
  1467  			d.maybeScheduleFlush()
  1468  			return
  1469  		}
  1470  		// Since there aren't too many memtables already queued up, we can
  1471  		// slide the ingested sstables on top of the existing memtables.
  1472  		asFlushable = true
  1473  		err = d.handleIngestAsFlushable(loadResult.localMeta, seqNum)
  1474  	}
  1475  
  1476  	var ve *versionEdit
  1477  	apply := func(seqNum uint64) {
  1478  		if err != nil || asFlushable {
  1479  			// Either an error occurred during prepare, or we ingested as a flushable.
  1480  			if mut != nil {
  1481  				if mut.writerUnref() {
  1482  					d.mu.Lock()
  1483  					d.maybeScheduleFlush()
  1484  					d.mu.Unlock()
  1485  				}
  1486  			}
  1487  			return
  1488  		}
  1489  
  1490  		// Update the sequence numbers for all ingested sstables'
  1491  		// metadata. When the version edit is applied, the metadata is
  1492  		// written to the manifest, persisting the sequence number.
  1493  		// The sstables themselves are left unmodified.
  1494  		if err = ingestUpdateSeqNum(
  1495  			d.cmp, d.opts.Comparer.FormatKey, seqNum, loadResult,
  1496  		); err != nil {
  1497  			if mut != nil {
  1498  				if mut.writerUnref() {
  1499  					d.mu.Lock()
  1500  					d.maybeScheduleFlush()
  1501  					d.mu.Unlock()
  1502  				}
  1503  			}
  1504  			return
  1505  		}
  1506  
  1507  		// If we overlapped with a memtable in prepare, wait for the flush to
  1508  		// finish.
  1509  		if mem != nil {
  1510  			<-mem.flushed
  1511  		}
  1512  
  1513  		// Assign the sstables to the correct level in the LSM and apply the
  1514  		// version edit.
  1515  		ve, err = d.ingestApply(jobID, loadResult, targetLevelFunc, mut, exciseSpan)
  1516  	}
  1517  
  1518  	// Only one ingest can occur at a time; otherwise, one would block waiting
  1519  	// for the other to finish applying. This blocking would happen while
  1520  	// holding the commit mutex, which would prevent unrelated batches from
  1521  	// writing their changes to the WAL and memtable, causing a bigger commit
  1522  	// hiccup during ingestion.
  1523  	d.commit.ingestSem <- struct{}{}
  1524  	d.commit.AllocateSeqNum(loadResult.fileCount, prepare, apply)
  1525  	<-d.commit.ingestSem
  1526  
  1527  	if err != nil {
  1528  		if err2 := ingestCleanup(d.objProvider, loadResult.localMeta); err2 != nil {
  1529  			d.opts.Logger.Errorf("ingest cleanup failed: %v", err2)
  1530  		}
  1531  	} else {
  1532  		// Since we either created a hard link to the ingested files, or copied
  1533  		// them over, it is safe to remove the original paths.
  1534  		for _, path := range loadResult.localPaths {
  1535  			if err2 := d.opts.FS.Remove(path); err2 != nil {
  1536  				d.opts.Logger.Errorf("ingest failed to remove original file: %s", err2)
  1537  			}
  1538  		}
  1539  	}
  1540  
  1541  	info := TableIngestInfo{
  1542  		JobID:     jobID,
  1543  		Err:       err,
  1544  		flushable: asFlushable,
  1545  	}
  1546  	if len(loadResult.localMeta) > 0 {
  1547  		info.GlobalSeqNum = loadResult.localMeta[0].SmallestSeqNum
  1548  	} else if len(loadResult.sharedMeta) > 0 {
  1549  		info.GlobalSeqNum = loadResult.sharedMeta[0].SmallestSeqNum
  1550  	} else {
  1551  		info.GlobalSeqNum = loadResult.externalMeta[0].SmallestSeqNum
  1552  	}
  1553  	var stats IngestOperationStats
  1554  	if ve != nil {
  1555  		info.Tables = make([]struct {
  1556  			TableInfo
  1557  			Level int
  1558  		}, len(ve.NewFiles))
  1559  		for i := range ve.NewFiles {
  1560  			e := &ve.NewFiles[i]
  1561  			info.Tables[i].Level = e.Level
  1562  			info.Tables[i].TableInfo = e.Meta.TableInfo()
  1563  			stats.Bytes += e.Meta.Size
  1564  			if e.Level == 0 {
  1565  				stats.ApproxIngestedIntoL0Bytes += e.Meta.Size
  1566  			}
  1567  			if i < len(metaFlushableOverlaps) && metaFlushableOverlaps[i] {
  1568  				stats.MemtableOverlappingFiles++
  1569  			}
  1570  		}
  1571  	} else if asFlushable {
  1572  		// NB: If asFlushable == true, there are no shared sstables.
  1573  		info.Tables = make([]struct {
  1574  			TableInfo
  1575  			Level int
  1576  		}, len(loadResult.localMeta))
  1577  		for i, f := range loadResult.localMeta {
  1578  			info.Tables[i].Level = -1
  1579  			info.Tables[i].TableInfo = f.TableInfo()
  1580  			stats.Bytes += f.Size
  1581  			// We don't have exact stats on which files will be ingested into
  1582  			// L0, because actual ingestion into the LSM has been deferred until
  1583  			// flush time. Instead, we infer based on memtable overlap.
  1584  			//
  1585  			// TODO(jackson): If we optimistically compute data overlap (#2112)
  1586  			// before entering the commit pipeline, we can use that overlap to
  1587  			// improve our approximation by incorporating overlap with L0, not
  1588  			// just memtables.
  1589  			if metaFlushableOverlaps[i] {
  1590  				stats.ApproxIngestedIntoL0Bytes += f.Size
  1591  				stats.MemtableOverlappingFiles++
  1592  			}
  1593  		}
  1594  	}
  1595  	d.opts.EventListener.TableIngested(info)
  1596  
  1597  	return stats, err
  1598  }
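
// For reference, a minimal sketch of the public API path that funnels into
// d.ingest, from a client's perspective. Error handling is elided, and the
// use of vfs.Default and objstorageprovider.NewFileWritable to produce the
// sstable is an assumption about the simplest setup, not the only one:
//
//	f, _ := vfs.Default.Create("ext.sst")
//	w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{})
//	_ = w.Set([]byte("a"), []byte("v"))
//	_ = w.Close()
//	// db is an open *pebble.DB. Ingest hard-links (or copies) ext.sst into
//	// the store, assigns it a global sequence number, and picks its level.
//	_ = db.Ingest([]string{"ext.sst"})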
  1599  
  1600  // excise updates ve to include a replacement of the file m with new virtual
  1601  // sstables that exclude exciseSpan, returning a slice of newly-created files if
  1602  // any. If the entirety of m is deleted by exciseSpan, no new sstables are added
  1603  // and m is deleted. Note that ve is updated in-place.
  1604  //
  1605  // The manifest lock must be held when calling this method.
  1606  func (d *DB) excise(
  1607  	exciseSpan KeyRange, m *fileMetadata, ve *versionEdit, level int,
  1608  ) ([]manifest.NewFileEntry, error) {
  1609  	numCreatedFiles := 0
  1610  	// Check if there's actually an overlap between m and exciseSpan.
  1611  	if !exciseSpan.Overlaps(d.cmp, m) {
  1612  		return nil, nil
  1613  	}
  1614  	ve.DeletedFiles[deletedFileEntry{
  1615  		Level:   level,
  1616  		FileNum: m.FileNum,
  1617  	}] = m
  1618  	// Fast path: m sits entirely within the exciseSpan, so just delete it.
  1619  	if exciseSpan.Contains(d.cmp, m.Smallest) && exciseSpan.Contains(d.cmp, m.Largest) {
  1620  		return nil, nil
  1621  	}
  1622  	var iter internalIterator
  1623  	var rangeDelIter keyspan.FragmentIterator
  1624  	var rangeKeyIter keyspan.FragmentIterator
  1625  	needsBacking := false
  1626  	// Create a file to the left of the excise span, if necessary.
  1627  	// The bounds of this file will be [m.Smallest, lastKeyBefore(exciseSpan.Start)].
  1628  	//
  1629  	// We create bounds that are tight on user keys, and we make the effort to find
  1630  	// the last key in the original sstable that's smaller than exciseSpan.Start
  1631  	// even though it requires some sstable reads. We could choose to create
  1632  	// virtual sstables on loose userKey bounds, in which case we could just set
  1633  	// leftFile.Largest to an exclusive sentinel at exciseSpan.Start. The biggest
  1634  	// issue with that approach would be that it'd lead to lots of small virtual
  1635  	// sstables in the LSM that have no guarantee on containing even a single user
  1636  	// key within the file bounds. This has the potential to increase both read and
  1637  	// write-amp as we will be opening up these sstables only to find no relevant
  1638  	// keys in the read path, and compacting sstables on top of them instead of
  1639  	// directly into the space occupied by them. We choose to incur the cost of
  1640  	// calculating tight bounds at this time instead of creating more work in the
  1641  	// future.
  1642  	//
  1643  	// TODO(bilal): Some of this work can happen without grabbing the manifest
  1644  	// lock; we could grab one currentVersion, release the lock, calculate excised
  1645  	// files, then grab the lock again and recalculate for just the files that
  1646  	// have changed since our previous calculation. Do this optimization as part of
  1647  	// https://github.com/cockroachdb/pebble/issues/2112 .
  1648  	if d.cmp(m.Smallest.UserKey, exciseSpan.Start) < 0 {
  1649  		leftFile := &fileMetadata{
  1650  			Virtual:     true,
  1651  			FileBacking: m.FileBacking,
  1652  			FileNum:     d.mu.versions.getNextFileNum(),
  1653  			// Note that these are loose bounds for smallest/largest seqnums, but they're
  1654  			// sufficient for maintaining correctness.
  1655  			SmallestSeqNum: m.SmallestSeqNum,
  1656  			LargestSeqNum:  m.LargestSeqNum,
  1657  		}
  1658  		if m.HasPointKeys && !exciseSpan.Contains(d.cmp, m.SmallestPointKey) {
  1659  			// This file will contain point keys.
  1660  			smallestPointKey := m.SmallestPointKey
  1661  			var err error
  1662  			iter, rangeDelIter, err = d.newIters(context.TODO(), m, &IterOptions{
  1663  				CategoryAndQoS: sstable.CategoryAndQoS{
  1664  					Category: "pebble-ingest",
  1665  					QoSLevel: sstable.LatencySensitiveQoSLevel,
  1666  				},
  1667  				level: manifest.Level(level),
  1668  			}, internalIterOpts{})
  1669  			if err != nil {
  1670  				return nil, err
  1671  			}
  1672  			var key *InternalKey
  1673  			if iter != nil {
  1674  				defer iter.Close()
  1675  				key, _ = iter.SeekLT(exciseSpan.Start, base.SeekLTFlagsNone)
  1676  			} else {
  1677  				iter = emptyIter
  1678  			}
  1679  			if key != nil {
  1680  				leftFile.ExtendPointKeyBounds(d.cmp, smallestPointKey, key.Clone())
  1681  			}
  1682  			// Store the min of (exciseSpan.Start, rdel.End) in lastRangeDel. This
  1683  			// needs to be a copy if the key is owned by the range del iter.
  1684  			var lastRangeDel []byte
  1685  			if rangeDelIter != nil {
  1686  				defer rangeDelIter.Close()
  1687  				rdel := rangeDelIter.SeekLT(exciseSpan.Start)
  1688  				if rdel != nil {
  1689  					lastRangeDel = append(lastRangeDel[:0], rdel.End...)
  1690  					if d.cmp(lastRangeDel, exciseSpan.Start) > 0 {
  1691  						lastRangeDel = exciseSpan.Start
  1692  					}
  1693  				}
  1694  			} else {
  1695  				rangeDelIter = emptyKeyspanIter
  1696  			}
  1697  			if lastRangeDel != nil {
  1698  				leftFile.ExtendPointKeyBounds(d.cmp, smallestPointKey, base.MakeExclusiveSentinelKey(InternalKeyKindRangeDelete, lastRangeDel))
  1699  			}
  1700  		}
  1701  		if m.HasRangeKeys && !exciseSpan.Contains(d.cmp, m.SmallestRangeKey) {
  1702  			// This file will contain range keys.
  1703  			var err error
  1704  			smallestRangeKey := m.SmallestRangeKey
  1705  			rangeKeyIter, err = d.tableNewRangeKeyIter(m, keyspan.SpanIterOptions{})
  1706  			if err != nil {
  1707  				return nil, err
  1708  			}
  1709  			// Store the min of (exciseSpan.Start, rkey.End) in lastRangeKey. This
  1710  			// needs to be a copy if the key is owned by the range key iter.
  1711  			var lastRangeKey []byte
  1712  			var lastRangeKeyKind InternalKeyKind
  1713  			defer rangeKeyIter.Close()
  1714  			rkey := rangeKeyIter.SeekLT(exciseSpan.Start)
  1715  			if rkey != nil {
  1716  				lastRangeKey = append(lastRangeKey[:0], rkey.End...)
  1717  				if d.cmp(lastRangeKey, exciseSpan.Start) > 0 {
  1718  					lastRangeKey = exciseSpan.Start
  1719  				}
  1720  				lastRangeKeyKind = rkey.Keys[0].Kind()
  1721  			}
  1722  			if lastRangeKey != nil {
  1723  				leftFile.ExtendRangeKeyBounds(d.cmp, smallestRangeKey, base.MakeExclusiveSentinelKey(lastRangeKeyKind, lastRangeKey))
  1724  			}
  1725  		}
  1726  		if leftFile.HasRangeKeys || leftFile.HasPointKeys {
  1727  			var err error
  1728  			leftFile.Size, err = d.tableCache.estimateSize(m, leftFile.Smallest.UserKey, leftFile.Largest.UserKey)
  1729  			if err != nil {
  1730  				return nil, err
  1731  			}
  1732  			if leftFile.Size == 0 {
  1733  				// On occasion, estimateSize gives us a low estimate, i.e. a 0 file size,
  1734  				// such as if the excised file only has range keys/dels and no point
  1735  				// keys. This can cause panics in places where we divide by file sizes.
  1736  				// Correct for it here.
  1737  				leftFile.Size = 1
  1738  			}
  1739  			if err := leftFile.Validate(d.cmp, d.opts.Comparer.FormatKey); err != nil {
  1740  				return nil, err
  1741  			}
  1742  			leftFile.ValidateVirtual(m)
  1743  			ve.NewFiles = append(ve.NewFiles, newFileEntry{Level: level, Meta: leftFile})
  1744  			needsBacking = true
  1745  			numCreatedFiles++
  1746  		}
  1747  	}
  1748  	// Create a file to the right, if necessary.
  1749  	if exciseSpan.Contains(d.cmp, m.Largest) {
  1750  		// No key exists to the right of the excise span in this file.
  1751  		if needsBacking && !m.Virtual {
  1752  			// If m is virtual, then its file backing is already known to the manifest.
  1753  			// We don't need to create another file backing. Note that there must be
  1754  			// only one CreatedBackingTables entry per backing sstable. This is
  1755  			// indicated by the VersionEdit.CreatedBackingTables invariant.
  1756  			ve.CreatedBackingTables = append(ve.CreatedBackingTables, m.FileBacking)
  1757  		}
  1758  		return ve.NewFiles[len(ve.NewFiles)-numCreatedFiles:], nil
  1759  	}
  1760  	// Create a new file, rightFile, between [firstKeyAfter(exciseSpan.End), m.Largest].
  1761  	//
  1762  	// See comment before the definition of leftFile for the motivation behind
  1763  	// calculating tight user-key bounds.
  1764  	rightFile := &fileMetadata{
  1765  		Virtual:     true,
  1766  		FileBacking: m.FileBacking,
  1767  		FileNum:     d.mu.versions.getNextFileNum(),
  1768  		// Note that these are loose bounds for smallest/largest seqnums, but they're
  1769  		// sufficient for maintaining correctness.
  1770  		SmallestSeqNum: m.SmallestSeqNum,
  1771  		LargestSeqNum:  m.LargestSeqNum,
  1772  	}
  1773  	if m.HasPointKeys && !exciseSpan.Contains(d.cmp, m.LargestPointKey) {
  1774  		// This file will contain point keys.
  1775  		largestPointKey := m.LargestPointKey
  1776  		var err error
  1777  		if iter == nil && rangeDelIter == nil {
  1778  			iter, rangeDelIter, err = d.newIters(context.TODO(), m, &IterOptions{
  1779  				CategoryAndQoS: sstable.CategoryAndQoS{
  1780  					Category: "pebble-ingest",
  1781  					QoSLevel: sstable.LatencySensitiveQoSLevel,
  1782  				},
  1783  				level: manifest.Level(level),
  1784  			}, internalIterOpts{})
  1785  			if err != nil {
  1786  				return nil, err
  1787  			}
  1788  			if iter != nil {
  1789  				defer iter.Close()
  1790  			} else {
  1791  				iter = emptyIter
  1792  			}
  1793  			if rangeDelIter != nil {
  1794  				defer rangeDelIter.Close()
  1795  			} else {
  1796  				rangeDelIter = emptyKeyspanIter
  1797  			}
  1798  		}
  1799  		key, _ := iter.SeekGE(exciseSpan.End, base.SeekGEFlagsNone)
  1800  		if key != nil {
  1801  			rightFile.ExtendPointKeyBounds(d.cmp, key.Clone(), largestPointKey)
  1802  		}
  1803  		// Store the max of (exciseSpan.End, rdel.Start) in firstRangeDel. This
  1804  		// needs to be a copy if the key is owned by the range del iter.
  1805  		var firstRangeDel []byte
  1806  		rdel := rangeDelIter.SeekGE(exciseSpan.End)
  1807  		if rdel != nil {
  1808  			firstRangeDel = append(firstRangeDel[:0], rdel.Start...)
  1809  			if d.cmp(firstRangeDel, exciseSpan.End) < 0 {
  1810  				firstRangeDel = exciseSpan.End
  1811  			}
  1812  		}
  1813  		if firstRangeDel != nil {
  1814  			smallestPointKey := rdel.SmallestKey()
  1815  			smallestPointKey.UserKey = firstRangeDel
  1816  			rightFile.ExtendPointKeyBounds(d.cmp, smallestPointKey, largestPointKey)
  1817  		}
  1818  	}
  1819  	if m.HasRangeKeys && !exciseSpan.Contains(d.cmp, m.LargestRangeKey) {
  1820  		// This file will contain range keys.
  1821  		largestRangeKey := m.LargestRangeKey
  1822  		if rangeKeyIter == nil {
  1823  			var err error
  1824  			rangeKeyIter, err = d.tableNewRangeKeyIter(m, keyspan.SpanIterOptions{})
  1825  			if err != nil {
  1826  				return nil, err
  1827  			}
  1828  			defer rangeKeyIter.Close()
  1829  		}
  1830  		// Store the max of (exciseSpan.End, rkey.Start) in firstRangeKey. This
  1831  		// needs to be a copy if the key is owned by the range key iter.
  1832  		var firstRangeKey []byte
  1833  		rkey := rangeKeyIter.SeekGE(exciseSpan.End)
  1834  		if rkey != nil {
  1835  			firstRangeKey = append(firstRangeKey[:0], rkey.Start...)
  1836  			if d.cmp(firstRangeKey, exciseSpan.End) < 0 {
  1837  				firstRangeKey = exciseSpan.End
  1838  			}
  1839  		}
  1840  		if firstRangeKey != nil {
  1841  			smallestRangeKey := rkey.SmallestKey()
  1842  			smallestRangeKey.UserKey = firstRangeKey
  1843  			// We call ExtendRangeKeyBounds so any internal boundType fields are
  1844  			// set correctly. Note that this is mildly wasteful as we'll be comparing
  1845  			// rightFile.{Smallest,Largest}RangeKey with themselves, which can be
  1846  			// avoided if we exported ExtendOverallKeyBounds or so.
  1847  			rightFile.ExtendRangeKeyBounds(d.cmp, smallestRangeKey, largestRangeKey)
  1848  		}
  1849  	}
  1850  	if rightFile.HasRangeKeys || rightFile.HasPointKeys {
  1851  		var err error
  1852  		rightFile.Size, err = d.tableCache.estimateSize(m, rightFile.Smallest.UserKey, rightFile.Largest.UserKey)
  1853  		if err != nil {
  1854  			return nil, err
  1855  		}
  1856  		if rightFile.Size == 0 {
  1857  			// On occasion, estimateSize gives us a low estimate, i.e. a 0 file size,
  1858  			// such as if the excised file only has range keys/dels and no point keys.
  1859  			// This can cause panics in places where we divide by file sizes. Correct
  1860  			// for it here.
  1861  			rightFile.Size = 1
  1862  		}
  1863  		rightFile.ValidateVirtual(m)
  1864  		ve.NewFiles = append(ve.NewFiles, newFileEntry{Level: level, Meta: rightFile})
  1865  		needsBacking = true
  1866  		numCreatedFiles++
  1867  	}
  1868  
  1869  	if needsBacking && !m.Virtual {
  1870  		// If m is virtual, then its file backing is already known to the manifest.
  1871  		// We don't need to create another file backing. Note that there must be
  1872  		// only one CreatedBackingTables entry per backing sstable. This is
  1873  		// indicated by the VersionEdit.CreatedBackingTables invariant.
  1874  		ve.CreatedBackingTables = append(ve.CreatedBackingTables, m.FileBacking)
  1875  	}
  1876  
  1877  	if err := rightFile.Validate(d.cmp, d.opts.Comparer.FormatKey); err != nil {
  1878  		return nil, err
  1879  	}
  1880  	return ve.NewFiles[len(ve.NewFiles)-numCreatedFiles:], nil
  1881  }
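
// A worked example of the tight-bound computation in excise, assuming
// bytes-ordered user keys: excising [g, m) from a file spanning [a, z]
// produces two virtual sstables backed by the same physical file.
//
//	m:      [a ......................... z]
//	excise:        [g ------ m)
//	left:   [a..f']                          f' = last key < g, via SeekLT(g)
//	right:                [m'............z]  m' = first key >= m, via SeekGE(m)
//
// Had every key of m fallen inside [g, m), the fast path above would have
// deleted m outright without creating any virtual sstables.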
  1882  
  1883  type ingestTargetLevelFunc func(
  1884  	newIters tableNewIters,
  1885  	newRangeKeyIter keyspan.TableNewSpanIter,
  1886  	iterOps IterOptions,
  1887  	comparer *Comparer,
  1888  	v *version,
  1889  	baseLevel int,
  1890  	compactions map[*compaction]struct{},
  1891  	meta *fileMetadata,
  1892  	suggestSplit bool,
  1893  ) (int, *fileMetadata, error)
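
// The concrete implementation passed in by the public ingestion entry points
// is this package's ingestTargetLevel; the indirection exists so tests can
// substitute a stub. A compile-time conformance check would look like:
//
//	var _ ingestTargetLevelFunc = ingestTargetLevel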
  1894  
  1895  type ingestSplitFile struct {
  1896  	// ingestFile is the file being ingested.
  1897  	ingestFile *fileMetadata
  1898  	// splitFile is the file that needs to be split to allow ingestFile to slot
  1899  	// into `level` level.
  1900  	splitFile *fileMetadata
  1901  	// The level where ingestFile will go (and where splitFile already is).
  1902  	level int
  1903  }
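
// An example of the scenario ingestSplitFile describes, with bytes-ordered
// user keys: an existing file at `level` spans [a, z] while the file being
// ingested spans [g, k], overlapping only the boundaries (no data overlap).
//
//	splitFile:  [a ........................ z]
//	ingestFile:           [g .... k]
//	result:     [a..f']   [g .... k]   [l'..z]
//
// where f' and l' are the tight bounds computed by d.excise. The ingested
// file then slots into the gap at the same level.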
  1904  
  1905  // ingestSplit splits files specified in `files` and updates ve in-place to
  1906  // account for existing files getting split into two virtual sstables. The map
  1907  // `replacedFiles` contains an in-progress map of all files that have been
  1908  // replaced with new virtual sstables in this version edit so far, which is also
  1909  // updated in-place.
  1910  //
  1911  // d.mu as well as the manifest lock must be held when calling this method.
  1912  func (d *DB) ingestSplit(
  1913  	ve *versionEdit,
  1914  	updateMetrics func(*fileMetadata, int, []newFileEntry),
  1915  	files []ingestSplitFile,
  1916  	replacedFiles map[base.FileNum][]newFileEntry,
  1917  ) error {
  1918  	for _, s := range files {
  1919  		// replacedFiles can be thought of as a tree: we start with s.splitFile
  1920  		// and look up its fileNum in replacedFiles, then find which of the
  1921  		// replacement files overlaps with s.ingestFile; that file becomes the
  1922  		// new splitFile. We then check the new splitFile's replacements in
  1923  		// replacedFiles again for overlap with s.ingestFile, and so on, until
  1924  		// either we can't find the current splitFile in replacedFiles (i.e.
  1925  		// that's the file that now needs to be split), or we don't find a
  1926  		// replacement that overlaps with s.ingestFile, which means a prior
  1927  		// ingest split already produced enough room for s.ingestFile.
  1928  		splitFile := s.splitFile
  1929  		for splitFile != nil {
  1930  			replaced, ok := replacedFiles[splitFile.FileNum]
  1931  			if !ok {
  1932  				break
  1933  			}
  1934  			updatedSplitFile := false
  1935  			for i := range replaced {
  1936  				if replaced[i].Meta.Overlaps(d.cmp, s.ingestFile.Smallest.UserKey, s.ingestFile.Largest.UserKey, s.ingestFile.Largest.IsExclusiveSentinel()) {
  1937  					if updatedSplitFile {
  1938  						// This should never happen because the earlier ingestTargetLevel
  1939  						// function only finds split file candidates that are guaranteed to
  1940  						// have no data overlap, only boundary overlap. See the comments
  1941  						// in that method to see the definitions of data vs boundary
  1942  						// overlap. That, plus the fact that files in `replaced` are
  1943  						// guaranteed to have file bounds that are tight on user keys
  1944  						// (as that's what `d.excise` produces), means that the only case
  1945  						// where we overlap with two or more files in `replaced` is if we
  1946  						// actually had data overlap all along, or if the ingestion files
  1947  						// were overlapping, either of which is an invariant violation.
  1948  						panic("updated with two files in ingestSplit")
  1949  					}
  1950  					splitFile = replaced[i].Meta
  1951  					updatedSplitFile = true
  1952  				}
  1953  			}
  1954  			if !updatedSplitFile {
  1955  				// None of the replaced files overlapped with the file being ingested.
  1956  				// This can happen if we've already excised a span overlapping with
  1957  				// this file, or if we have consecutive ingested files that can slide
  1958  				// within the same gap between keys in an existing file. For instance,
  1959  				// if an existing file has keys a and g and we're ingesting b-c, d-e,
  1960  				// the first loop iteration will split the existing file into one that
  1961  				// ends in a and another that starts at g, and the second iteration will
  1962  				// fall into this case and require no splitting.
  1963  				//
  1964  				// No splitting necessary.
  1965  				splitFile = nil
  1966  			}
  1967  		}
  1968  		if splitFile == nil {
  1969  			continue
  1970  		}
  1971  		// NB: excise operates on [start, end). We're splitting at [start, end]
  1972  		// (assuming !s.ingestFile.Largest.IsExclusiveSentinel()). The conflation
  1973  		// of exclusive vs inclusive end bounds should not make a difference
  1974  		// here, as we're guaranteed to have no data overlap between splitFile
  1975  		// and s.ingestFile; we panic below if a newly added file overlaps
  1976  		// s.ingestFile, including an endKey equal to a non-sentinel s.ingestFile.Largest.
  1977  		added, err := d.excise(KeyRange{Start: s.ingestFile.Smallest.UserKey, End: s.ingestFile.Largest.UserKey}, splitFile, ve, s.level)
  1978  		if err != nil {
  1979  			return err
  1980  		}
  1981  		if _, ok := ve.DeletedFiles[deletedFileEntry{
  1982  			Level:   s.level,
  1983  			FileNum: splitFile.FileNum,
  1984  		}]; !ok {
  1985  			panic("did not split file that was expected to be split")
  1986  		}
  1987  		replacedFiles[splitFile.FileNum] = added
  1988  		for i := range added {
  1989  			if s.ingestFile.Overlaps(d.cmp, added[i].Meta.Smallest.UserKey, added[i].Meta.Largest.UserKey, added[i].Meta.Largest.IsExclusiveSentinel()) {
  1990  				panic("ingest-time split produced a file that overlaps with ingested file")
  1991  			}
  1992  		}
  1993  		updateMetrics(splitFile, s.level, added)
  1994  	}
  1995  	// Flatten the version edit by removing any entries from ve.NewFiles that
  1996  	// are also in ve.DeletedFiles.
  1997  	newNewFiles := ve.NewFiles[:0]
  1998  	for i := range ve.NewFiles {
  1999  		fn := ve.NewFiles[i].Meta.FileNum
  2000  		deEntry := deletedFileEntry{Level: ve.NewFiles[i].Level, FileNum: fn}
  2001  		if _, ok := ve.DeletedFiles[deEntry]; ok {
  2002  			delete(ve.DeletedFiles, deEntry)
  2003  		} else {
  2004  			newNewFiles = append(newNewFiles, ve.NewFiles[i])
  2005  		}
  2006  	}
  2007  	ve.NewFiles = newNewFiles
  2008  	return nil
  2009  }
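
// The splitFile-chasing loop at the top of ingestSplit is a walk down the
// replacement tree. A standalone sketch of the same shape, with hypothetical
// types (ints standing in for file numbers, overlap supplied as a predicate):
//
//	func resolveSplitFile(replaced map[int][]int, overlaps func(int) bool, split int) (int, bool) {
//		for {
//			children, ok := replaced[split]
//			if !ok {
//				return split, true // current file: it still needs splitting
//			}
//			next, found := 0, false
//			for _, c := range children {
//				if overlaps(c) {
//					next, found = c, true
//				}
//			}
//			if !found {
//				return 0, false // a prior split already made room
//			}
//			split = next
//		}
//	}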
  2010  
  2011  func (d *DB) ingestApply(
  2012  	jobID int,
  2013  	lr ingestLoadResult,
  2014  	findTargetLevel ingestTargetLevelFunc,
  2015  	mut *memTable,
  2016  	exciseSpan KeyRange,
  2017  ) (*versionEdit, error) {
  2018  	d.mu.Lock()
  2019  	defer d.mu.Unlock()
  2020  
  2021  	ve := &versionEdit{
  2022  		NewFiles: make([]newFileEntry, lr.fileCount),
  2023  	}
  2024  	if exciseSpan.Valid() || (d.opts.Experimental.IngestSplit != nil && d.opts.Experimental.IngestSplit()) {
  2025  		ve.DeletedFiles = map[manifest.DeletedFileEntry]*manifest.FileMetadata{}
  2026  	}
  2027  	metrics := make(map[int]*LevelMetrics)
  2028  
  2029  	// Lock the manifest for writing before we use the current version to
  2030  	// determine the target level. This prevents two concurrent ingestion jobs
  2031  	// from using the same version to determine the target level, and also
  2032  	// provides serialization with concurrent compaction and flush jobs.
  2033  	// logAndApply unconditionally releases the manifest lock, but any earlier
  2034  	// returns must unlock the manifest.
  2035  	d.mu.versions.logLock()
  2036  
  2037  	if mut != nil {
  2038  		// Unref the mutable memtable to allow its flush to proceed. Now that
  2039  		// we've acquired the manifest lock, we can be certain that if the
  2040  		// mutable memtable has received more recent conflicting writes, the
  2041  		// flush won't beat us to applying to the manifest, resulting in
  2042  		// sequence number inversion. Even though we call maybeScheduleFlush
  2043  		// right now, this flush will apply after our ingestion.
  2044  		if mut.writerUnref() {
  2045  			d.maybeScheduleFlush()
  2046  		}
  2047  	}
  2048  
  2049  	shouldIngestSplit := d.opts.Experimental.IngestSplit != nil &&
  2050  		d.opts.Experimental.IngestSplit() && d.FormatMajorVersion() >= FormatVirtualSSTables
  2051  	current := d.mu.versions.currentVersion()
  2052  	baseLevel := d.mu.versions.picker.getBaseLevel()
  2053  	iterOps := IterOptions{logger: d.opts.Logger}
  2054  	// filesToSplit is a list where each element is a pair consisting of a file
  2055  	// being ingested and a file being split to make room for an ingestion into
  2056  	// that level. Each ingested file will appear at most once in this list. It
  2057  	// is possible for split files to appear twice in this list.
  2058  	filesToSplit := make([]ingestSplitFile, 0)
  2059  	checkCompactions := false
  2060  	for i := 0; i < lr.fileCount; i++ {
  2061  		// Determine the lowest level in the LSM for which the sstable doesn't
  2062  		// overlap any existing files in the level.
  2063  		var m *fileMetadata
  2064  		sharedIdx := -1
  2065  		sharedLevel := -1
  2066  		externalFile := false
  2067  		if i < len(lr.localMeta) {
  2068  			// local file.
  2069  			m = lr.localMeta[i]
  2070  		} else if (i - len(lr.localMeta)) < len(lr.sharedMeta) {
  2071  			// shared file.
  2072  			sharedIdx = i - len(lr.localMeta)
  2073  			m = lr.sharedMeta[sharedIdx]
  2074  			sharedLevel = int(lr.sharedLevels[sharedIdx])
  2075  		} else {
  2076  			// external file.
  2077  			externalFile = true
  2078  			m = lr.externalMeta[i-(len(lr.localMeta)+len(lr.sharedMeta))]
  2079  		}
  2080  		f := &ve.NewFiles[i]
  2081  		var err error
  2082  		if sharedIdx >= 0 {
  2083  			f.Level = sharedLevel
  2084  			if f.Level < sharedLevelsStart {
  2085  				panic("cannot slot a shared file higher than the highest shared level")
  2086  			}
  2087  			ve.CreatedBackingTables = append(ve.CreatedBackingTables, m.FileBacking)
  2088  		} else {
  2089  			if externalFile {
  2090  				ve.CreatedBackingTables = append(ve.CreatedBackingTables, m.FileBacking)
  2091  			}
  2092  			var splitFile *fileMetadata
  2093  			if exciseSpan.Valid() && exciseSpan.Contains(d.cmp, m.Smallest) && exciseSpan.Contains(d.cmp, m.Largest) {
  2094  				// This file fits perfectly within the excise span. We can slot it at
  2095  				// L6, or sharedLevelsStart - 1 if we have shared files.
  2096  				if len(lr.sharedMeta) > 0 {
  2097  					f.Level = sharedLevelsStart - 1
  2098  					if baseLevel > f.Level {
  2099  						f.Level = 0
  2100  					}
  2101  				} else {
  2102  					f.Level = 6
  2103  				}
  2104  			} else {
  2105  				// TODO(bilal): findTargetLevel does disk IO (reading files for data
  2106  				// overlap) even though we're holding onto d.mu. Consider unlocking
  2107  				// d.mu while we do this. We already hold versions.logLock so we should
  2108  				// not see any version applications while we're at this. The one
  2109  				// complication here would be pulling out the mu.compact.inProgress
  2110  				// check from findTargetLevel, as that requires d.mu to be held.
  2111  				f.Level, splitFile, err = findTargetLevel(
  2112  					d.newIters, d.tableNewRangeKeyIter, iterOps, d.opts.Comparer, current, baseLevel, d.mu.compact.inProgress, m, shouldIngestSplit)
  2113  			}
  2114  
  2115  			if splitFile != nil {
  2116  				if invariants.Enabled {
  2117  					if lf := current.Levels[f.Level].Find(d.cmp, splitFile); lf == nil {
  2118  						panic("splitFile returned is not in level it should be")
  2119  					}
  2120  				}
  2121  				// We take advantage of the fact that we won't drop the db mutex
  2122  				// between now and the call to logAndApply. So, no files should
  2123  				// get added to a new in-progress compaction at this point. We can
  2124  				// avoid having to iterate on in-progress compactions to cancel them
  2125  				// if none of the files being split have a compacting state.
  2126  				if splitFile.IsCompacting() {
  2127  					checkCompactions = true
  2128  				}
  2129  				filesToSplit = append(filesToSplit, ingestSplitFile{ingestFile: m, splitFile: splitFile, level: f.Level})
  2130  			}
  2131  		}
  2132  		if err != nil {
  2133  			d.mu.versions.logUnlock()
  2134  			return nil, err
  2135  		}
  2136  		f.Meta = m
  2137  		levelMetrics := metrics[f.Level]
  2138  		if levelMetrics == nil {
  2139  			levelMetrics = &LevelMetrics{}
  2140  			metrics[f.Level] = levelMetrics
  2141  		}
  2142  		levelMetrics.NumFiles++
  2143  		levelMetrics.Size += int64(m.Size)
  2144  		levelMetrics.BytesIngested += m.Size
  2145  		levelMetrics.TablesIngested++
  2146  	}
  2147  	// replacedFiles maps files excised due to exciseSpan (or splitFiles
  2148  	// returned by ingestTargetLevel) to the files created to replace them.
  2149  	// This map is used to resolve references to split files in filesToSplit,
  2150  	// as it is possible for a file that we want to split to no longer exist
  2151  	// or to have a newer fileMetadata due to a split induced by another
  2152  	// ingestion file, or by an excise.
  2153  	replacedFiles := make(map[base.FileNum][]newFileEntry)
  2154  	updateLevelMetricsOnExcise := func(m *fileMetadata, level int, added []newFileEntry) {
  2155  		levelMetrics := metrics[level]
  2156  		if levelMetrics == nil {
  2157  			levelMetrics = &LevelMetrics{}
  2158  			metrics[level] = levelMetrics
  2159  		}
  2160  		levelMetrics.NumFiles--
  2161  		levelMetrics.Size -= int64(m.Size)
  2162  		for i := range added {
  2163  			levelMetrics.NumFiles++
  2164  			levelMetrics.Size += int64(added[i].Meta.Size)
  2165  		}
  2166  	}
  2167  	if exciseSpan.Valid() {
  2168  		// Iterate through all levels and find files that intersect with exciseSpan.
  2169  		//
  2170  		// TODO(bilal): We could drop the DB mutex here as we don't need it for
  2171  		// excises; we only need to hold the version lock which we already are
  2172  		// holding. However releasing the DB mutex could mess with the
  2173  		// ingestTargetLevel calculation that happened above, as it assumed that it
  2174  		// had a complete view of in-progress compactions that wouldn't change
  2175  		// until logAndApply is called. If we were to drop the mutex now, we could
  2176  		// schedule another in-progress compaction that would go into the chosen target
  2177  		// level and lead to file overlap within level (which would panic in
  2178  		// logAndApply). We should drop the db mutex here, do the excise, then
  2179  		// re-grab the DB mutex and rerun just the in-progress compaction check to
  2180  		// see if any new compactions are conflicting with our chosen target levels
  2181  		// for files, and if they are, we should signal those compactions to error
  2182  		// out.
  2183  		for level := range current.Levels {
  2184  			overlaps := current.Overlaps(level, d.cmp, exciseSpan.Start, exciseSpan.End, true /* exclusiveEnd */)
  2185  			iter := overlaps.Iter()
  2186  
  2187  			for m := iter.First(); m != nil; m = iter.Next() {
  2188  				newFiles, err := d.excise(exciseSpan, m, ve, level)
  2189  				if err != nil {
  2190  					return nil, err
  2191  				}
  2192  
  2193  				if _, ok := ve.DeletedFiles[deletedFileEntry{
  2194  					Level:   level,
  2195  					FileNum: m.FileNum,
  2196  				}]; !ok {
  2197  					// We did not excise this file.
  2198  					continue
  2199  				}
  2200  				replacedFiles[m.FileNum] = newFiles
  2201  				updateLevelMetricsOnExcise(m, level, newFiles)
  2202  			}
  2203  		}
  2204  	}
  2205  	if len(filesToSplit) > 0 {
  2206  		// For the same reasons as the above call to excise, we hold the db mutex
  2207  		// while calling this method.
  2208  		if err := d.ingestSplit(ve, updateLevelMetricsOnExcise, filesToSplit, replacedFiles); err != nil {
  2209  			return nil, err
  2210  		}
  2211  	}
  2212  	if len(filesToSplit) > 0 || exciseSpan.Valid() {
  2213  		for c := range d.mu.compact.inProgress {
  2214  			if c.versionEditApplied {
  2215  				continue
  2216  			}
  2217  			// Check if this compaction overlaps with the excise span. Note that just
  2218  			// checking if the inputs individually overlap with the excise span
  2219  			// isn't sufficient; for instance, a compaction could have [a,b] and [e,f]
  2220  			// as inputs and write it all out as [a,b,e,f] in one sstable. If we're
  2221  			// doing a [c,d) excise at the same time as this compaction, we will have
  2222  			// to error out the whole compaction as we can't guarantee it hasn't/won't
  2223  			// write a file overlapping with the excise span.
  2224  			if exciseSpan.OverlapsInternalKeyRange(d.cmp, c.smallest, c.largest) {
  2225  				c.cancel.Store(true)
  2226  			}
  2227  			// Check if this compaction's inputs have been replaced due to an
  2228  			// ingest-time split. In that case, cancel the compaction as a newly picked
  2229  			// compaction would need to include any new files that slid in between
  2230  			// previously-existing files. Note that we cancel any compaction that has a
  2231  			// file that was ingest-split as an input, even if it started before this
  2232  			// ingestion.
  2233  			if checkCompactions {
  2234  				for i := range c.inputs {
  2235  					iter := c.inputs[i].files.Iter()
  2236  					for f := iter.First(); f != nil; f = iter.Next() {
  2237  						if _, ok := replacedFiles[f.FileNum]; ok {
  2238  							c.cancel.Store(true)
  2239  							break
  2240  						}
  2241  					}
  2242  				}
  2243  			}
  2244  		}
  2245  		// Check for any EventuallyFileOnlySnapshots that could be watching for
  2246  		// an excise on this span.
  2247  		if exciseSpan.Valid() {
  2248  			for s := d.mu.snapshots.root.next; s != &d.mu.snapshots.root; s = s.next {
  2249  				if s.efos == nil {
  2250  					continue
  2251  				}
  2252  				efos := s.efos
  2253  				// TODO(bilal): We can make this faster by taking advantage of the sorted
  2254  				// nature of protectedRanges to do a sort.Search, or even maintaining a
  2255  				// global list of all protected ranges instead of having to peer into every
  2256  				// snapshot.
  2257  				for i := range efos.protectedRanges {
  2258  					if efos.protectedRanges[i].OverlapsKeyRange(d.cmp, exciseSpan) {
  2259  						efos.excised.Store(true)
  2260  						break
  2261  					}
  2262  				}
  2263  			}
  2264  		}
  2265  	}
  2266  	if err := d.mu.versions.logAndApply(jobID, ve, metrics, false /* forceRotation */, func() []compactionInfo {
  2267  		return d.getInProgressCompactionInfoLocked(nil)
  2268  	}); err != nil {
  2269  		return nil, err
  2270  	}
  2271  
  2272  	d.mu.versions.metrics.Ingest.Count++
  2273  
  2274  	d.updateReadStateLocked(d.opts.DebugCheck)
  2275  	// updateReadStateLocked could have generated obsolete tables; schedule a
  2276  	// cleanup job if necessary.
  2277  	d.deleteObsoleteFiles(jobID)
  2278  	d.updateTableStatsLocked(ve.NewFiles)
  2279  	// The ingestion may have pushed a level over the threshold for compaction,
  2280  	// so check to see if one is necessary and schedule it.
  2281  	d.maybeScheduleCompaction()
  2282  	var toValidate []manifest.NewFileEntry
  2283  	dedup := make(map[base.DiskFileNum]struct{})
  2284  	for _, entry := range ve.NewFiles {
  2285  		if _, ok := dedup[entry.Meta.FileBacking.DiskFileNum]; !ok {
  2286  			toValidate = append(toValidate, entry)
  2287  			dedup[entry.Meta.FileBacking.DiskFileNum] = struct{}{}
  2288  		}
  2289  	}
  2290  	d.maybeValidateSSTablesLocked(toValidate)
  2291  	return ve, nil
  2292  }
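
// A note on the toValidate dedup at the end of ingestApply: several virtual
// sstables can share one physical backing, and validation reads the backing
// file, so one entry per DiskFileNum suffices. Illustrative sketch with
// hypothetical file numbers:
//
//	ve.NewFiles: {FileNum: 12, Backing: 7}, {FileNum: 13, Backing: 7}
//	toValidate:  only the FileNum 12 entry; backing 7 is read once.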
  2293  
  2294  // maybeValidateSSTablesLocked adds the slice of newFileEntrys to the pending
  2295  // queue of files to be validated, when the feature is enabled.
  2296  //
  2297  // Note that if two entries with the same backing file are added, then the
  2298  // block checksums for that backing file will be validated twice.
  2299  //
  2300  // DB.mu must be locked when calling.
  2301  func (d *DB) maybeValidateSSTablesLocked(newFiles []newFileEntry) {
  2302  	// Only add to the validation queue when the feature is enabled.
  2303  	if !d.opts.Experimental.ValidateOnIngest {
  2304  		return
  2305  	}
  2306  
  2307  	d.mu.tableValidation.pending = append(d.mu.tableValidation.pending, newFiles...)
  2308  	if d.shouldValidateSSTablesLocked() {
  2309  		go d.validateSSTables()
  2310  	}
  2311  }
  2312  
  2313  // shouldValidateSSTablesLocked returns true if SSTable validation should run.
  2314  // DB.mu must be locked when calling.
  2315  func (d *DB) shouldValidateSSTablesLocked() bool {
  2316  	return !d.mu.tableValidation.validating &&
  2317  		d.closed.Load() == nil &&
  2318  		d.opts.Experimental.ValidateOnIngest &&
  2319  		len(d.mu.tableValidation.pending) > 0
  2320  }
  2321  
  2322  // validateSSTables runs a round of validation on the tables in the pending
  2323  // queue.
  2324  func (d *DB) validateSSTables() {
  2325  	d.mu.Lock()
  2326  	if !d.shouldValidateSSTablesLocked() {
  2327  		d.mu.Unlock()
  2328  		return
  2329  	}
  2330  
  2331  	pending := d.mu.tableValidation.pending
  2332  	d.mu.tableValidation.pending = nil
  2333  	d.mu.tableValidation.validating = true
  2334  	jobID := d.mu.nextJobID
  2335  	d.mu.nextJobID++
  2336  	rs := d.loadReadState()
  2337  
  2338  	// Drop DB.mu before performing IO.
  2339  	d.mu.Unlock()
  2340  
  2341  	// Validate all tables in the pending queue. This could lead to a situation
  2342  	// where we are starving IO from other tasks due to having to page through
  2343  	// all the blocks in all the sstables in the queue.
  2344  	// TODO(travers): Add some form of pacing to avoid IO starvation.
  2345  
  2346  	// If we fail to validate any files due to reasons other than uncovered
  2347  	// corruption, accumulate them and re-queue them for another attempt.
  2348  	var retry []manifest.NewFileEntry
  2349  
  2350  	for _, f := range pending {
  2351  		// The file may have been moved or deleted since it was ingested, in
  2352  		// which case we skip.
  2353  		if !rs.current.Contains(f.Level, d.cmp, f.Meta) {
  2354  			// Assume the file was moved to a lower level. It is rare enough
  2355  			// that a table is moved or deleted between the time it was ingested
  2356  			// and the time the validation routine runs that the overall cost of
  2357  			// this inner loop is tolerably low, when amortized over all
  2358  			// ingested tables.
  2359  			found := false
  2360  			for i := f.Level + 1; i < numLevels; i++ {
  2361  				if rs.current.Contains(i, d.cmp, f.Meta) {
  2362  					found = true
  2363  					break
  2364  				}
  2365  			}
  2366  			if !found {
  2367  				continue
  2368  			}
  2369  		}
  2370  
  2371  		var err error
  2372  		if f.Meta.Virtual {
  2373  			err = d.tableCache.withVirtualReader(
  2374  				f.Meta.VirtualMeta(), func(v sstable.VirtualReader) error {
  2375  					return v.ValidateBlockChecksumsOnBacking()
  2376  				})
  2377  		} else {
  2378  			err = d.tableCache.withReader(
  2379  				f.Meta.PhysicalMeta(), func(r *sstable.Reader) error {
  2380  					return r.ValidateBlockChecksums()
  2381  				})
  2382  		}
  2383  
  2384  		if err != nil {
  2385  			if IsCorruptionError(err) {
  2386  				// TODO(travers): Hook into the corruption reporting pipeline, once
  2387  				// available. See pebble#1192.
  2388  				d.opts.Logger.Fatalf("pebble: encountered corruption during ingestion: %s", err)
  2389  			} else {
  2390  				// If there was some other, possibly transient, error that
  2391  				// caused table validation to fail, inform the EventListener and
  2392  				// move on. We remember the table so that we can retry it in a
  2393  				// subsequent table validation job.
  2394  				//
  2395  				// TODO(jackson): If the error is not transient, this will retry
  2396  				// validation indefinitely. While not great, it's the same
  2397  				// behavior as erroring flushes and compactions. We should
  2398  				// address this as a part of #270.
  2399  				d.opts.EventListener.BackgroundError(err)
  2400  				retry = append(retry, f)
  2401  				continue
  2402  			}
  2403  		}
  2404  
  2405  		d.opts.EventListener.TableValidated(TableValidatedInfo{
  2406  			JobID: jobID,
  2407  			Meta:  f.Meta,
  2408  		})
  2409  	}
  2410  	rs.unref()
  2411  	d.mu.Lock()
  2412  	defer d.mu.Unlock()
  2413  	d.mu.tableValidation.pending = append(d.mu.tableValidation.pending, retry...)
  2414  	d.mu.tableValidation.validating = false
  2415  	d.mu.tableValidation.cond.Broadcast()
  2416  	if d.shouldValidateSSTablesLocked() {
  2417  		go d.validateSSTables()
  2418  	}
  2419  }
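
// Validation is opt-in. A minimal sketch of enabling it from a client, using
// the same ValidateOnIngest option consulted above (the directory name is
// illustrative):
//
//	opts := &pebble.Options{}
//	opts.Experimental.ValidateOnIngest = true
//	db, err := pebble.Open("demo-db", opts)
//	if err != nil {
//		panic(err)
//	}
//	defer db.Close()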