github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/ingest.go

     1  // Copyright 2018 The LevelDB-Go and Pebble Authors and Bitalostored Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package bitalostable
     6  
     7  import (
     8  	"sort"
     9  	"time"
    10  
    11  	"github.com/cockroachdb/errors"
    12  	"github.com/zuoyebang/bitalostable/internal/base"
    13  	"github.com/zuoyebang/bitalostable/internal/keyspan"
    14  	"github.com/zuoyebang/bitalostable/internal/manifest"
    15  	"github.com/zuoyebang/bitalostable/internal/private"
    16  	"github.com/zuoyebang/bitalostable/sstable"
    17  	"github.com/zuoyebang/bitalostable/vfs"
    18  )
    19  
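        // sstableKeyCompare compares two internal keys for the purpose of the
        // bounds checks performed during ingestion. Keys are ordered by user key;
        // when the user keys are equal, a range-deletion sentinel (an exclusive
        // table bound) sorts before any other key, so an exclusive upper bound is
        // not considered to overlap an inclusive lower bound on the same user key.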
    20  func sstableKeyCompare(userCmp Compare, a, b InternalKey) int {
    21  	c := userCmp(a.UserKey, b.UserKey)
    22  	if c != 0 {
    23  		return c
    24  	}
    25  	if a.Trailer == InternalKeyRangeDeleteSentinel {
    26  		if b.Trailer != InternalKeyRangeDeleteSentinel {
    27  			return -1
    28  		}
    29  	} else if b.Trailer == InternalKeyRangeDeleteSentinel {
    30  		return 1
    31  	}
    32  	return 0
    33  }
    34  
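        // ingestValidateKey checks that a key read from an external sstable is
        // acceptable for ingestion: it must not have an invalid kind and it must
        // carry a zero sequence number, since the actual sequence number is only
        // assigned at ingestion time.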
    35  func ingestValidateKey(opts *Options, key *InternalKey) error {
    36  	if key.Kind() == InternalKeyKindInvalid {
    37  		return base.CorruptionErrorf("bitalostable: external sstable has corrupted key: %s",
    38  			key.Pretty(opts.Comparer.FormatKey))
    39  	}
    40  	if key.SeqNum() != 0 {
    41  		return base.CorruptionErrorf("bitalostable: external sstable has non-zero seqnum: %s",
    42  			key.Pretty(opts.Comparer.FormatKey))
    43  	}
    44  	return nil
    45  }
    46  
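        // ingestLoad1 opens the sstable at path, verifies that its table format is
        // supported by the DB's format major version, and constructs a fileMetadata
        // whose bounds cover the table's point keys, range deletions and range keys.
        // It returns a nil metadata (and nil error) if the table contains no keys.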
    47  func ingestLoad1(
    48  	opts *Options, fmv FormatMajorVersion, path string, cacheID uint64, fileNum FileNum,
    49  ) (*fileMetadata, error) {
    50  	stat, err := opts.FS.Stat(path)
    51  	if err != nil {
    52  		return nil, err
    53  	}
    54  
    55  	f, err := opts.FS.Open(path)
    56  	if err != nil {
    57  		return nil, err
    58  	}
    59  
    60  	cacheOpts := private.SSTableCacheOpts(cacheID, fileNum).(sstable.ReaderOption)
    61  	r, err := sstable.NewReader(f, opts.MakeReaderOptions(), cacheOpts)
    62  	if err != nil {
    63  		return nil, err
    64  	}
    65  	defer r.Close()
    66  
    67  	// Avoid ingesting tables with format versions this DB doesn't support.
    68  	tf, err := r.TableFormat()
    69  	if err != nil {
    70  		return nil, err
    71  	}
    72  	if tf < fmv.MinTableFormat() || tf > fmv.MaxTableFormat() {
    73  		return nil, errors.Newf(
    74  			"bitalostable: table format %s is not within range supported at DB format major version %d, (%s,%s)",
    75  			tf, fmv, fmv.MinTableFormat(), fmv.MaxTableFormat(),
    76  		)
    77  	}
    78  
    79  	meta := &fileMetadata{}
    80  	meta.FileNum = fileNum
    81  	meta.Size = uint64(stat.Size())
    82  	meta.CreationTime = time.Now().Unix()
    83  
    84  	// Avoid loading into the table cache for collecting stats if we
    85  	// don't need to. If there are no range deletions, we have all the
    86  	// information to compute the stats here.
    87  	//
    88  	// This is helpful in tests for avoiding awkwardness around deletion of
    89  	// ingested files from MemFS. MemFS implements the Windows semantics of
    90  	// disallowing removal of an open file. Under MemFS, if we don't populate
    91  	// meta.Stats here, the file will be loaded into the table cache for
    92  	// calculating stats before we can remove the original link.
    93  	maybeSetStatsFromProperties(meta, &r.Properties)
    94  
    95  	{
    96  		iter, err := r.NewIter(nil /* lower */, nil /* upper */)
    97  		if err != nil {
    98  			return nil, err
    99  		}
   100  		defer iter.Close()
   101  		var smallest InternalKey
   102  		if key, _ := iter.First(); key != nil {
   103  			if err := ingestValidateKey(opts, key); err != nil {
   104  				return nil, err
   105  			}
   106  			smallest = (*key).Clone()
   107  		}
   108  		if err := iter.Error(); err != nil {
   109  			return nil, err
   110  		}
   111  		if key, _ := iter.Last(); key != nil {
   112  			if err := ingestValidateKey(opts, key); err != nil {
   113  				return nil, err
   114  			}
   115  			meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallest, key.Clone())
   116  		}
   117  		if err := iter.Error(); err != nil {
   118  			return nil, err
   119  		}
   120  	}
   121  
   122  	iter, err := r.NewRawRangeDelIter()
   123  	if err != nil {
   124  		return nil, err
   125  	}
   126  	if iter != nil {
   127  		defer iter.Close()
   128  		var smallest InternalKey
   129  		if s := iter.First(); s != nil {
   130  			key := s.SmallestKey()
   131  			if err := ingestValidateKey(opts, &key); err != nil {
   132  				return nil, err
   133  			}
   134  			smallest = key.Clone()
   135  		}
   136  		if err := iter.Error(); err != nil {
   137  			return nil, err
   138  		}
   139  		if s := iter.Last(); s != nil {
   140  			k := s.SmallestKey()
   141  			if err := ingestValidateKey(opts, &k); err != nil {
   142  				return nil, err
   143  			}
   144  			largest := s.LargestKey().Clone()
   145  			meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallest, largest)
   146  		}
   147  	}
   148  
   149  	// Update the range-key bounds for the table.
   150  	{
   151  		iter, err := r.NewRawRangeKeyIter()
   152  		if err != nil {
   153  			return nil, err
   154  		}
   155  		if iter != nil {
   156  			defer iter.Close()
   157  			var smallest InternalKey
   158  			if s := iter.First(); s != nil {
   159  				key := s.SmallestKey()
   160  				if err := ingestValidateKey(opts, &key); err != nil {
   161  					return nil, err
   162  				}
   163  				smallest = key.Clone()
   164  			}
   165  			if err := iter.Error(); err != nil {
   166  				return nil, err
   167  			}
   168  			if s := iter.Last(); s != nil {
   169  				k := s.SmallestKey()
   170  				if err := ingestValidateKey(opts, &k); err != nil {
   171  					return nil, err
   172  				}
   173  				// As range keys are fragmented, the end key of the last range key in
   174  				// the table provides the upper bound for the table.
   175  				largest := s.LargestKey().Clone()
   176  				meta.ExtendRangeKeyBounds(opts.Comparer.Compare, smallest, largest)
   177  			}
   178  			if err := iter.Error(); err != nil {
   179  				return nil, err
   180  			}
   181  		}
   182  	}
   183  
   184  	if !meta.HasPointKeys && !meta.HasRangeKeys {
   185  		return nil, nil
   186  	}
   187  
   188  	// Sanity check that the various bounds on the file were set consistently.
   189  	if err := meta.Validate(opts.Comparer.Compare, opts.Comparer.FormatKey); err != nil {
   190  		return nil, err
   191  	}
   192  
   193  	return meta, nil
   194  }
   195  
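        // ingestLoad loads the metadata for each of the given paths, eliding
        // sstables that turn out to be empty. It returns the metadata along with
        // the (possibly shortened) list of corresponding paths.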
   196  func ingestLoad(
   197  	opts *Options, fmv FormatMajorVersion, paths []string, cacheID uint64, pending []FileNum,
   198  ) ([]*fileMetadata, []string, error) {
   199  	meta := make([]*fileMetadata, 0, len(paths))
   200  	newPaths := make([]string, 0, len(paths))
   201  	for i := range paths {
   202  		m, err := ingestLoad1(opts, fmv, paths[i], cacheID, pending[i])
   203  		if err != nil {
   204  			return nil, nil, err
   205  		}
   206  		if m != nil {
   207  			meta = append(meta, m)
   208  			newPaths = append(newPaths, paths[i])
   209  		}
   210  	}
   211  	return meta, newPaths, nil
   212  }
   213  
   214  // metaAndPaths sorts file metadata by smallest user key, while ensuring the
   215  // matching path also gets swapped to the same index. For use in
   216  // ingestSortAndVerify.
   217  type metaAndPaths struct {
   218  	meta  []*fileMetadata
   219  	paths []string
   220  	cmp   Compare
   221  }
   222  
   223  func (m metaAndPaths) Len() int {
   224  	return len(m.meta)
   225  }
   226  
   227  func (m metaAndPaths) Less(i, j int) bool {
   228  	return m.cmp(m.meta[i].Smallest.UserKey, m.meta[j].Smallest.UserKey) < 0
   229  }
   230  
   231  func (m metaAndPaths) Swap(i, j int) {
   232  	m.meta[i], m.meta[j] = m.meta[j], m.meta[i]
   233  	m.paths[i], m.paths[j] = m.paths[j], m.paths[i]
   234  }
   235  
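        // ingestSortAndVerify sorts the metadata (and matching paths) by smallest
        // key and returns an error if any two of the sstables overlap.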
   236  func ingestSortAndVerify(cmp Compare, meta []*fileMetadata, paths []string) error {
   237  	if len(meta) <= 1 {
   238  		return nil
   239  	}
   240  
   241  	sort.Sort(&metaAndPaths{
   242  		meta:  meta,
   243  		paths: paths,
   244  		cmp:   cmp,
   245  	})
   246  
   247  	for i := 1; i < len(meta); i++ {
   248  		if sstableKeyCompare(cmp, meta[i-1].Largest, meta[i].Smallest) >= 0 {
   249  			return errors.New("bitalostable: external sstables have overlapping ranges")
   250  		}
   251  	}
   252  	return nil
   253  }
   254  
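        // ingestCleanup removes the table files that were already linked or copied
        // into the DB directory, unwinding a failed ingestion.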
   255  func ingestCleanup(fs vfs.FS, dirname string, meta []*fileMetadata) error {
   256  	var firstErr error
   257  	for i := range meta {
   258  		target := base.MakeFilepath(fs, dirname, fileTypeTable, meta[i].FileNum)
   259  		if err := fs.Remove(target); err != nil {
   260  			firstErr = firstError(firstErr, err)
   261  		}
   262  	}
   263  	return firstErr
   264  }
   265  
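        // ingestLink hard links (or, where linking is not possible, copies) each
        // external sstable into the DB directory under its assigned file number,
        // cleaning up any already-linked files if an error occurs partway through.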
   266  func ingestLink(
   267  	jobID int, opts *Options, dirname string, paths []string, meta []*fileMetadata,
   268  ) error {
   269  	// Wrap the normal filesystem with one which wraps newly created files with
   270  	// vfs.NewSyncingFile.
   271  	fs := syncingFS{
   272  		FS: opts.FS,
   273  		syncOpts: vfs.SyncingFileOptions{
   274  			NoSyncOnClose: opts.NoSyncOnClose,
   275  			BytesPerSync:  opts.BytesPerSync,
   276  		},
   277  	}
   278  
   279  	for i := range paths {
   280  		target := base.MakeFilepath(fs, dirname, fileTypeTable, meta[i].FileNum)
   281  		var err error
   282  		if _, ok := opts.FS.(*vfs.MemFS); ok && opts.DebugCheck != nil {
   283  			// The combination of MemFS+Ingest+DebugCheck produces awkwardness around
   284  			// the subsequent deletion of files. The problem is that MemFS implements
   285  			// the Windows semantics of disallowing removal of an open file. This is
   286  			// desirable because it helps catch bugs where we violate the
   287  			// requirements of the Windows semantics. The normal practice for Ingest
   288  			// is for the caller to remove the source files after the ingest
   289  			// completes successfully. Unfortunately, Options.DebugCheck causes
   290  			// ingest to run DB.CheckLevels() before the ingest finishes, and
   291  			// DB.CheckLevels() populates the table cache with the newly ingested
   292  			// files.
   293  			//
   294  			// The combination of MemFS+Ingest+DebugCheck is primarily used in
   295  			// tests. As a workaround, disable hard linking when this combination
   296  			// occurs. See https://github.com/zuoyebang/bitalostable/issues/495.
   297  			err = vfs.Copy(fs, paths[i], target)
   298  		} else {
   299  			err = vfs.LinkOrCopy(fs, paths[i], target)
   300  		}
   301  		if err != nil {
   302  			if err2 := ingestCleanup(fs, dirname, meta[:i]); err2 != nil {
   303  				opts.Logger.Infof("ingest cleanup failed: %v", err2)
   304  			}
   305  			return err
   306  		}
   307  		if opts.EventListener.TableCreated != nil {
   308  			opts.EventListener.TableCreated(TableCreateInfo{
   309  				JobID:   jobID,
   310  				Reason:  "ingesting",
   311  				Path:    target,
   312  				FileNum: meta[i].FileNum,
   313  			})
   314  		}
   315  	}
   316  
   317  	return nil
   318  }
   319  
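        // ingestMemtableOverlaps reports whether any of the sstables described by
        // meta overlap the point keys or range deletions in the given memtable.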
   320  func ingestMemtableOverlaps(cmp Compare, mem flushable, meta []*fileMetadata) bool {
   321  	iter := mem.newIter(nil)
   322  	rangeDelIter := mem.newRangeDelIter(nil)
   323  	defer iter.Close()
   324  
   325  	if rangeDelIter != nil {
   326  		defer rangeDelIter.Close()
   327  	}
   328  
   329  	for _, m := range meta {
   330  		if overlapWithIterator(iter, &rangeDelIter, m, cmp) {
   331  			return true
   332  		}
   333  	}
   334  	return false
   335  }
   336  
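        // ingestUpdateSeqNum stamps the assigned sequence numbers into the bounds
        // recorded in each file's metadata. The sstables themselves are not
        // rewritten; setting SmallestSeqNum == LargestSeqNum is what later causes
        // Properties.GlobalSeqNum to be applied when the table is loaded.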
   337  func ingestUpdateSeqNum(
   338  	cmp Compare, format base.FormatKey, seqNum uint64, meta []*fileMetadata,
   339  ) error {
   340  	setSeqFn := func(k base.InternalKey) base.InternalKey {
   341  		return base.MakeInternalKey(k.UserKey, seqNum, k.Kind())
   342  	}
   343  	for _, m := range meta {
   344  		// NB: we set the fields directly here, rather than via their Extend*
   345  		// methods, as we are updating sequence numbers.
   346  		if m.HasPointKeys {
   347  			m.SmallestPointKey = setSeqFn(m.SmallestPointKey)
   348  		}
   349  		if m.HasRangeKeys {
   350  			m.SmallestRangeKey = setSeqFn(m.SmallestRangeKey)
   351  		}
   352  		m.Smallest = setSeqFn(m.Smallest)
   353  		// Only update the seqnum for the largest key if that key is not an
   354  		// "exclusive sentinel" (i.e. a range deletion sentinel or a range key
   355  		// boundary), as doing so effectively drops the exclusive sentinel (by
   356  		// lowering the seqnum from the max value), and extends the bounds of the
   357  		// table.
   358  		// NB: as the largest range key is always an exclusive sentinel, it is never
   359  		// updated.
   360  		if m.HasPointKeys && !m.LargestPointKey.IsExclusiveSentinel() {
   361  			m.LargestPointKey = setSeqFn(m.LargestPointKey)
   362  		}
   363  		if !m.Largest.IsExclusiveSentinel() {
   364  			m.Largest = setSeqFn(m.Largest)
   365  		}
   366  		// Setting smallestSeqNum == largestSeqNum triggers the setting of
   367  		// Properties.GlobalSeqNum when an sstable is loaded.
   368  		m.SmallestSeqNum = seqNum
   369  		m.LargestSeqNum = seqNum
   370  		// Ensure the new bounds are consistent.
   371  		if err := m.Validate(cmp, format); err != nil {
   372  			return err
   373  		}
   374  		seqNum++
   375  	}
   376  	return nil
   377  }
   378  
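        // overlapWithIterator reports whether the key range of meta overlaps any
        // point key or range deletion exposed by the given iterators.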
   379  func overlapWithIterator(
   380  	iter internalIterator, rangeDelIter *keyspan.FragmentIterator, meta *fileMetadata, cmp Compare,
   381  ) bool {
   382  	// Check overlap with point operations.
   383  	//
   384  	// When using levelIter, it seeks to the SST whose boundaries
   385  	// contain meta.Smallest.UserKey(S).
   386  	// It then tries to find a point in that SST that is >= S.
   387  	// If there's no such point it means the SST ends in a tombstone in which case
   388  	// levelIter.SeekGE generates a boundary range del sentinel.
   389  	// The comparison of this boundary with meta.Largest(L) below
   390  	// is subtle but maintains correctness.
   391  	// 1) boundary < L,
   392  	//    since boundary is also > S (initial seek),
   393  	//    whatever the boundary's start key may be, we're always overlapping.
   394  	// 2) boundary > L,
   395  	//    overlap with boundary cannot be determined since we don't know boundary's start key.
   396  	//    We require checking for overlap with rangeDelIter.
   397  	// 3) boundary == L and L is not sentinel,
   398  	//    means boundary < L and hence is similar to 1).
   399  	// 4) boundary == L and L is sentinel,
   400  	//    we'll always overlap since for any values of i,j ranges [i, k) and [j, k) always overlap.
   401  	key, _ := iter.SeekGE(meta.Smallest.UserKey, base.SeekGEFlagsNone)
   402  	if key != nil {
   403  		c := sstableKeyCompare(cmp, *key, meta.Largest)
   404  		if c <= 0 {
   405  			return true
   406  		}
   407  	}
   408  
   409  	// Check overlap with range deletions.
   410  	if rangeDelIter == nil || *rangeDelIter == nil {
   411  		return false
   412  	}
   413  	rangeDelItr := *rangeDelIter
   414  	rangeDel := rangeDelItr.SeekLT(meta.Smallest.UserKey)
   415  	if rangeDel == nil {
   416  		rangeDel = rangeDelItr.Next()
   417  	}
   418  	for ; rangeDel != nil; rangeDel = rangeDelItr.Next() {
   419  		key := rangeDel.SmallestKey()
   420  		c := sstableKeyCompare(cmp, key, meta.Largest)
   421  		if c > 0 {
   422  			// The start of the tombstone is after the largest key in the
   423  			// ingested table.
   424  			return false
   425  		}
   426  		if cmp(rangeDel.End, meta.Smallest.UserKey) > 0 {
   427  			// The end of the tombstone is greater than the smallest key in the
   428  			// table. Note that the tombstone end key is exclusive, thus ">0"
   429  			// instead of ">=0".
   430  			return true
   431  		}
   432  	}
   433  	return false
   434  }
   435  
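        // ingestTargetLevel determines the lowest level of the LSM into which meta
        // can be ingested without violating the sequence number invariant (data
        // overlap with a shallower level) or the file-boundary invariant within the
        // chosen level. Ingesting into L0 is always permitted.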
   436  func ingestTargetLevel(
   437  	newIters tableNewIters,
   438  	iterOps IterOptions,
   439  	cmp Compare,
   440  	v *version,
   441  	baseLevel int,
   442  	compactions map[*compaction]struct{},
   443  	meta *fileMetadata,
   444  ) (int, error) {
   445  	// Find the lowest level which does not have any files which overlap meta. We
   446  	// search from L0 to L6 looking for whether there are any files in the level
   447  	// which overlap meta. We want the "lowest" level (where lower means
   448  	// increasing level number) in order to reduce write amplification.
   449  	//
   450  	// There are 2 kinds of overlap we need to check for: file boundary overlap
   451  	// and data overlap. Data overlap implies file boundary overlap. Note that it
   452  	// is always possible to ingest into L0.
   453  	//
   454  	// To place meta at level i where i > 0:
   455  	// - there must not be any data overlap with levels <= i, since that will
   456  	//   violate the sequence number invariant.
   457  	// - no file boundary overlap with level i, since that will violate the
   458  	//   invariant that files do not overlap in levels i > 0.
   459  	//
   460  	// The file boundary overlap check is simpler to conceptualize. Consider the
   461  	// following example, in which the ingested file lies completely before or
   462  	// after the file being considered.
   463  	//
   464  	//   |--|           |--|  ingested file: [a,b] or [f,g]
   465  	//         |-----|        existing file: [c,e]
   466  	//  _____________________
   467  	//   a  b  c  d  e  f  g
   468  	//
   469  	// In both cases the ingested file can move to considering the next level.
   470  	//
   471  	// File boundary overlap does not necessarily imply data overlap. The check
   472  	// for data overlap is a little more nuanced. Consider the following examples:
   473  	//
   474  	//  1. No data overlap:
   475  	//
   476  	//          |-|   |--|    ingested file: [cc-d] or [ee-ff]
   477  	//  |*--*--*----*------*| existing file: [a-g], points: [a, b, c, dd, g]
   478  	//  _____________________
   479  	//   a  b  c  d  e  f  g
   480  	//
   481  	// In this case the ingested files can "fall through" this level. The checks
   482  	// continue at the next level.
   483  	//
   484  	//  2. Data overlap:
   485  	//
   486  	//            |--|        ingested file: [d-e]
   487  	//  |*--*--*----*------*| existing file: [a-g], points: [a, b, c, dd, g]
   488  	//  _____________________
   489  	//   a  b  c  d  e  f  g
   490  	//
   491  	// In this case the file cannot be ingested into this level as the point 'dd'
   492  	// is in the way.
   493  	//
   494  	// It is worth noting that the check for data overlap is only approximate. In
   495  	// the previous example, the ingested table [d-e] could contain only the
   496  	// points 'd' and 'e', in which case the table would be eligible for
   497  	// considering lower levels. However, such a fine-grained check would need to
   498  	// be exhaustive (comparing points and ranges in both the ingested and existing
   499  	// tables) and such a check is prohibitively expensive. Thus Pebble treats any
   500  	// existing point that falls within the ingested table bounds as being "data
   501  	// overlap".
   502  
   503  	targetLevel := 0
   504  
   505  	// Do we overlap with keys in L0?
   506  	iter := v.Levels[0].Iter()
   507  	for meta0 := iter.First(); meta0 != nil; meta0 = iter.Next() {
   508  		c1 := sstableKeyCompare(cmp, meta.Smallest, meta0.Largest)
   509  		c2 := sstableKeyCompare(cmp, meta.Largest, meta0.Smallest)
   510  		if c1 > 0 || c2 < 0 {
   511  			continue
   512  		}
   513  
   514  		iter, rangeDelIter, err := newIters(iter.Current(), nil, internalIterOpts{})
   515  		if err != nil {
   516  			return 0, err
   517  		}
   518  		overlap := overlapWithIterator(iter, &rangeDelIter, meta, cmp)
   519  		iter.Close()
   520  		if rangeDelIter != nil {
   521  			rangeDelIter.Close()
   522  		}
   523  		if overlap {
   524  			return targetLevel, nil
   525  		}
   526  	}
   527  
   528  	level := baseLevel
   529  	for ; level < numLevels; level++ {
   530  		levelIter := newLevelIter(iterOps, cmp, nil /* split */, newIters,
   531  			v.Levels[level].Iter(), manifest.Level(level), nil)
   532  		var rangeDelIter keyspan.FragmentIterator
   533  		// Pass in a non-nil pointer to rangeDelIter so that levelIter.findFileGE
   534  		// sets it up for the target file.
   535  		levelIter.initRangeDel(&rangeDelIter)
   536  		overlap := overlapWithIterator(levelIter, &rangeDelIter, meta, cmp)
   537  		levelIter.Close() // Closes range del iter as well.
   538  		if overlap {
   539  			return targetLevel, nil
   540  		}
   541  
   542  		// Check boundary overlap.
   543  		boundaryOverlaps := v.Overlaps(level, cmp, meta.Smallest.UserKey,
   544  			meta.Largest.UserKey, meta.Largest.IsExclusiveSentinel())
   545  		if !boundaryOverlaps.Empty() {
   546  			continue
   547  		}
   548  
   549  		// Check boundary overlap with any ongoing compactions.
   550  		//
   551  		// We cannot check for data overlap with the new SSTs compaction will
   552  		// produce since compaction hasn't been done yet. However, there's no need
   553  		// to check since all keys in them will either be from c.startLevel or
   554  		// c.outputLevel, both levels having their data overlap already tested
   555  		// negative (else we'd have returned earlier).
   556  		overlaps := false
   557  		for c := range compactions {
   558  			if c.outputLevel == nil || level != c.outputLevel.level {
   559  				continue
   560  			}
   561  			if cmp(meta.Smallest.UserKey, c.largest.UserKey) <= 0 &&
   562  				cmp(meta.Largest.UserKey, c.smallest.UserKey) >= 0 {
   563  				overlaps = true
   564  				break
   565  			}
   566  		}
   567  		if !overlaps {
   568  			targetLevel = level
   569  		}
   570  	}
   571  	return targetLevel, nil
   572  }
   573  
   574  // Ingest ingests a set of sstables into the DB. Ingestion of the files is
   575  // atomic and semantically equivalent to creating a single batch containing all
   576  // of the mutations in the sstables. Ingestion may require the memtable to be
   577  // flushed. The ingested sstable files are moved into the DB and must reside on
   578  // the same filesystem as the DB. Sstables can be created for ingestion using
   579  // sstable.Writer. On success, Ingest removes the input paths.
   580  //
   581  // All sstables *must* be Sync()'d by the caller after all bytes are written
   582  // and before their file handles are closed; failure to do so could violate
   583  // durability or lead to corrupted on-disk state. This method cannot, in a
   584  // platform-and-FS-agnostic way, ensure that all sstables in the input are
   585  // properly synced to disk. Opening new file handles and Sync()-ing them
   586  // does not always guarantee durability; see the discussion here on that:
   587  // https://github.com/zuoyebang/bitalostable/pull/835#issuecomment-663075379
   588  //
   589  // Ingestion loads each sstable into the lowest level of the LSM which it
   590  // doesn't overlap (see ingestTargetLevel). If an sstable overlaps a memtable,
   591  // ingestion forces the memtable to flush, and then waits for the flush to
   592  // occur.
   593  //
   594  // The steps for ingestion are:
   595  //
   596  //  1. Allocate file numbers for every sstable being ingested.
   597  //  2. Load the metadata for all sstables being ingested.
   598  //  3. Sort the sstables by smallest key, verifying non-overlap.
   599  //  4. Hard link (or copy) the sstables into the DB directory.
   600  //  5. Allocate a sequence number to use for all of the entries in the
   601  //     sstables. This is the step where overlap with memtables is
   602  //     determined. If there is overlap, we remember the most recent memtable
   603  //     that overlaps.
   604  //  6. Update the sequence number in the ingested sstables.
   605  //  7. Wait for the most recent memtable that overlaps to flush (if any).
   606  //  8. Add the ingested sstables to the version (DB.ingestApply).
   607  //  9. Publish the ingestion sequence number.
   608  //
   609  // Note that if the mutable memtable overlaps with ingestion, a flush of the
   610  // memtable is forced, equivalent to DB.Flush. Additionally, subsequent
   611  // mutations that get sequence numbers larger than the ingestion sequence
   612  // number get queued up behind the ingestion waiting for it to complete. This
   613  // can produce a noticeable hiccup in performance. See
   614  // https://github.com/zuoyebang/bitalostable/issues/25 for an idea for how to fix
   615  // this hiccup.
   616  func (d *DB) Ingest(paths []string) error {
   617  	if err := d.closed.Load(); err != nil {
   618  		panic(err)
   619  	}
   620  	if d.opts.ReadOnly {
   621  		return ErrReadOnly
   622  	}
   623  	_, err := d.ingest(paths, ingestTargetLevel)
   624  	return err
   625  }
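
        // ingestExampleUsage is a minimal sketch of how a caller typically builds
        // an sstable and hands it to Ingest. It is illustrative only: the helper
        // name is arbitrary and the writer options are assumptions based on the
        // sstable.Writer API inherited from Pebble (keys written in ascending
        // order with zero sequence numbers; Close flushes and syncs the file
        // before it is ingested).
        func ingestExampleUsage(db *DB, path string) error {
        	f, err := db.opts.FS.Create(path)
        	if err != nil {
        		return err
        	}
        	w := sstable.NewWriter(f, sstable.WriterOptions{
        		Comparer:    db.opts.Comparer,
        		TableFormat: db.FormatMajorVersion().MaxTableFormat(),
        	})
        	for _, kv := range [][2]string{{"a", "1"}, {"b", "2"}} {
        		if err := w.Set([]byte(kv[0]), []byte(kv[1])); err != nil {
        			_ = w.Close()
        			return err
        		}
        	}
        	if err := w.Close(); err != nil {
        		return err
        	}
        	// On success, Ingest links the file into the DB and removes the
        	// original path.
        	return db.Ingest([]string{path})
        }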
   626  
   627  // IngestOperationStats provides some information about where in the LSM the
   628  // bytes were ingested.
   629  type IngestOperationStats struct {
   630  	// Bytes is the total bytes in the ingested sstables.
   631  	Bytes uint64
   632  	// ApproxIngestedIntoL0Bytes is the approximate number of bytes ingested
   633  	// into L0.
   634  	// Currently, this value is completely accurate, but we are allowing this to
   635  	// be approximate once https://github.com/zuoyebang/bitalostable/issues/25 is
   636  	// implemented.
   637  	ApproxIngestedIntoL0Bytes uint64
   638  }
   639  
   640  // IngestWithStats does the same as Ingest, and additionally returns
   641  // IngestOperationStats.
   642  func (d *DB) IngestWithStats(paths []string) (IngestOperationStats, error) {
   643  	if err := d.closed.Load(); err != nil {
   644  		panic(err)
   645  	}
   646  	if d.opts.ReadOnly {
   647  		return IngestOperationStats{}, ErrReadOnly
   648  	}
   649  	return d.ingest(paths, ingestTargetLevel)
   650  }
   651  
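        // ingest implements Ingest and IngestWithStats. It allocates file numbers,
        // loads and verifies the sstable metadata, and links the files into the DB
        // directory, then uses the commit pipeline to allocate sequence numbers:
        // prepare forces a flush of any overlapping memtable, and apply waits for
        // that flush, stamps the sequence numbers, and applies a version edit that
        // places each table at its target level.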
   652  func (d *DB) ingest(
   653  	paths []string, targetLevelFunc ingestTargetLevelFunc,
   654  ) (IngestOperationStats, error) {
   655  	// Allocate file numbers for all of the files being ingested and mark them as
   656  	// pending in order to prevent them from being deleted. Note that this causes
   657  	// the file number ordering to be out of alignment with sequence number
   658  	// ordering. The sorting of L0 tables by sequence number avoids relying on
   659  	// that (busted) invariant.
   660  	d.mu.Lock()
   661  	pendingOutputs := make([]FileNum, len(paths))
   662  	for i := range paths {
   663  		pendingOutputs[i] = d.mu.versions.getNextFileNum()
   664  	}
   665  	jobID := d.mu.nextJobID
   666  	d.mu.nextJobID++
   667  	d.mu.Unlock()
   668  
   669  	// Load the metadata for all of the files being ingested. This step detects
   670  	// and elides empty sstables.
   671  	meta, paths, err := ingestLoad(d.opts, d.FormatMajorVersion(), paths, d.cacheID, pendingOutputs)
   672  	if err != nil {
   673  		return IngestOperationStats{}, err
   674  	}
   675  	if len(meta) == 0 {
   676  		// All of the sstables to be ingested were empty. Nothing to do.
   677  		return IngestOperationStats{}, nil
   678  	}
   679  
   680  	// Verify the sstables do not overlap.
   681  	if err := ingestSortAndVerify(d.cmp, meta, paths); err != nil {
   682  		return IngestOperationStats{}, err
   683  	}
   684  
   685  	// Hard link the sstables into the DB directory. Since the sstables aren't
   686  	// referenced by a version, they won't be used. If the hard linking fails
   687  	// (e.g. because the files reside on a different filesystem), ingestLink will
   688  	// fall back to copying, and if that fails we undo our work and return an
   689  	// error.
   690  	if err := ingestLink(jobID, d.opts, d.dirname, paths, meta); err != nil {
   691  		return IngestOperationStats{}, err
   692  	}
   693  	// Fsync the directory we added the tables to. We need to do this at some
   694  	// point before we update the MANIFEST (via logAndApply), otherwise a crash
   695  	// can have the tables referenced in the MANIFEST, but not present in the
   696  	// directory.
   697  	if err := d.dataDir.Sync(); err != nil {
   698  		return IngestOperationStats{}, err
   699  	}
   700  
   701  	var mem *flushableEntry
   702  	prepare := func() {
   703  		// Note that d.commit.mu is held by commitPipeline when calling prepare.
   704  
   705  		d.mu.Lock()
   706  		defer d.mu.Unlock()
   707  
   708  		// Check to see if any files overlap with any of the memtables. The queue
   709  		// is ordered from oldest to newest with the mutable memtable being the
   710  		// last element in the slice. We want to wait for the newest table that
   711  		// overlaps.
   712  		for i := len(d.mu.mem.queue) - 1; i >= 0; i-- {
   713  			m := d.mu.mem.queue[i]
   714  			if ingestMemtableOverlaps(d.cmp, m, meta) {
   715  				mem = m
   716  				if mem.flushable == d.mu.mem.mutable {
   717  					err = d.makeRoomForWrite(nil, true)
   718  				}
   719  				mem.flushForced = true
   720  				d.maybeScheduleFlush(true)
   721  				return
   722  			}
   723  		}
   724  	}
   725  
   726  	var ve *versionEdit
   727  	apply := func(seqNum uint64) {
   728  		if err != nil {
   729  			// An error occurred during prepare.
   730  			return
   731  		}
   732  
   733  		// Update the sequence number for all of the sstables in the
   734  		// metadata. Writing the metadata to the manifest when the
   735  		// version edit is applied is the mechanism that persists the
   736  		// sequence number. The sstables themselves are left unmodified.
   737  		if err = ingestUpdateSeqNum(
   738  			d.cmp, d.opts.Comparer.FormatKey, seqNum, meta,
   739  		); err != nil {
   740  			return
   741  		}
   742  
   743  		// If we overlapped with a memtable in prepare, wait for the flush to
   744  		// finish.
   745  		if mem != nil {
   746  			<-mem.flushed
   747  		}
   748  
   749  		// Assign the sstables to the correct level in the LSM and apply the
   750  		// version edit.
   751  		ve, err = d.ingestApply(jobID, meta, targetLevelFunc)
   752  	}
   753  
   754  	d.commit.AllocateSeqNum(len(meta), prepare, apply)
   755  
   756  	if err != nil {
   757  		if err2 := ingestCleanup(d.opts.FS, d.dirname, meta); err2 != nil {
   758  			d.opts.Logger.Infof("ingest cleanup failed: %v", err2)
   759  		}
   760  	} else {
   761  		for _, path := range paths {
   762  			if err2 := d.opts.FS.Remove(path); err2 != nil {
   763  				d.opts.Logger.Infof("ingest failed to remove original file: %s", err2)
   764  			}
   765  		}
   766  	}
   767  
   768  	info := TableIngestInfo{
   769  		JobID:        jobID,
   770  		GlobalSeqNum: meta[0].SmallestSeqNum,
   771  		Err:          err,
   772  	}
   773  	var stats IngestOperationStats
   774  	if ve != nil {
   775  		info.Tables = make([]struct {
   776  			TableInfo
   777  			Level int
   778  		}, len(ve.NewFiles))
   779  		for i := range ve.NewFiles {
   780  			e := &ve.NewFiles[i]
   781  			info.Tables[i].Level = e.Level
   782  			info.Tables[i].TableInfo = e.Meta.TableInfo()
   783  			stats.Bytes += e.Meta.Size
   784  			if e.Level == 0 {
   785  				stats.ApproxIngestedIntoL0Bytes += e.Meta.Size
   786  			}
   787  		}
   788  	}
   789  	d.opts.EventListener.TableIngested(info)
   790  
   791  	return stats, err
   792  }
   793  
   794  type ingestTargetLevelFunc func(
   795  	newIters tableNewIters,
   796  	iterOps IterOptions,
   797  	cmp Compare,
   798  	v *version,
   799  	baseLevel int,
   800  	compactions map[*compaction]struct{},
   801  	meta *fileMetadata,
   802  ) (int, error)
   803  
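        // ingestApply determines the target level for each ingested table under
        // the manifest lock and applies the resulting version edit, updating level
        // metrics, the read state and table stats, and scheduling any compaction
        // or validation work the ingestion may have triggered.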
   804  func (d *DB) ingestApply(
   805  	jobID int, meta []*fileMetadata, findTargetLevel ingestTargetLevelFunc,
   806  ) (*versionEdit, error) {
   807  	d.mu.Lock()
   808  	defer d.mu.Unlock()
   809  
   810  	ve := &versionEdit{
   811  		NewFiles: make([]newFileEntry, len(meta)),
   812  	}
   813  	metrics := make(map[int]*LevelMetrics)
   814  
   815  	// Lock the manifest for writing before we use the current version to
   816  	// determine the target level. This prevents two concurrent ingestion jobs
   817  	// from using the same version to determine the target level, and also
   818  	// provides serialization with concurrent compaction and flush jobs.
   819  	// logAndApply unconditionally releases the manifest lock, but any earlier
   820  	// returns must unlock the manifest.
   821  	d.mu.versions.logLock()
   822  	current := d.mu.versions.currentVersion()
   823  	baseLevel := d.mu.versions.picker.getBaseLevel()
   824  	iterOps := IterOptions{logger: d.opts.Logger}
   825  	for i := range meta {
   826  		// Determine the lowest level in the LSM for which the sstable doesn't
   827  		// overlap any existing files in the level.
   828  		m := meta[i]
   829  		f := &ve.NewFiles[i]
   830  		var err error
   831  		f.Level, err = findTargetLevel(d.newIters, iterOps, d.cmp, current, baseLevel, d.mu.compact.inProgress, m)
   832  		if err != nil {
   833  			d.mu.versions.logUnlock()
   834  			return nil, err
   835  		}
   836  		f.Meta = m
   837  		levelMetrics := metrics[f.Level]
   838  		if levelMetrics == nil {
   839  			levelMetrics = &LevelMetrics{}
   840  			metrics[f.Level] = levelMetrics
   841  		}
   842  		levelMetrics.NumFiles++
   843  		levelMetrics.Size += int64(m.Size)
   844  		levelMetrics.BytesIngested += m.Size
   845  		levelMetrics.TablesIngested++
   846  	}
   847  	if err := d.mu.versions.logAndApply(jobID, ve, metrics, false /* forceRotation */, func() []compactionInfo {
   848  		return d.getInProgressCompactionInfoLocked(nil)
   849  	}); err != nil {
   850  		return nil, err
   851  	}
   852  	d.updateReadStateLocked(d.opts.DebugCheck)
   853  	d.updateTableStatsLocked(ve.NewFiles)
   854  	d.deleteObsoleteFiles(jobID, false /* waitForOngoing */)
   855  	// The ingestion may have pushed a level over the threshold for compaction,
   856  	// so check to see if one is necessary and schedule it.
   857  	d.maybeScheduleCompaction()
   858  	d.maybeValidateSSTablesLocked(ve.NewFiles)
   859  	return ve, nil
   860  }
   861  
   862  // maybeValidateSSTablesLocked adds the given newFileEntry slice to the pending
   863  // queue of files to be validated, when the feature is enabled.
   864  // DB.mu must be locked when calling.
   865  func (d *DB) maybeValidateSSTablesLocked(newFiles []newFileEntry) {
   866  	// Only add to the validation queue when the feature is enabled.
   867  	if !d.opts.Experimental.ValidateOnIngest {
   868  		return
   869  	}
   870  
   871  	d.mu.tableValidation.pending = append(d.mu.tableValidation.pending, newFiles...)
   872  	if d.shouldValidateSSTablesLocked() {
   873  		go d.validateSSTables()
   874  	}
   875  }
   876  
   877  // shouldValidateSSTablesLocked returns true if SSTable validation should run.
   878  // DB.mu must be locked when calling.
   879  func (d *DB) shouldValidateSSTablesLocked() bool {
   880  	return !d.mu.tableValidation.validating &&
   881  		d.closed.Load() == nil &&
   882  		d.opts.Experimental.ValidateOnIngest &&
   883  		len(d.mu.tableValidation.pending) > 0
   884  }
   885  
   886  // validateSSTables runs a round of validation on the tables in the pending
   887  // queue.
   888  func (d *DB) validateSSTables() {
   889  	d.mu.Lock()
   890  	if !d.shouldValidateSSTablesLocked() {
   891  		d.mu.Unlock()
   892  		return
   893  	}
   894  
   895  	pending := d.mu.tableValidation.pending
   896  	d.mu.tableValidation.pending = nil
   897  	d.mu.tableValidation.validating = true
   898  	jobID := d.mu.nextJobID
   899  	d.mu.nextJobID++
   900  	rs := d.loadReadState()
   901  
   902  	// Drop DB.mu before performing IO.
   903  	d.mu.Unlock()
   904  
   905  	// Validate all tables in the pending queue. This could lead to a situation
   906  	// where we are starving IO from other tasks due to having to page through
   907  	// all the blocks in all the sstables in the queue.
   908  	// TODO(travers): Add some form of pacing to avoid IO starvation.
   909  	for _, f := range pending {
   910  		// The file may have been moved or deleted since it was ingested, in
   911  		// which case we skip.
   912  		if !rs.current.Contains(f.Level, d.cmp, f.Meta) {
   913  			// Assume the file was moved to a lower level. It is rare enough
   914  			// that a table is moved or deleted between the time it was ingested
   915  			// and the time the validation routine runs that the overall cost of
   916  			// this inner loop is tolerably low, when amortized over all
   917  			// ingested tables.
   918  			found := false
   919  			for i := f.Level + 1; i < numLevels; i++ {
   920  				if rs.current.Contains(i, d.cmp, f.Meta) {
   921  					found = true
   922  					break
   923  				}
   924  			}
   925  			if !found {
   926  				continue
   927  			}
   928  		}
   929  
   930  		err := d.tableCache.withReader(f.Meta, func(r *sstable.Reader) error {
   931  			return r.ValidateBlockChecksums()
   932  		})
   933  		if err != nil {
   934  			// TODO(travers): Hook into the corruption reporting pipeline, once
   935  			// available. See bitalostable#1192.
   936  			d.opts.Logger.Fatalf("bitalostable: encountered corruption during ingestion: %s", err)
   937  		}
   938  
   939  		d.opts.EventListener.TableValidated(TableValidatedInfo{
   940  			JobID: jobID,
   941  			Meta:  f.Meta,
   942  		})
   943  	}
   944  	rs.unref()
   945  
   946  	d.mu.Lock()
   947  	defer d.mu.Unlock()
   948  	d.mu.tableValidation.validating = false
   949  	d.mu.tableValidation.cond.Broadcast()
   950  	if d.shouldValidateSSTablesLocked() {
   951  		go d.validateSSTables()
   952  	}
   953  }