github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/ingest.go (about)

     1  // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package pebble
     6  
     7  import (
     8  	"fmt"
     9  	"sort"
    10  
    11  	"github.com/petermattis/pebble/internal/base"
    12  	"github.com/petermattis/pebble/sstable"
    13  	"github.com/petermattis/pebble/vfs"
    14  )
    15  
    16  func sstableKeyCompare(userCmp Compare, a, b InternalKey) int {
    17  	c := userCmp(a.UserKey, b.UserKey)
    18  	if c != 0 {
    19  		return c
    20  	}
    21  	if a.Trailer == InternalKeyRangeDeleteSentinel {
    22  		if b.Trailer != InternalKeyRangeDeleteSentinel {
    23  			return -1
    24  		}
    25  	} else if b.Trailer == InternalKeyRangeDeleteSentinel {
    26  		return 1
    27  	}
    28  	return 0
    29  }
    30  
    31  func ingestLoad1(opts *Options, path string, dbNum, fileNum uint64) (*fileMetadata, error) {
    32  	stat, err := opts.FS.Stat(path)
    33  	if err != nil {
    34  		return nil, err
    35  	}
    36  
    37  	f, err := opts.FS.Open(path)
    38  	if err != nil {
    39  		return nil, err
    40  	}
    41  
    42  	r, err := sstable.NewReader(f, dbNum, fileNum, opts)
    43  	defer r.Close()
    44  	if err != nil {
    45  		return nil, err
    46  	}
    47  
    48  	meta := &fileMetadata{}
    49  	meta.FileNum = fileNum
    50  	meta.Size = uint64(stat.Size())
    51  	meta.Smallest = InternalKey{}
    52  	meta.Largest = InternalKey{}
    53  	smallestSet, largestSet := false, false
    54  
    55  	{
    56  		iter := r.NewIter(nil /* lower */, nil /* upper */)
    57  		defer iter.Close()
    58  		if key, _ := iter.First(); key != nil {
    59  			meta.Smallest = key.Clone()
    60  			smallestSet = true
    61  		}
    62  		if key, _ := iter.Last(); key != nil {
    63  			meta.Largest = key.Clone()
    64  			largestSet = true
    65  		}
    66  		if err := iter.Error(); err != nil {
    67  			return nil, err
    68  		}
    69  	}
    70  
    71  	if iter := r.NewRangeDelIter(); iter != nil {
    72  		defer iter.Close()
    73  		if key, _ := iter.First(); key != nil {
    74  			if !smallestSet ||
    75  				base.InternalCompare(opts.Comparer.Compare, meta.Smallest, *key) > 0 {
    76  				meta.Smallest = key.Clone()
    77  			}
    78  		}
    79  		if key, val := iter.Last(); key != nil {
    80  			end := base.MakeRangeDeleteSentinelKey(val)
    81  			if !largestSet ||
    82  				base.InternalCompare(opts.Comparer.Compare, meta.Largest, end) < 0 {
    83  				meta.Largest = end.Clone()
    84  			}
    85  		}
    86  	}
    87  
    88  	return meta, nil
    89  }
    90  
    91  func ingestLoad(
    92  	opts *Options, paths []string, dbNum uint64, pending []uint64,
    93  ) ([]*fileMetadata, error) {
    94  	meta := make([]*fileMetadata, len(paths))
    95  	for i := range paths {
    96  		var err error
    97  		meta[i], err = ingestLoad1(opts, paths[i], dbNum, pending[i])
    98  		if err != nil {
    99  			return nil, err
   100  		}
   101  	}
   102  	return meta, nil
   103  }
   104  
   105  func ingestSortAndVerify(cmp Compare, meta []*fileMetadata) error {
   106  	if len(meta) <= 1 {
   107  		return nil
   108  	}
   109  
   110  	sort.Slice(meta, func(i, j int) bool {
   111  		return cmp(meta[i].Smallest.UserKey, meta[j].Smallest.UserKey) < 0
   112  	})
   113  
   114  	for i := 1; i < len(meta); i++ {
   115  		if sstableKeyCompare(cmp, meta[i-1].Largest, meta[i].Smallest) >= 0 {
   116  			return fmt.Errorf("files have overlapping ranges")
   117  		}
   118  	}
   119  	return nil
   120  }
   121  
   122  func ingestCleanup(fs vfs.FS, dirname string, meta []*fileMetadata) error {
   123  	var firstErr error
   124  	for i := range meta {
   125  		target := base.MakeFilename(dirname, fileTypeTable, meta[i].FileNum)
   126  		if err := fs.Remove(target); err != nil {
   127  			if firstErr != nil {
   128  				firstErr = err
   129  			}
   130  		}
   131  	}
   132  	return firstErr
   133  }
   134  
   135  func ingestLink(opts *Options, dirname string, paths []string, meta []*fileMetadata) error {
   136  	for i := range paths {
   137  		target := base.MakeFilename(dirname, fileTypeTable, meta[i].FileNum)
   138  		err := opts.FS.Link(paths[i], target)
   139  		if err != nil {
   140  			if err2 := ingestCleanup(opts.FS, dirname, meta[:i]); err2 != nil {
   141  				opts.Logger.Infof("ingest cleanup failed: %v", err2)
   142  			}
   143  			return err
   144  		}
   145  	}
   146  
   147  	return nil
   148  }
   149  
   150  func ingestMemtableOverlaps(cmp Compare, mem flushable, meta []*fileMetadata) bool {
   151  	{
   152  		// Check overlap with point operations.
   153  		iter := mem.newIter(nil)
   154  		defer iter.Close()
   155  
   156  		for _, m := range meta {
   157  			key, _ := iter.SeekGE(m.Smallest.UserKey)
   158  			if key == nil {
   159  				continue
   160  			}
   161  			if cmp(key.UserKey, m.Largest.UserKey) <= 0 {
   162  				return true
   163  			}
   164  		}
   165  	}
   166  
   167  	// Check overlap with range deletions.
   168  	if iter := mem.newRangeDelIter(nil); iter != nil {
   169  		defer iter.Close()
   170  		for _, m := range meta {
   171  			key, val := iter.SeekLT(m.Smallest.UserKey)
   172  			if key == nil {
   173  				key, val = iter.Next()
   174  			}
   175  			for ; key != nil; key, val = iter.Next() {
   176  				if cmp(key.UserKey, m.Largest.UserKey) > 0 {
   177  					// The start of the tombstone is after the largest key in the
   178  					// ingested table.
   179  					break
   180  				}
   181  				if cmp(val, m.Smallest.UserKey) > 0 {
   182  					// The end of the tombstone is greater than the smallest in the
   183  					// table. Note that the tombstone end key is exclusive, thus ">0"
   184  					// instead of ">=0".
   185  					return true
   186  				}
   187  			}
   188  		}
   189  	}
   190  
   191  	return false
   192  }
   193  
   194  func ingestUpdateSeqNum(opts *Options, dirname string, seqNum uint64, meta []*fileMetadata) error {
   195  	for _, m := range meta {
   196  		m.Smallest = base.MakeInternalKey(m.Smallest.UserKey, seqNum, m.Smallest.Kind())
   197  		m.Largest = base.MakeInternalKey(m.Largest.UserKey, seqNum, m.Largest.Kind())
   198  		// Setting smallestSeqNum == largestSeqNum triggers the setting of
   199  		// Properties.GlobalSeqNum when an sstable is loaded.
   200  		m.SmallestSeqNum = seqNum
   201  		m.LargestSeqNum = seqNum
   202  		seqNum++
   203  
   204  		// TODO(peter): Update the global sequence number property. This is only
   205  		// necessary for compatibility with RocksDB.
   206  	}
   207  	return nil
   208  }
   209  
   210  func ingestTargetLevel(cmp Compare, v *version, meta *fileMetadata) int {
   211  	// Find the lowest level which does not have any files which overlap meta.
   212  	if len(v.Overlaps(0, cmp, meta.Smallest.UserKey, meta.Largest.UserKey)) != 0 {
   213  		return 0
   214  	}
   215  
   216  	level := 1
   217  	for ; level < numLevels; level++ {
   218  		if len(v.Overlaps(level, cmp, meta.Smallest.UserKey, meta.Largest.UserKey)) != 0 {
   219  			break
   220  		}
   221  	}
   222  	return level - 1
   223  }
   224  
   225  // Ingest ingests a set of sstables into the DB. Ingestion of the files is
   226  // atomic and semantically equivalent to creating a single batch containing all
   227  // of the mutations in the sstables. Ingestion may require the memtable to be
   228  // flushed. The ingested sstable files are moved into the DB and must reside on
   229  // the same filesystem as the DB. Sstables can be created for ingestion using
   230  // sstable.Writer.
   231  //
   232  // Ingestion loads each sstable into the lowest level of the LSM which it
   233  // doesn't overlap (see ingestTargetLevel). If an sstable overlaps a memtable,
   234  // ingestion forces the memtable to flush, and then waits for the flush to
   235  // occur.
   236  //
   237  // The steps for ingestion are:
   238  //
   239  //   1. Allocate file numbers for every sstable beign ingested.
   240  //   2. Load the metadata for all sstables being ingest.
   241  //   3. Sort the sstables by smallest key, verifying non overlap.
   242  //   4. Hard link the sstables into the DB directory.
   243  //   5. Allocate a sequence number to use for all of the entries in the
   244  //      sstables. This is the step where overlap with memtables is
   245  //      determined. If there is overlap, we remember the most recent memtable
   246  //      that overlaps.
   247  //   6. Update the sequence number in the ingested sstables.
   248  //   7. Wait for the most recent memtable that overlaps to flush (if any).
   249  //   8. Add the ingested sstables to the version (DB.ingestApply).
   250  //   9. Publish the ingestion sequence number.
   251  //
   252  // Note that if the mutable memtable overlaps with ingestion, a flush of the
   253  // memtable is forced equivalent to DB.Flush. Additionally, subsequent
   254  // mutations that get sequence numbers larger than the ingestion sequence
   255  // number get queued up behind the ingestion waiting for it to complete. This
   256  // can produce a noticeable hiccup in performance. See
   257  // https://github.com/petermattis/pebble/issues/25 for an idea for how to fix
   258  // this hiccup.
   259  func (d *DB) Ingest(paths []string) error {
   260  	// Allocate file numbers for all of the files being ingested and mark them as
   261  	// pending in order to prevent them from being deleted. Note that this causes
   262  	// the file number ordering to be out of alignment with sequence number
   263  	// ordering. The sorting of L0 tables by sequence number avoids relying on
   264  	// that (busted) invariant.
   265  	d.mu.Lock()
   266  	pendingOutputs := make([]uint64, len(paths))
   267  	for i := range paths {
   268  		pendingOutputs[i] = d.mu.versions.getNextFileNum()
   269  	}
   270  	for _, fileNum := range pendingOutputs {
   271  		d.mu.compact.pendingOutputs[fileNum] = struct{}{}
   272  	}
   273  	jobID := d.mu.nextJobID
   274  	d.mu.nextJobID++
   275  	d.mu.Unlock()
   276  
   277  	defer func() {
   278  		d.mu.Lock()
   279  		for _, fileNum := range pendingOutputs {
   280  			delete(d.mu.compact.pendingOutputs, fileNum)
   281  		}
   282  		d.mu.Unlock()
   283  	}()
   284  
   285  	// Load the metadata for all of the files being ingested.
   286  	meta, err := ingestLoad(d.opts, paths, d.dbNum, pendingOutputs)
   287  	if err != nil {
   288  		return err
   289  	}
   290  
   291  	// Verify the sstables do not overlap.
   292  	if err := ingestSortAndVerify(d.cmp, meta); err != nil {
   293  		return err
   294  	}
   295  
   296  	// Hard link the sstables into the DB directory. Since the sstables aren't
   297  	// referenced by a version, they won't be used. If the hard linking fails
   298  	// (e.g. because the files reside on a different filesystem) we undo our work
   299  	// and return an error.
   300  	if err := ingestLink(d.opts, d.dirname, paths, meta); err != nil {
   301  		return err
   302  	}
   303  	// Fsync the directory we added the tables to. We need to do this at some
   304  	// point before we update the MANIFEST (via logAndApply), otherwise a crash
   305  	// can have the tables referenced in the MANIFEST, but not present in the
   306  	// directory.
   307  	if err := d.dataDir.Sync(); err != nil {
   308  		return err
   309  	}
   310  
   311  	var mem flushable
   312  	prepare := func() {
   313  		d.mu.Lock()
   314  		defer d.mu.Unlock()
   315  
   316  		// If the mutable memtable contains keys which overlap any of the sstables
   317  		// then flush the memtable. Note that apply will wait for the flushing to
   318  		// finish.
   319  		if ingestMemtableOverlaps(d.cmp, d.mu.mem.mutable, meta) {
   320  			mem = d.mu.mem.mutable
   321  			err = d.makeRoomForWrite(nil)
   322  			return
   323  		}
   324  
   325  		// Check to see if any files overlap with any of the immutable
   326  		// memtables. The queue is ordered from oldest to newest. We want to wait
   327  		// for the newest table that overlaps.
   328  		for i := len(d.mu.mem.queue) - 1; i >= 0; i-- {
   329  			m := d.mu.mem.queue[i]
   330  			if ingestMemtableOverlaps(d.cmp, m, meta) {
   331  				mem = m
   332  				return
   333  			}
   334  		}
   335  	}
   336  
   337  	var ve *versionEdit
   338  	apply := func(seqNum uint64) {
   339  		if err != nil {
   340  			// An error occurred during prepare.
   341  			return
   342  		}
   343  
   344  		// Update the sequence number for all of the sstables, both in the metadata
   345  		// and the global sequence number property on disk.
   346  		if err = ingestUpdateSeqNum(d.opts, d.dirname, seqNum, meta); err != nil {
   347  			return
   348  		}
   349  
   350  		// If we flushed the mutable memtable in prepare wait for the flush to
   351  		// finish.
   352  		if mem != nil {
   353  			<-mem.flushed()
   354  		}
   355  
   356  		// Assign the sstables to the correct level in the LSM and apply the
   357  		// version edit.
   358  		ve, err = d.ingestApply(jobID, meta)
   359  	}
   360  
   361  	d.commit.AllocateSeqNum(len(meta), prepare, apply)
   362  
   363  	if err != nil {
   364  		if err2 := ingestCleanup(d.opts.FS, d.dirname, meta); err2 != nil {
   365  			d.opts.Logger.Infof("ingest cleanup failed: %v", err2)
   366  		}
   367  	}
   368  
   369  	if d.opts.EventListener.TableIngested != nil {
   370  		info := TableIngestInfo{
   371  			JobID:        jobID,
   372  			GlobalSeqNum: meta[0].SmallestSeqNum,
   373  			Err:          err,
   374  		}
   375  		if ve != nil {
   376  			info.Tables = make([]struct {
   377  				TableInfo
   378  				Level int
   379  			}, len(ve.NewFiles))
   380  			for i := range ve.NewFiles {
   381  				e := &ve.NewFiles[i]
   382  				info.Tables[i].Level = e.Level
   383  				info.Tables[i].TableInfo = e.Meta.TableInfo(d.dirname)
   384  			}
   385  		}
   386  		d.opts.EventListener.TableIngested(info)
   387  	}
   388  
   389  	return err
   390  }
   391  
   392  func (d *DB) ingestApply(jobID int, meta []*fileMetadata) (*versionEdit, error) {
   393  	d.mu.Lock()
   394  	defer d.mu.Unlock()
   395  
   396  	ve := &versionEdit{
   397  		NewFiles: make([]newFileEntry, len(meta)),
   398  	}
   399  	metrics := make(map[int]*LevelMetrics)
   400  	current := d.mu.versions.currentVersion()
   401  	for i := range meta {
   402  		// Determine the lowest level in the LSM for which the sstable doesn't
   403  		// overlap any existing files in the level.
   404  		m := meta[i]
   405  		f := &ve.NewFiles[i]
   406  		f.Level = ingestTargetLevel(d.cmp, current, m)
   407  		f.Meta = *m
   408  		levelMetrics := metrics[f.Level]
   409  		if levelMetrics == nil {
   410  			levelMetrics = &LevelMetrics{}
   411  			metrics[f.Level] = levelMetrics
   412  		}
   413  		levelMetrics.BytesIngested += m.Size
   414  	}
   415  	if err := d.mu.versions.logAndApply(jobID, ve, metrics, d.dataDir); err != nil {
   416  		return nil, err
   417  	}
   418  	d.updateReadStateLocked()
   419  	return ve, nil
   420  }