github.com/cockroachdb/pebble@v1.1.5/checkpoint.go

// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
	"io"
	"os"

	"github.com/cockroachdb/errors/oserror"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/record"
	"github.com/cockroachdb/pebble/vfs"
	"github.com/cockroachdb/pebble/vfs/atomicfs"
)

// checkpointOptions holds the optional parameters to construct checkpoint
// snapshots.
type checkpointOptions struct {
	// flushWAL, if true, forces a flush and sync of the WAL prior to
	// checkpointing.
	flushWAL bool

	// If set, any SSTs that don't overlap with these spans are excluded from the checkpoint.
	restrictToSpans []CheckpointSpan
}

// CheckpointOption sets optional parameters used by `DB.Checkpoint`.
type CheckpointOption func(*checkpointOptions)

// WithFlushedWAL enables flushing and syncing the WAL prior to constructing a
// checkpoint. This guarantees that any writes committed before calling
// DB.Checkpoint will be part of that checkpoint.
//
// Note that this setting is only useful when some writes are performed with
// Sync = false; otherwise, the guarantee is already met.
//
// Passing this option is functionally equivalent to calling
// DB.LogData(nil, Sync) right before DB.Checkpoint.
func WithFlushedWAL() CheckpointOption {
	return func(opt *checkpointOptions) {
		opt.flushWAL = true
	}
}
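
// Illustrative sketch, not part of the Pebble API (the destination path is
// hypothetical): taking a checkpoint that includes every write committed
// before the call, even writes committed with Sync = false.
func exampleCheckpointWithFlushedWAL(d *DB) error {
	// WithFlushedWAL forces a flush and sync of the WAL, so committed but
	// unsynced writes are captured by the checkpoint.
	return d.Checkpoint("/tmp/pebble-checkpoint", WithFlushedWAL())
}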

// WithRestrictToSpans specifies spans of interest for the checkpoint. Any SSTs
// that don't overlap with any of these spans are excluded from the checkpoint.
//
// Note that the checkpoint can still surface keys outside of these spans (from
// the WAL and from SSTs that partially overlap with these spans). Moreover,
// the surfaced keys aren't necessarily "valid": they may have since been
// modified, with the SST containing the modification excluded from the
// checkpoint.
func WithRestrictToSpans(spans []CheckpointSpan) CheckpointOption {
	return func(opt *checkpointOptions) {
		opt.restrictToSpans = spans
	}
}

// CheckpointSpan is a key range [Start, End) (inclusive on Start, exclusive on
// End) of interest for a checkpoint.
type CheckpointSpan struct {
	Start []byte
	End   []byte
}
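
// Illustrative sketch, not part of the Pebble API (span bounds and destination
// path are hypothetical): restricting a checkpoint to a single key span. SSTs
// that do not overlap ["a", "m") are excluded, though keys outside the span
// can still surface, as described above.
func exampleCheckpointRestrictedToSpan(d *DB) error {
	spans := []CheckpointSpan{{Start: []byte("a"), End: []byte("m")}}
	return d.Checkpoint("/tmp/pebble-partial-checkpoint", WithRestrictToSpans(spans))
}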

// excludeFromCheckpoint returns true if an SST file should be excluded from the
// checkpoint because it does not overlap with the spans of interest
// (opt.restrictToSpans).
func excludeFromCheckpoint(f *fileMetadata, opt *checkpointOptions, cmp Compare) bool {
	if len(opt.restrictToSpans) == 0 {
		// Option not set; don't exclude anything.
		return false
	}
	for _, s := range opt.restrictToSpans {
		if f.Overlaps(cmp, s.Start, s.End, true /* exclusiveEnd */) {
			return false
		}
	}
	// None of the restrictToSpans overlapped; we can exclude this file.
	return true
}

// mkdirAllAndSyncParents creates destDir and any of its missing parents.
// Those missing parents, as well as the closest existing ancestor, are synced.
// Returns a handle to the directory created at destDir.
func mkdirAllAndSyncParents(fs vfs.FS, destDir string) (vfs.File, error) {
	// Collect paths for all directories between destDir (excluded) and its
	// closest existing ancestor (included).
	var parentPaths []string
	for parentPath := fs.PathDir(destDir); ; parentPath = fs.PathDir(parentPath) {
		parentPaths = append(parentPaths, parentPath)
		if fs.PathDir(parentPath) == parentPath {
			break
		}
		_, err := fs.Stat(parentPath)
		if err == nil {
			// Exit loop at the closest existing ancestor.
			break
		}
		if !oserror.IsNotExist(err) {
			return nil, err
		}
	}
	// Create destDir and any of its missing parents.
	if err := fs.MkdirAll(destDir, 0755); err != nil {
		return nil, err
	}
	// Sync all the parent directories, up to and including the closest
	// existing ancestor.
	for _, parentPath := range parentPaths {
		parentDir, err := fs.OpenDir(parentPath)
		if err != nil {
			return nil, err
		}
		err = parentDir.Sync()
		if err != nil {
			_ = parentDir.Close()
			return nil, err
		}
		err = parentDir.Close()
		if err != nil {
			return nil, err
		}
	}
	return fs.OpenDir(destDir)
}
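
// Illustrative sketch, not part of the package (filesystem and path are
// hypothetical): the intended calling pattern for mkdirAllAndSyncParents. The
// caller creates the (possibly nested) destination directory, populates it,
// syncs the returned handle (as DB.Checkpoint does below), and closes it.
func exampleMkdirAllAndSyncParents() error {
	dir, err := mkdirAllAndSyncParents(vfs.Default, "/mnt/backups/pebble/2024-01-01")
	if err != nil {
		return err
	}
	// A real caller would populate the directory and call dir.Sync() before
	// closing; this sketch just releases the handle.
	return dir.Close()
}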

// Checkpoint constructs a snapshot of the DB instance in the specified
// directory. The WAL, MANIFEST, OPTIONS, and sstables will be copied into the
// snapshot. Hard links will be used when possible. Beware of the significant
// space overhead for a checkpoint if hard links are disabled. Also beware that
// even if hard links are used, the space overhead for the checkpoint will
// increase over time as the DB performs compactions.
func (d *DB) Checkpoint(
	destDir string, opts ...CheckpointOption,
) (
	ckErr error, /* used in deferred cleanup */
) {
	opt := &checkpointOptions{}
	for _, fn := range opts {
		fn(opt)
	}

	if _, err := d.opts.FS.Stat(destDir); !oserror.IsNotExist(err) {
		if err == nil {
			return &os.PathError{
				Op:   "checkpoint",
				Path: destDir,
				Err:  oserror.ErrExist,
			}
		}
		return err
	}

	if opt.flushWAL && !d.opts.DisableWAL {
		// Write an empty log-data record to flush and sync the WAL.
		if err := d.LogData(nil /* data */, Sync); err != nil {
			return err
		}
	}

	// Disable file deletions.
	d.mu.Lock()
	d.disableFileDeletions()
	defer func() {
		d.mu.Lock()
		defer d.mu.Unlock()
		d.enableFileDeletions()
	}()

	// TODO(peter): RocksDB provides the option to roll the manifest if the
	// MANIFEST size is too large. Should we do this too?

	// Lock the manifest before getting the current version. We need the
	// length of the manifest that we read to match the current version that
	// we read, otherwise we might copy a versionEdit not reflected in the
	// sstables we copy/link.
	d.mu.versions.logLock()
	// Get the unflushed log files, the current version, and the current manifest
	// file number.
	memQueue := d.mu.mem.queue
	current := d.mu.versions.currentVersion()
	formatVers := d.FormatMajorVersion()
	manifestFileNum := d.mu.versions.manifestFileNum
	manifestSize := d.mu.versions.manifest.Size()
	optionsFileNum := d.optionsFileNum
	virtualBackingFiles := make(map[base.DiskFileNum]struct{})
	for diskFileNum := range d.mu.versions.backingState.fileBackingMap {
		virtualBackingFiles[diskFileNum] = struct{}{}
	}
	// Release the manifest and DB.mu so we don't block other operations on
	// the database.
	d.mu.versions.logUnlock()
	d.mu.Unlock()
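
	// From this point on, the checkpoint is built from the state captured
	// above (memQueue, current, manifestFileNum, manifestSize, optionsFileNum,
	// virtualBackingFiles). Because file deletions were disabled earlier, the
	// sstables and MANIFEST referenced by that state remain on disk while they
	// are linked or copied below, even though the DB keeps serving writes and
	// running compactions.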

	// Wrap the normal filesystem with one which wraps newly created files with
	// vfs.NewSyncingFile.
	fs := vfs.NewSyncingFS(d.opts.FS, vfs.SyncingFileOptions{
		NoSyncOnClose: d.opts.NoSyncOnClose,
		BytesPerSync:  d.opts.BytesPerSync,
	})

	// Create the dir and its parents (if necessary), and sync them.
	var dir vfs.File
	defer func() {
		if dir != nil {
			_ = dir.Close()
		}
		if ckErr != nil {
			// Attempt to clean up on error.
			_ = fs.RemoveAll(destDir)
		}
	}()
	dir, ckErr = mkdirAllAndSyncParents(fs, destDir)
	if ckErr != nil {
		return ckErr
	}

	{
		// Link or copy the OPTIONS.
		srcPath := base.MakeFilepath(fs, d.dirname, fileTypeOptions, optionsFileNum)
		destPath := fs.PathJoin(destDir, fs.PathBase(srcPath))
		ckErr = vfs.LinkOrCopy(fs, srcPath, destPath)
		if ckErr != nil {
			return ckErr
		}
	}

	{
		// Set the format major version in the destination directory.
		var versionMarker *atomicfs.Marker
		versionMarker, _, ckErr = atomicfs.LocateMarker(fs, destDir, formatVersionMarkerName)
		if ckErr != nil {
			return ckErr
		}

		// We use the marker to encode the active format version in the
		// marker filename. Unlike other uses of the atomic marker,
		// there is no file with the filename `formatVers.String()` on
		// the filesystem.
		ckErr = versionMarker.Move(formatVers.String())
		if ckErr != nil {
			return ckErr
		}
		ckErr = versionMarker.Close()
		if ckErr != nil {
			return ckErr
		}
	}

	var excludedFiles map[deletedFileEntry]*fileMetadata
	// Set of FileBacking.DiskFileNum which will be required by virtual sstables
	// in the checkpoint.
	requiredVirtualBackingFiles := make(map[base.DiskFileNum]struct{})
	// Link or copy the sstables.
	for l := range current.Levels {
		iter := current.Levels[l].Iter()
		for f := iter.First(); f != nil; f = iter.Next() {
			if excludeFromCheckpoint(f, opt, d.cmp) {
				if excludedFiles == nil {
					excludedFiles = make(map[deletedFileEntry]*fileMetadata)
				}
				excludedFiles[deletedFileEntry{
					Level:   l,
					FileNum: f.FileNum,
				}] = f
				continue
			}

			fileBacking := f.FileBacking
			if f.Virtual {
				if _, ok := requiredVirtualBackingFiles[fileBacking.DiskFileNum]; ok {
					continue
				}
				requiredVirtualBackingFiles[fileBacking.DiskFileNum] = struct{}{}
			}

			srcPath := base.MakeFilepath(fs, d.dirname, fileTypeTable, fileBacking.DiskFileNum)
			destPath := fs.PathJoin(destDir, fs.PathBase(srcPath))
			ckErr = vfs.LinkOrCopy(fs, srcPath, destPath)
			if ckErr != nil {
				return ckErr
			}
		}
	}

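	// Virtual sstable backings that exist in the current version but are not
	// required by the checkpoint (because every virtual sstable that uses
	// them was excluded above) were not linked or copied. Record them as
	// removed so the checkpoint's MANIFEST does not reference backing files
	// that are absent from the checkpoint directory.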
	var removeBackingTables []base.DiskFileNum
	for diskFileNum := range virtualBackingFiles {
		if _, ok := requiredVirtualBackingFiles[diskFileNum]; !ok {
			// The backing sstable associated with diskFileNum is no longer
			// required.
			removeBackingTables = append(removeBackingTables, diskFileNum)
		}
	}

	ckErr = d.writeCheckpointManifest(
		fs, formatVers, destDir, dir, manifestFileNum.DiskFileNum(), manifestSize,
		excludedFiles, removeBackingTables,
	)
	if ckErr != nil {
		return ckErr
	}

	// Copy the WAL files. We copy rather than link because WAL file recycling
	// will cause the WAL files to be reused, which would invalidate the
	// checkpoint.
	for i := range memQueue {
		logNum := memQueue[i].logNum
		if logNum == 0 {
			continue
		}
		srcPath := base.MakeFilepath(fs, d.walDirname, fileTypeLog, logNum.DiskFileNum())
		destPath := fs.PathJoin(destDir, fs.PathBase(srcPath))
		ckErr = vfs.Copy(fs, srcPath, destPath)
		if ckErr != nil {
			return ckErr
		}
	}

	// Sync and close the checkpoint directory.
	ckErr = dir.Sync()
	if ckErr != nil {
		return ckErr
	}
	ckErr = dir.Close()
	dir = nil
	return ckErr
}
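
// Illustrative sketch, not part of the Pebble API (the path is hypothetical):
// a checkpoint directory is laid out like a regular Pebble database, so once
// Checkpoint has returned it can be opened independently, for example
// read-only.
func exampleOpenCheckpoint() (*DB, error) {
	return Open("/tmp/pebble-checkpoint", &Options{ReadOnly: true})
}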

func (d *DB) writeCheckpointManifest(
	fs vfs.FS,
	formatVers FormatMajorVersion,
	destDirPath string,
	destDir vfs.File,
	manifestFileNum base.DiskFileNum,
	manifestSize int64,
	excludedFiles map[deletedFileEntry]*fileMetadata,
	removeBackingTables []base.DiskFileNum,
) error {
	// Copy the MANIFEST, and create a pointer to it. We copy rather
	// than link because additional version edits added to the
	// MANIFEST after we took our snapshot of the sstables will
	// reference sstables that aren't in our checkpoint. For a
	// similar reason, we need to limit how much of the MANIFEST we
	// copy.
	// If some files are excluded from the checkpoint, also append a block that
	// records those files as deleted.
	if err := func() error {
		srcPath := base.MakeFilepath(fs, d.dirname, fileTypeManifest, manifestFileNum)
		destPath := fs.PathJoin(destDirPath, fs.PathBase(srcPath))
		src, err := fs.Open(srcPath, vfs.SequentialReadsOption)
		if err != nil {
			return err
		}
		defer src.Close()

		dst, err := fs.Create(destPath)
		if err != nil {
			return err
		}
		defer dst.Close()

		// Copy all existing records. We need to copy at the record level in case we
		// need to append another record with the excluded files (we cannot simply
		// append a record after a raw data copy; see
		// https://github.com/cockroachdb/cockroach/issues/100935).
		r := record.NewReader(&io.LimitedReader{R: src, N: manifestSize}, manifestFileNum.FileNum())
		w := record.NewWriter(dst)
		for {
			rr, err := r.Next()
			if err != nil {
				if err == io.EOF {
					break
				}
				return err
			}

			rw, err := w.Next()
			if err != nil {
				return err
			}
			if _, err := io.Copy(rw, rr); err != nil {
				return err
			}
		}

		if len(excludedFiles) > 0 {
			// Write out an additional VersionEdit that deletes the excluded SST files.
			ve := versionEdit{
				DeletedFiles:         excludedFiles,
				RemovedBackingTables: removeBackingTables,
			}

			rw, err := w.Next()
			if err != nil {
				return err
			}
			if err := ve.Encode(rw); err != nil {
				return err
			}
		}
		if err := w.Close(); err != nil {
			return err
		}
		return dst.Sync()
	}(); err != nil {
		return err
	}

	// Recent format versions use an atomic marker for setting the
	// active manifest. Older versions use the CURRENT file. The
	// setCurrentFunc function will return a closure that will
	// take the appropriate action for the database's format
	// version.
	var manifestMarker *atomicfs.Marker
	manifestMarker, _, err := atomicfs.LocateMarker(fs, destDirPath, manifestMarkerName)
	if err != nil {
		return err
	}
	if err := setCurrentFunc(formatVers, manifestMarker, fs, destDirPath, destDir)(manifestFileNum.FileNum()); err != nil {
		return err
	}
	return manifestMarker.Close()
}