github.com/cockroachdb/pebble@v1.1.5/checkpoint.go

// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
	"io"
	"os"

	"github.com/cockroachdb/errors/oserror"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/record"
	"github.com/cockroachdb/pebble/vfs"
	"github.com/cockroachdb/pebble/vfs/atomicfs"
)

// checkpointOptions holds the optional parameters to construct checkpoint
// snapshots.
type checkpointOptions struct {
	// flushWAL set to true will force a flush and sync of the WAL prior to
	// checkpointing.
	flushWAL bool

	// If set, any SSTs that don't overlap with these spans are excluded from a checkpoint.
	restrictToSpans []CheckpointSpan
}

// CheckpointOption sets optional parameters used by `DB.Checkpoint`.
type CheckpointOption func(*checkpointOptions)

// WithFlushedWAL enables flushing and syncing the WAL prior to constructing a
// checkpoint. This guarantees that any writes committed before calling
// DB.Checkpoint will be part of that checkpoint.
//
// Note that this setting can only be useful in cases when some writes are
// performed with Sync = false. Otherwise, the guarantee will already be met.
//
// Passing this option is functionally equivalent to calling
// DB.LogData(nil, Sync) right before DB.Checkpoint.
func WithFlushedWAL() CheckpointOption {
	return func(opt *checkpointOptions) {
		opt.flushWAL = true
	}
}

// WithRestrictToSpans specifies spans of interest for the checkpoint. Any SSTs
// that don't overlap with any of these spans are excluded from the checkpoint.
//
// Note that the checkpoint can still surface keys outside of these spans (from
// the WAL and from SSTs that partially overlap with these spans). Moreover,
// these surfaced keys aren't necessarily "valid" in that they could have been
// modified but the SST containing the modification is excluded.
func WithRestrictToSpans(spans []CheckpointSpan) CheckpointOption {
	return func(opt *checkpointOptions) {
		opt.restrictToSpans = spans
	}
}

// CheckpointSpan is a key range [Start, End) (inclusive on Start, exclusive on
// End) of interest for a checkpoint.
type CheckpointSpan struct {
	Start []byte
	End   []byte
}

// excludeFromCheckpoint returns true if an SST file should be excluded from the
// checkpoint because it does not overlap with the spans of interest
// (opt.restrictToSpans).
func excludeFromCheckpoint(f *fileMetadata, opt *checkpointOptions, cmp Compare) bool {
	if len(opt.restrictToSpans) == 0 {
		// Option not set; don't exclude anything.
		return false
	}
	for _, s := range opt.restrictToSpans {
		if f.Overlaps(cmp, s.Start, s.End, true /* exclusiveEnd */) {
			return false
		}
	}
	// None of the restrictToSpans overlapped; we can exclude this file.
	return true
}
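
// A minimal caller-side sketch of how the options above compose with
// DB.Checkpoint from a client package (the destination path and the span
// bounds are illustrative placeholders, not values used elsewhere in this
// package):
//
//	err := db.Checkpoint("/mnt/backups/ckpt-0001",
//		pebble.WithFlushedWAL(),
//		pebble.WithRestrictToSpans([]pebble.CheckpointSpan{
//			{Start: []byte("a"), End: []byte("n")},
//		}))
//	if err != nil {
//		// On failure, Checkpoint attempts to remove the partially built
//		// destination directory.
//	}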

// mkdirAllAndSyncParents creates destDir and any of its missing parents.
// Those missing parents, as well as the closest existing ancestor, are synced.
// Returns a handle to the directory created at destDir.
func mkdirAllAndSyncParents(fs vfs.FS, destDir string) (vfs.File, error) {
	// Collect paths for all directories between destDir (excluded) and its
	// closest existing ancestor (included).
	var parentPaths []string
	for parentPath := fs.PathDir(destDir); ; parentPath = fs.PathDir(parentPath) {
		parentPaths = append(parentPaths, parentPath)
		if fs.PathDir(parentPath) == parentPath {
			break
		}
		_, err := fs.Stat(parentPath)
		if err == nil {
			// Exit loop at the closest existing ancestor.
			break
		}
		if !oserror.IsNotExist(err) {
			return nil, err
		}
	}
	// Create destDir and any of its missing parents.
	if err := fs.MkdirAll(destDir, 0755); err != nil {
		return nil, err
	}
	// Sync all the parent directories up to the closest existing ancestor,
	// included.
	for _, parentPath := range parentPaths {
		parentDir, err := fs.OpenDir(parentPath)
		if err != nil {
			return nil, err
		}
		err = parentDir.Sync()
		if err != nil {
			_ = parentDir.Close()
			return nil, err
		}
		err = parentDir.Close()
		if err != nil {
			return nil, err
		}
	}
	return fs.OpenDir(destDir)
}
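
// For example (illustrative paths; assuming only /data already exists on the
// filesystem):
//
//	dir, err := mkdirAllAndSyncParents(fs, "/data/checkpoints/ckpt-0001")
//
// creates /data/checkpoints and /data/checkpoints/ckpt-0001, syncs
// /data/checkpoints and then /data (the closest existing ancestor), and
// returns an open handle to /data/checkpoints/ckpt-0001 itself, which the
// caller is expected to sync and close, as Checkpoint does below.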

// Checkpoint constructs a snapshot of the DB instance in the specified
// directory. The WAL, MANIFEST, OPTIONS, and sstables will be copied into the
// snapshot. Hard links will be used when possible. Beware of the significant
// space overhead for a checkpoint if hard links are disabled. Also beware that
// even if hard links are used, the space overhead for the checkpoint will
// increase over time as the DB performs compactions.
func (d *DB) Checkpoint(
	destDir string, opts ...CheckpointOption,
) (
	ckErr error, /* used in deferred cleanup */
) {
	opt := &checkpointOptions{}
	for _, fn := range opts {
		fn(opt)
	}

	if _, err := d.opts.FS.Stat(destDir); !oserror.IsNotExist(err) {
		if err == nil {
			return &os.PathError{
				Op:   "checkpoint",
				Path: destDir,
				Err:  oserror.ErrExist,
			}
		}
		return err
	}

	if opt.flushWAL && !d.opts.DisableWAL {
		// Write an empty log-data record to flush and sync the WAL.
		if err := d.LogData(nil /* data */, Sync); err != nil {
			return err
		}
	}

	// Disable file deletions.
	d.mu.Lock()
	d.disableFileDeletions()
	defer func() {
		d.mu.Lock()
		defer d.mu.Unlock()
		d.enableFileDeletions()
	}()

	// TODO(peter): RocksDB provides the option to roll the manifest if the
	// MANIFEST size is too large. Should we do this too?

	// Lock the manifest before getting the current version. We need the
	// length of the manifest that we read to match the current version that
	// we read, otherwise we might copy a versionEdit not reflected in the
	// sstables we copy/link.
	d.mu.versions.logLock()
	// Get the unflushed log files, the current version, and the current manifest
	// file number.
	memQueue := d.mu.mem.queue
	current := d.mu.versions.currentVersion()
	formatVers := d.FormatMajorVersion()
	manifestFileNum := d.mu.versions.manifestFileNum
	manifestSize := d.mu.versions.manifest.Size()
	optionsFileNum := d.optionsFileNum
	virtualBackingFiles := make(map[base.DiskFileNum]struct{})
	for diskFileNum := range d.mu.versions.backingState.fileBackingMap {
		virtualBackingFiles[diskFileNum] = struct{}{}
	}
	// Release the manifest and DB.mu so we don't block other operations on
	// the database.
	d.mu.versions.logUnlock()
	d.mu.Unlock()

	// Wrap the normal filesystem with one which wraps newly created files with
	// vfs.NewSyncingFile.
	fs := vfs.NewSyncingFS(d.opts.FS, vfs.SyncingFileOptions{
		NoSyncOnClose: d.opts.NoSyncOnClose,
		BytesPerSync:  d.opts.BytesPerSync,
	})

	// Create the dir and its parents (if necessary), and sync them.
	var dir vfs.File
	defer func() {
		if dir != nil {
			_ = dir.Close()
		}
		if ckErr != nil {
			// Attempt to cleanup on error.
			_ = fs.RemoveAll(destDir)
		}
	}()
	dir, ckErr = mkdirAllAndSyncParents(fs, destDir)
	if ckErr != nil {
		return ckErr
	}

	{
		// Link or copy the OPTIONS.
		srcPath := base.MakeFilepath(fs, d.dirname, fileTypeOptions, optionsFileNum)
		destPath := fs.PathJoin(destDir, fs.PathBase(srcPath))
		ckErr = vfs.LinkOrCopy(fs, srcPath, destPath)
		if ckErr != nil {
			return ckErr
		}
	}

	{
		// Set the format major version in the destination directory.
		var versionMarker *atomicfs.Marker
		versionMarker, _, ckErr = atomicfs.LocateMarker(fs, destDir, formatVersionMarkerName)
		if ckErr != nil {
			return ckErr
		}

		// We use the marker to encode the active format version in the
		// marker filename. Unlike other uses of the atomic marker,
		// there is no file with the filename `formatVers.String()` on
		// the filesystem.
		ckErr = versionMarker.Move(formatVers.String())
		if ckErr != nil {
			return ckErr
		}
		ckErr = versionMarker.Close()
		if ckErr != nil {
			return ckErr
		}
	}

	var excludedFiles map[deletedFileEntry]*fileMetadata
	// Set of FileBacking.DiskFileNum which will be required by virtual sstables
	// in the checkpoint.
	requiredVirtualBackingFiles := make(map[base.DiskFileNum]struct{})
	// Link or copy the sstables.
	for l := range current.Levels {
		iter := current.Levels[l].Iter()
		for f := iter.First(); f != nil; f = iter.Next() {
			if excludeFromCheckpoint(f, opt, d.cmp) {
				if excludedFiles == nil {
					excludedFiles = make(map[deletedFileEntry]*fileMetadata)
				}
				excludedFiles[deletedFileEntry{
					Level:   l,
					FileNum: f.FileNum,
				}] = f
				continue
			}

			fileBacking := f.FileBacking
			if f.Virtual {
				if _, ok := requiredVirtualBackingFiles[fileBacking.DiskFileNum]; ok {
					continue
				}
				requiredVirtualBackingFiles[fileBacking.DiskFileNum] = struct{}{}
			}

			srcPath := base.MakeFilepath(fs, d.dirname, fileTypeTable, fileBacking.DiskFileNum)
			destPath := fs.PathJoin(destDir, fs.PathBase(srcPath))
			ckErr = vfs.LinkOrCopy(fs, srcPath, destPath)
			if ckErr != nil {
				return ckErr
			}
		}
	}

	var removeBackingTables []base.DiskFileNum
	for diskFileNum := range virtualBackingFiles {
		if _, ok := requiredVirtualBackingFiles[diskFileNum]; !ok {
			// The backing sstable associated with fileNum is no longer
			// required.
			removeBackingTables = append(removeBackingTables, diskFileNum)
		}
	}

	ckErr = d.writeCheckpointManifest(
		fs, formatVers, destDir, dir, manifestFileNum.DiskFileNum(), manifestSize,
		excludedFiles, removeBackingTables,
	)
	if ckErr != nil {
		return ckErr
	}

	// Copy the WAL files. We copy rather than link because WAL file recycling
	// will cause the WAL files to be reused which would invalidate the
	// checkpoint.
	for i := range memQueue {
		logNum := memQueue[i].logNum
		if logNum == 0 {
			continue
		}
		srcPath := base.MakeFilepath(fs, d.walDirname, fileTypeLog, logNum.DiskFileNum())
		destPath := fs.PathJoin(destDir, fs.PathBase(srcPath))
		ckErr = vfs.Copy(fs, srcPath, destPath)
		if ckErr != nil {
			return ckErr
		}
	}

	// Sync and close the checkpoint directory.
	ckErr = dir.Sync()
	if ckErr != nil {
		return ckErr
	}
	ckErr = dir.Close()
	dir = nil
	return ckErr
}
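
// The resulting directory is a self-contained snapshot that a client can open
// as an independent DB; a minimal sketch, assuming the caller wants a
// read-only view (the path is an illustrative placeholder):
//
//	ckpt, err := pebble.Open("/mnt/backups/ckpt-0001", &pebble.Options{
//		ReadOnly: true,
//	})
//	if err != nil {
//		return err
//	}
//	defer ckpt.Close()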

func (d *DB) writeCheckpointManifest(
	fs vfs.FS,
	formatVers FormatMajorVersion,
	destDirPath string,
	destDir vfs.File,
	manifestFileNum base.DiskFileNum,
	manifestSize int64,
	excludedFiles map[deletedFileEntry]*fileMetadata,
	removeBackingTables []base.DiskFileNum,
) error {
	// Copy the MANIFEST, and create a pointer to it. We copy rather
	// than link because additional version edits added to the
	// MANIFEST after we took our snapshot of the sstables will
	// reference sstables that aren't in our checkpoint. For a
	// similar reason, we need to limit how much of the MANIFEST we
	// copy.
	// If some files are excluded from the checkpoint, also append a block that
	// records those files as deleted.
	if err := func() error {
		srcPath := base.MakeFilepath(fs, d.dirname, fileTypeManifest, manifestFileNum)
		destPath := fs.PathJoin(destDirPath, fs.PathBase(srcPath))
		src, err := fs.Open(srcPath, vfs.SequentialReadsOption)
		if err != nil {
			return err
		}
		defer src.Close()

		dst, err := fs.Create(destPath)
		if err != nil {
			return err
		}
		defer dst.Close()

		// Copy all existing records. We need to copy at the record level in case we
		// need to append another record with the excluded files (we cannot simply
		// append a record after a raw data copy; see
		// https://github.com/cockroachdb/cockroach/issues/100935).
		r := record.NewReader(&io.LimitedReader{R: src, N: manifestSize}, manifestFileNum.FileNum())
		w := record.NewWriter(dst)
		for {
			rr, err := r.Next()
			if err != nil {
				if err == io.EOF {
					break
				}
				return err
			}

			rw, err := w.Next()
			if err != nil {
				return err
			}
			if _, err := io.Copy(rw, rr); err != nil {
				return err
			}
		}

		if len(excludedFiles) > 0 {
			// Write out an additional VersionEdit that deletes the excluded SST files.
			ve := versionEdit{
				DeletedFiles:         excludedFiles,
				RemovedBackingTables: removeBackingTables,
			}

			rw, err := w.Next()
			if err != nil {
				return err
			}
			if err := ve.Encode(rw); err != nil {
				return err
			}
		}
		if err := w.Close(); err != nil {
			return err
		}
		return dst.Sync()
	}(); err != nil {
		return err
	}

	// Recent format versions use an atomic marker for setting the
	// active manifest. Older versions use the CURRENT file. The
	// setCurrentFunc function will return a closure that will
	// take the appropriate action for the database's format
	// version.
	var manifestMarker *atomicfs.Marker
	manifestMarker, _, err := atomicfs.LocateMarker(fs, destDirPath, manifestMarkerName)
	if err != nil {
		return err
	}
	if err := setCurrentFunc(formatVers, manifestMarker, fs, destDirPath, destDir)(manifestFileNum.FileNum()); err != nil {
		return err
	}
	return manifestMarker.Close()
}