github.com/cockroachdb/pebble@v1.1.2/ingest.go

// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
    "context"
    "sort"
    "time"

    "github.com/cockroachdb/errors"
    "github.com/cockroachdb/pebble/internal/base"
    "github.com/cockroachdb/pebble/internal/invariants"
    "github.com/cockroachdb/pebble/internal/keyspan"
    "github.com/cockroachdb/pebble/internal/manifest"
    "github.com/cockroachdb/pebble/internal/private"
    "github.com/cockroachdb/pebble/objstorage"
    "github.com/cockroachdb/pebble/objstorage/remote"
    "github.com/cockroachdb/pebble/sstable"
)

func sstableKeyCompare(userCmp Compare, a, b InternalKey) int {
    c := userCmp(a.UserKey, b.UserKey)
    if c != 0 {
        return c
    }
    if a.IsExclusiveSentinel() {
        if !b.IsExclusiveSentinel() {
            return -1
        }
    } else if b.IsExclusiveSentinel() {
        return +1
    }
    return 0
}

// KeyRange encodes a key range in user key space. A KeyRange's Start is
// inclusive while its End is exclusive.
type KeyRange struct {
    Start, End []byte
}

// Valid returns true if the KeyRange is defined.
func (k *KeyRange) Valid() bool {
    return k.Start != nil && k.End != nil
}

// Contains returns whether the specified key exists in the KeyRange.
func (k *KeyRange) Contains(cmp base.Compare, key InternalKey) bool {
    v := cmp(key.UserKey, k.End)
    return (v < 0 || (v == 0 && key.IsExclusiveSentinel())) && cmp(k.Start, key.UserKey) <= 0
}

// OverlapsInternalKeyRange checks if the specified internal key range has an
// overlap with the KeyRange. Note that we aren't checking for full containment
// of smallest-largest within k, rather just that there's some intersection
// between the two ranges.
func (k *KeyRange) OverlapsInternalKeyRange(cmp base.Compare, smallest, largest InternalKey) bool {
    v := cmp(k.Start, largest.UserKey)
    return v <= 0 && !(largest.IsExclusiveSentinel() && v == 0) &&
        cmp(k.End, smallest.UserKey) > 0
}

// Overlaps checks if the specified file has an overlap with the KeyRange.
// Note that we aren't checking for full containment of m within k, rather just
// that there's some intersection between m and k's bounds.
func (k *KeyRange) Overlaps(cmp base.Compare, m *fileMetadata) bool {
    return k.OverlapsInternalKeyRange(cmp, m.Smallest, m.Largest)
}

// OverlapsKeyRange checks if this span overlaps with the provided KeyRange.
// Note that we aren't checking for full containment of either span in the
// other, just that there's a key x that is in both key ranges.
func (k *KeyRange) OverlapsKeyRange(cmp Compare, span KeyRange) bool {
    return cmp(k.Start, span.End) < 0 && cmp(k.End, span.Start) > 0
}

func ingestValidateKey(opts *Options, key *InternalKey) error {
    if key.Kind() == InternalKeyKindInvalid {
        return base.CorruptionErrorf("pebble: external sstable has corrupted key: %s",
            key.Pretty(opts.Comparer.FormatKey))
    }
    if key.SeqNum() != 0 {
        return base.CorruptionErrorf("pebble: external sstable has non-zero seqnum: %s",
            key.Pretty(opts.Comparer.FormatKey))
    }
    return nil
}
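// For illustration (a sketch; assumes cmp is the DB's user-key Compare
// function): under sstableKeyCompare, an exclusive sentinel sorts before any
// point key at the same user key, so a table whose largest key is a range
// deletion sentinel at "b" abuts, but does not overlap, a table whose
// smallest key is a point key at "b":
//
//    s := base.MakeRangeDeleteSentinelKey([]byte("b"))
//    p := base.MakeInternalKey([]byte("b"), 1, InternalKeyKindSet)
//    _ = sstableKeyCompare(cmp, s, p) // -1: the sentinel sorts first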
// ingestSynthesizeShared constructs a fileMetadata for one shared sstable
// owned or shared by another node.
func ingestSynthesizeShared(
    opts *Options, sm SharedSSTMeta, fileNum base.DiskFileNum,
) (*fileMetadata, error) {
    if sm.Size == 0 {
        // Disallow 0 file sizes.
        return nil, errors.New("pebble: cannot ingest shared file with size 0")
    }
    // Don't load table stats. Doing a round trip to shared storage, one SST
    // at a time, is not worth it as it slows down ingestion.
    meta := &fileMetadata{
        FileNum:      fileNum.FileNum(),
        CreationTime: time.Now().Unix(),
        Virtual:      true,
        Size:         sm.Size,
    }
    meta.InitProviderBacking(fileNum)
    // Set the underlying FileBacking's size to the same size as the virtualized
    // view of the sstable. This ensures that we don't over-prioritize this
    // sstable for compaction just yet, as we do not have a clear sense of what
    // parts of this sstable are referenced by other nodes.
    meta.FileBacking.Size = sm.Size
    if sm.LargestRangeKey.Valid() && sm.LargestRangeKey.UserKey != nil {
        // Initialize meta.{HasRangeKeys,Smallest,Largest}, etc.
        //
        // NB: We create new internal keys and pass them into ExtendRangeKeyBounds
        // so that we can sub a zero sequence number into the bounds. We can set
        // the sequence number to anything here; it'll be reset in
        // ingestUpdateSeqNum anyway. However, we do need to use the same sequence
        // number across all bound keys at this step so that we end up with
        // bounds that are consistent across point/range keys.
        smallestRangeKey := base.MakeInternalKey(sm.SmallestRangeKey.UserKey, 0, sm.SmallestRangeKey.Kind())
        largestRangeKey := base.MakeExclusiveSentinelKey(sm.LargestRangeKey.Kind(), sm.LargestRangeKey.UserKey)
        meta.ExtendRangeKeyBounds(opts.Comparer.Compare, smallestRangeKey, largestRangeKey)
    }
    if sm.LargestPointKey.Valid() && sm.LargestPointKey.UserKey != nil {
        // Initialize meta.{HasPointKeys,Smallest,Largest}, etc.
        //
        // See the point above in the ExtendRangeKeyBounds call on why we use a
        // zero sequence number here.
        smallestPointKey := base.MakeInternalKey(sm.SmallestPointKey.UserKey, 0, sm.SmallestPointKey.Kind())
        largestPointKey := base.MakeInternalKey(sm.LargestPointKey.UserKey, 0, sm.LargestPointKey.Kind())
        if sm.LargestPointKey.IsExclusiveSentinel() {
            largestPointKey = base.MakeRangeDeleteSentinelKey(sm.LargestPointKey.UserKey)
        }
        meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallestPointKey, largestPointKey)
    }
    if err := meta.Validate(opts.Comparer.Compare, opts.Comparer.FormatKey); err != nil {
        return nil, err
    }
    return meta, nil
}

// ingestLoad1External loads the fileMetadata for one external sstable.
// Sequence number and target level calculation happens during prepare/apply.
func ingestLoad1External(
    opts *Options,
    e ExternalFile,
    fileNum base.DiskFileNum,
    objprovider objstorage.Provider,
    jobID int,
) (*fileMetadata, error) {
    if e.Size == 0 {
        // Disallow 0 file sizes.
        return nil, errors.New("pebble: cannot ingest external file with size 0")
    }
    if !e.HasRangeKey && !e.HasPointKey {
        return nil, errors.New("pebble: cannot ingest external file with no point or range keys")
    }
    // Don't load table stats. Doing a round trip to shared storage, one SST
    // at a time, is not worth it as it slows down ingestion.
    meta := &fileMetadata{}
    meta.FileNum = fileNum.FileNum()
    meta.CreationTime = time.Now().Unix()
    meta.Virtual = true
    meta.Size = e.Size
    meta.InitProviderBacking(fileNum)

    // Try to resolve a reference to the external file.
    backing, err := objprovider.CreateExternalObjectBacking(e.Locator, e.ObjName)
    if err != nil {
        return nil, err
    }
    metas, err := objprovider.AttachRemoteObjects([]objstorage.RemoteObjectToAttach{{
        FileNum:  fileNum,
        FileType: fileTypeTable,
        Backing:  backing,
    }})
    if err != nil {
        return nil, err
    }
    if opts.EventListener.TableCreated != nil {
        opts.EventListener.TableCreated(TableCreateInfo{
            JobID:   jobID,
            Reason:  "ingesting",
            Path:    objprovider.Path(metas[0]),
            FileNum: fileNum.FileNum(),
        })
    }
    // In the name of keeping this ingestion as fast as possible, we avoid
    // *all* existence checks and synthesize a file metadata with smallest/largest
    // keys that overlap whatever the passed-in span was.
    smallestCopy := make([]byte, len(e.SmallestUserKey))
    copy(smallestCopy, e.SmallestUserKey)
    largestCopy := make([]byte, len(e.LargestUserKey))
    copy(largestCopy, e.LargestUserKey)
    if e.HasPointKey {
        meta.ExtendPointKeyBounds(opts.Comparer.Compare, base.MakeInternalKey(smallestCopy, 0, InternalKeyKindMax),
            base.MakeRangeDeleteSentinelKey(largestCopy))
    }
    if e.HasRangeKey {
        meta.ExtendRangeKeyBounds(opts.Comparer.Compare, base.MakeInternalKey(smallestCopy, 0, InternalKeyKindRangeKeySet),
            base.MakeExclusiveSentinelKey(InternalKeyKindRangeKeyDelete, largestCopy))
    }

    // Set the underlying FileBacking's size to the same size as the virtualized
    // view of the sstable. This ensures that we don't over-prioritize this
    // sstable for compaction just yet, as we do not have a clear sense of
    // what parts of this sstable are referenced by other nodes.
    meta.FileBacking.Size = e.Size

    if err := meta.Validate(opts.Comparer.Compare, opts.Comparer.FormatKey); err != nil {
        return nil, err
    }
    return meta, nil
}

// ingestLoad1 creates the FileMetadata for one file. This file will be owned
// by this store.
func ingestLoad1(
    opts *Options,
    fmv FormatMajorVersion,
    readable objstorage.Readable,
    cacheID uint64,
    fileNum base.DiskFileNum,
) (*fileMetadata, error) {
    cacheOpts := private.SSTableCacheOpts(cacheID, fileNum).(sstable.ReaderOption)
    r, err := sstable.NewReader(readable, opts.MakeReaderOptions(), cacheOpts)
    if err != nil {
        return nil, err
    }
    defer r.Close()

    // Avoid ingesting tables with format versions this DB doesn't support.
    tf, err := r.TableFormat()
    if err != nil {
        return nil, err
    }
    if tf < fmv.MinTableFormat() || tf > fmv.MaxTableFormat() {
        return nil, errors.Newf(
            "pebble: table format %s is not within range supported at DB format major version %d, (%s,%s)",
            tf, fmv, fmv.MinTableFormat(), fmv.MaxTableFormat(),
        )
    }

    meta := &fileMetadata{}
    meta.FileNum = fileNum.FileNum()
    meta.Size = uint64(readable.Size())
    meta.CreationTime = time.Now().Unix()
    meta.InitPhysicalBacking()

    // Avoid loading into the table cache for collecting stats if we
    // don't need to. If there are no range deletions, we have all the
    // information to compute the stats here.
    //
    // This is helpful in tests for avoiding awkwardness around deletion of
    // ingested files from MemFS. MemFS implements the Windows semantics of
    // disallowing removal of an open file. Under MemFS, if we don't populate
    // meta.Stats here, the file will be loaded into the table cache for
    // calculating stats before we can remove the original link.
    maybeSetStatsFromProperties(meta.PhysicalMeta(), &r.Properties)

    {
        iter, err := r.NewIter(nil /* lower */, nil /* upper */)
        if err != nil {
            return nil, err
        }
        defer iter.Close()
        var smallest InternalKey
        if key, _ := iter.First(); key != nil {
            if err := ingestValidateKey(opts, key); err != nil {
                return nil, err
            }
            smallest = (*key).Clone()
        }
        if err := iter.Error(); err != nil {
            return nil, err
        }
        if key, _ := iter.Last(); key != nil {
            if err := ingestValidateKey(opts, key); err != nil {
                return nil, err
            }
            meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallest, key.Clone())
        }
        if err := iter.Error(); err != nil {
            return nil, err
        }
    }

    iter, err := r.NewRawRangeDelIter()
    if err != nil {
        return nil, err
    }
    if iter != nil {
        defer iter.Close()
        var smallest InternalKey
        if s := iter.First(); s != nil {
            key := s.SmallestKey()
            if err := ingestValidateKey(opts, &key); err != nil {
                return nil, err
            }
            smallest = key.Clone()
        }
        if err := iter.Error(); err != nil {
            return nil, err
        }
        if s := iter.Last(); s != nil {
            k := s.SmallestKey()
            if err := ingestValidateKey(opts, &k); err != nil {
                return nil, err
            }
            largest := s.LargestKey().Clone()
            meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallest, largest)
        }
    }

    // Update the range-key bounds for the table.
    {
        iter, err := r.NewRawRangeKeyIter()
        if err != nil {
            return nil, err
        }
        if iter != nil {
            defer iter.Close()
            var smallest InternalKey
            if s := iter.First(); s != nil {
                key := s.SmallestKey()
                if err := ingestValidateKey(opts, &key); err != nil {
                    return nil, err
                }
                smallest = key.Clone()
            }
            if err := iter.Error(); err != nil {
                return nil, err
            }
            if s := iter.Last(); s != nil {
                k := s.SmallestKey()
                if err := ingestValidateKey(opts, &k); err != nil {
                    return nil, err
                }
                // As range keys are fragmented, the end key of the last range key
                // in the table provides the upper bound for the table.
                largest := s.LargestKey().Clone()
                meta.ExtendRangeKeyBounds(opts.Comparer.Compare, smallest, largest)
            }
            if err := iter.Error(); err != nil {
                return nil, err
            }
        }
    }

    if !meta.HasPointKeys && !meta.HasRangeKeys {
        return nil, nil
    }

    // Sanity check that the various bounds on the file were set consistently.
    if err := meta.Validate(opts.Comparer.Compare, opts.Comparer.FormatKey); err != nil {
        return nil, err
    }

    return meta, nil
}

type ingestLoadResult struct {
    localMeta, sharedMeta []*fileMetadata
    externalMeta          []*fileMetadata
    localPaths            []string
    sharedLevels          []uint8
    fileCount             int
}

func ingestLoad(
    opts *Options,
    fmv FormatMajorVersion,
    paths []string,
    shared []SharedSSTMeta,
    external []ExternalFile,
    cacheID uint64,
    pending []base.DiskFileNum,
    objProvider objstorage.Provider,
    jobID int,
) (ingestLoadResult, error) {
    meta := make([]*fileMetadata, 0, len(paths))
    newPaths := make([]string, 0, len(paths))
    for i := range paths {
        f, err := opts.FS.Open(paths[i])
        if err != nil {
            return ingestLoadResult{}, err
        }

        readable, err := sstable.NewSimpleReadable(f)
        if err != nil {
            return ingestLoadResult{}, err
        }
        m, err := ingestLoad1(opts, fmv, readable, cacheID, pending[i])
        if err != nil {
            return ingestLoadResult{}, err
        }
        if m != nil {
            meta = append(meta, m)
            newPaths = append(newPaths, paths[i])
        }
    }
    if len(shared) == 0 && len(external) == 0 {
        return ingestLoadResult{localMeta: meta, localPaths: newPaths, fileCount: len(meta)}, nil
    }

    // Sort the shared files according to level.
    sort.Sort(sharedByLevel(shared))

    sharedMeta := make([]*fileMetadata, 0, len(shared))
    levels := make([]uint8, 0, len(shared))
    for i := range shared {
        m, err := ingestSynthesizeShared(opts, shared[i], pending[len(paths)+i])
        if err != nil {
            return ingestLoadResult{}, err
        }
        if shared[i].Level < sharedLevelsStart {
            return ingestLoadResult{}, errors.New("cannot ingest shared file in level below sharedLevelsStart")
        }
        sharedMeta = append(sharedMeta, m)
        levels = append(levels, shared[i].Level)
    }
    externalMeta := make([]*fileMetadata, 0, len(external))
    for i := range external {
        m, err := ingestLoad1External(opts, external[i], pending[len(paths)+len(shared)+i], objProvider, jobID)
        if err != nil {
            return ingestLoadResult{}, err
        }
        externalMeta = append(externalMeta, m)
    }
    result := ingestLoadResult{
        localMeta:    meta,
        sharedMeta:   sharedMeta,
        externalMeta: externalMeta,
        localPaths:   newPaths,
        sharedLevels: levels,
        fileCount:    len(meta) + len(sharedMeta) + len(externalMeta),
    }
    return result, nil
}

// Struct for sorting metadatas by smallest user keys, while ensuring the
// matching path also gets swapped to the same index. For use in
// ingestSortAndVerify.
type metaAndPaths struct {
    meta  []*fileMetadata
    paths []string
    cmp   Compare
}

func (m metaAndPaths) Len() int {
    return len(m.meta)
}

func (m metaAndPaths) Less(i, j int) bool {
    return m.cmp(m.meta[i].Smallest.UserKey, m.meta[j].Smallest.UserKey) < 0
}

func (m metaAndPaths) Swap(i, j int) {
    m.meta[i], m.meta[j] = m.meta[j], m.meta[i]
    if m.paths != nil {
        m.paths[i], m.paths[j] = m.paths[j], m.paths[i]
    }
}

func ingestSortAndVerify(cmp Compare, lr ingestLoadResult, exciseSpan KeyRange) error {
    // Verify that all the shared files (i.e. files in sharedMeta)
    // fit within the exciseSpan.
    for i := range lr.sharedMeta {
        f := lr.sharedMeta[i]
        if !exciseSpan.Contains(cmp, f.Smallest) || !exciseSpan.Contains(cmp, f.Largest) {
            return errors.AssertionFailedf("pebble: shared file outside of excise span, span [%s-%s), file = %s", exciseSpan.Start, exciseSpan.End, f.String())
        }
    }
    if len(lr.externalMeta) > 0 {
        if len(lr.localMeta) > 0 || len(lr.sharedMeta) > 0 {
            // Currently we only support external ingests on their own. If external
            // files are present alongside local/shared files, return an error.
            return errors.AssertionFailedf("pebble: external files cannot be ingested atomically alongside other types of files")
        }
        sort.Sort(&metaAndPaths{
            meta: lr.externalMeta,
            cmp:  cmp,
        })
        for i := 1; i < len(lr.externalMeta); i++ {
            if sstableKeyCompare(cmp, lr.externalMeta[i-1].Largest, lr.externalMeta[i].Smallest) >= 0 {
                return errors.AssertionFailedf("pebble: external sstables have overlapping ranges")
            }
        }
        return nil
    }
    if len(lr.localMeta) <= 1 || len(lr.localPaths) <= 1 {
        return nil
    }

    sort.Sort(&metaAndPaths{
        meta:  lr.localMeta,
        paths: lr.localPaths,
        cmp:   cmp,
    })

    for i := 1; i < len(lr.localPaths); i++ {
        if sstableKeyCompare(cmp, lr.localMeta[i-1].Largest, lr.localMeta[i].Smallest) >= 0 {
            return errors.AssertionFailedf("pebble: local ingestion sstables have overlapping ranges")
        }
    }
    if len(lr.sharedMeta) == 0 {
        return nil
    }
    filesInLevel := make([]*fileMetadata, 0, len(lr.sharedMeta))
    for l := sharedLevelsStart; l < numLevels; l++ {
        filesInLevel = filesInLevel[:0]
        for i := range lr.sharedMeta {
            if lr.sharedLevels[i] == uint8(l) {
                filesInLevel = append(filesInLevel, lr.sharedMeta[i])
            }
        }
        sort.Slice(filesInLevel, func(i, j int) bool {
            return cmp(filesInLevel[i].Smallest.UserKey, filesInLevel[j].Smallest.UserKey) < 0
        })
        for i := 1; i < len(filesInLevel); i++ {
            if sstableKeyCompare(cmp, filesInLevel[i-1].Largest, filesInLevel[i].Smallest) >= 0 {
                return errors.AssertionFailedf("pebble: external shared sstables have overlapping ranges")
            }
        }
    }
    return nil
}

func ingestCleanup(objProvider objstorage.Provider, meta []*fileMetadata) error {
    var firstErr error
    for i := range meta {
        if err := objProvider.Remove(fileTypeTable, meta[i].FileBacking.DiskFileNum); err != nil {
            firstErr = firstError(firstErr, err)
        }
    }
    return firstErr
}

// ingestLink creates new objects which are backed by either hardlinks to or
// copies of the ingested files. It also attaches shared objects to the provider.
func ingestLink(
    jobID int,
    opts *Options,
    objProvider objstorage.Provider,
    lr ingestLoadResult,
    shared []SharedSSTMeta,
) error {
    for i := range lr.localPaths {
        objMeta, err := objProvider.LinkOrCopyFromLocal(
            context.TODO(), opts.FS, lr.localPaths[i], fileTypeTable, lr.localMeta[i].FileBacking.DiskFileNum,
            objstorage.CreateOptions{PreferSharedStorage: true},
        )
        if err != nil {
            if err2 := ingestCleanup(objProvider, lr.localMeta[:i]); err2 != nil {
                opts.Logger.Infof("ingest cleanup failed: %v", err2)
            }
            return err
        }
        if opts.EventListener.TableCreated != nil {
            opts.EventListener.TableCreated(TableCreateInfo{
                JobID:   jobID,
                Reason:  "ingesting",
                Path:    objProvider.Path(objMeta),
                FileNum: lr.localMeta[i].FileNum,
            })
        }
    }
    sharedObjs := make([]objstorage.RemoteObjectToAttach, 0, len(shared))
    for i := range shared {
        backing, err := shared[i].Backing.Get()
        if err != nil {
            return err
        }
        sharedObjs = append(sharedObjs, objstorage.RemoteObjectToAttach{
            FileNum:  lr.sharedMeta[i].FileBacking.DiskFileNum,
            FileType: fileTypeTable,
            Backing:  backing,
        })
    }
    sharedObjMetas, err := objProvider.AttachRemoteObjects(sharedObjs)
    if err != nil {
        return err
    }
    for i := range sharedObjMetas {
        // One corner case around file sizes we need to be mindful of is that
        // if one of the sharedObjs was initially created by us (and has
        // boomeranged back from another node), we'll need to update the
        // FileBacking's size to be the true underlying size. Otherwise, we
        // could hit errors when we open the db again after a crash/restart
        // (see checkConsistency in open.go). It also more accurately allows
        // us to prioritize compactions of files that were originally created
        // by us.
        if sharedObjMetas[i].IsShared() && !objProvider.IsSharedForeign(sharedObjMetas[i]) {
            size, err := objProvider.Size(sharedObjMetas[i])
            if err != nil {
                return err
            }
            lr.sharedMeta[i].FileBacking.Size = uint64(size)
        }
        if opts.EventListener.TableCreated != nil {
            opts.EventListener.TableCreated(TableCreateInfo{
                JobID:   jobID,
                Reason:  "ingesting",
                Path:    objProvider.Path(sharedObjMetas[i]),
                FileNum: lr.sharedMeta[i].FileNum,
            })
        }
    }
    // We do not need to do anything about lr.externalMeta. Those files were
    // already linked in ingestLoad.

    return nil
}

func ingestMemtableOverlaps(cmp Compare, mem flushable, keyRanges []internalKeyRange) bool {
    iter := mem.newIter(nil)
    rangeDelIter := mem.newRangeDelIter(nil)
    rkeyIter := mem.newRangeKeyIter(nil)

    closeIters := func() error {
        err := iter.Close()
        if rangeDelIter != nil {
            err = firstError(err, rangeDelIter.Close())
        }
        if rkeyIter != nil {
            err = firstError(err, rkeyIter.Close())
        }
        return err
    }

    for _, kr := range keyRanges {
        if overlapWithIterator(iter, &rangeDelIter, rkeyIter, kr, cmp) {
            closeIters()
            return true
        }
    }

    // Assume overlap if any iterator errored out.
    return closeIters() != nil
}

func ingestUpdateSeqNum(
    cmp Compare, format base.FormatKey, seqNum uint64, loadResult ingestLoadResult,
) error {
    setSeqFn := func(k base.InternalKey) base.InternalKey {
        return base.MakeInternalKey(k.UserKey, seqNum, k.Kind())
    }
    updateMetadata := func(m *fileMetadata) error {
        // NB: we set the fields directly here, rather than via their Extend*
        // methods, as we are updating sequence numbers.
        if m.HasPointKeys {
            m.SmallestPointKey = setSeqFn(m.SmallestPointKey)
        }
        if m.HasRangeKeys {
            m.SmallestRangeKey = setSeqFn(m.SmallestRangeKey)
        }
        m.Smallest = setSeqFn(m.Smallest)
        // Only update the seqnum for the largest key if that key is not an
        // "exclusive sentinel" (i.e. a range deletion sentinel or a range key
        // boundary), as doing so effectively drops the exclusive sentinel (by
        // lowering the seqnum from the max value), and extends the bounds of the
        // table.
        // NB: as the largest range key is always an exclusive sentinel, it is
        // never updated.
        if m.HasPointKeys && !m.LargestPointKey.IsExclusiveSentinel() {
            m.LargestPointKey = setSeqFn(m.LargestPointKey)
        }
        if !m.Largest.IsExclusiveSentinel() {
            m.Largest = setSeqFn(m.Largest)
        }
        // Setting smallestSeqNum == largestSeqNum triggers the setting of
        // Properties.GlobalSeqNum when an sstable is loaded.
        m.SmallestSeqNum = seqNum
        m.LargestSeqNum = seqNum
        // Ensure the new bounds are consistent.
        if err := m.Validate(cmp, format); err != nil {
            return err
        }
        seqNum++
        return nil
    }

    // Shared sstables are required to be sorted by level ascending. We then
    // iterate the shared sstables in reverse, assigning the lower sequence
    // numbers to the shared sstables that will be ingested into the lower
    // (larger numbered) levels first. This ensures sequence number shadowing is
    // correct.
    for i := len(loadResult.sharedMeta) - 1; i >= 0; i-- {
        if i-1 >= 0 && loadResult.sharedLevels[i-1] > loadResult.sharedLevels[i] {
            panic(errors.AssertionFailedf("shared files %s, %s out of order", loadResult.sharedMeta[i-1], loadResult.sharedMeta[i]))
        }
        if err := updateMetadata(loadResult.sharedMeta[i]); err != nil {
            return err
        }
    }
    for i := range loadResult.localMeta {
        if err := updateMetadata(loadResult.localMeta[i]); err != nil {
            return err
        }
    }
    for i := range loadResult.externalMeta {
        if err := updateMetadata(loadResult.externalMeta[i]); err != nil {
            return err
        }
    }
    return nil
}
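// For example (a sketch with arbitrary numbers): suppose an ingestion carries
// two shared files destined for L5 and L6 plus one local file, and the commit
// pipeline reserved sequence numbers 100..102 for it. Iterating the shared
// files in reverse assigns 100 to the L6 file and 101 to the L5 file, and the
// local file then receives 102, so lower (larger-numbered) levels hold the
// lower sequence numbers and shadowing remains correct.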
// Denotes an internal key range. Smallest and largest are both inclusive.
type internalKeyRange struct {
    smallest, largest InternalKey
}

func overlapWithIterator(
    iter internalIterator,
    rangeDelIter *keyspan.FragmentIterator,
    rkeyIter keyspan.FragmentIterator,
    keyRange internalKeyRange,
    cmp Compare,
) bool {
    // Check overlap with point operations.
    //
    // When using levelIter, it seeks to the SST whose boundaries
    // contain keyRange.smallest.UserKey (S).
    // It then tries to find a point in that SST that is >= S.
    // If there's no such point, it means the SST ends in a tombstone, in which
    // case levelIter.SeekGE generates a boundary range del sentinel.
    // The comparison of this boundary with keyRange.largest (L) below
    // is subtle but maintains correctness.
    //  1) boundary < L:
    //     since boundary is also > S (initial seek), whatever the boundary's
    //     start key may be, we're always overlapping.
    //  2) boundary > L:
    //     overlap with boundary cannot be determined since we don't know
    //     boundary's start key. We require checking for overlap with
    //     rangeDelIter.
    //  3) boundary == L and L is not a sentinel:
    //     means boundary <= L and hence is similar to 1).
    //  4) boundary == L and L is a sentinel:
    //     we'll always overlap since for any values of i, j the ranges
    //     [i, k) and [j, k) always overlap.
    key, _ := iter.SeekGE(keyRange.smallest.UserKey, base.SeekGEFlagsNone)
    if key != nil {
        c := sstableKeyCompare(cmp, *key, keyRange.largest)
        if c <= 0 {
            return true
        }
    }
    // Assume overlap if the iterator errored.
    if err := iter.Error(); err != nil {
        return true
    }

    computeOverlapWithSpans := func(rIter keyspan.FragmentIterator) bool {
        // NB: The spans surfaced by the fragment iterator are non-overlapping.
        span := rIter.SeekLT(keyRange.smallest.UserKey)
        if span == nil {
            span = rIter.Next()
        }
        for ; span != nil; span = rIter.Next() {
            if span.Empty() {
                continue
            }
            key := span.SmallestKey()
            c := sstableKeyCompare(cmp, key, keyRange.largest)
            if c > 0 {
                // The start of the span is after the largest key in the
                // ingested table.
                return false
            }
            if cmp(span.End, keyRange.smallest.UserKey) > 0 {
                // The end of the span is greater than the smallest in the
                // table. Note that the span end key is exclusive, thus ">0"
                // instead of ">=0".
                return true
            }
        }
        // Assume overlap if the iterator errored.
        if err := rIter.Error(); err != nil {
            return true
        }
        return false
    }

    // rkeyIter is either a range key level iter, or a range key iterator
    // over a single file.
    if rkeyIter != nil {
        if computeOverlapWithSpans(rkeyIter) {
            return true
        }
    }

    // Check overlap with range deletions.
    if rangeDelIter == nil || *rangeDelIter == nil {
        return false
    }
    return computeOverlapWithSpans(*rangeDelIter)
}
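// For example (sketch): because span end keys are exclusive, a range deletion
// [a, c) does not overlap an ingested table whose smallest key is c, since
// cmp(span.End, keyRange.smallest.UserKey) == 0 fails the "> 0" check above.
// A span [a, d) would overlap, since cmp("d", "c") > 0.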
// ingestTargetLevel returns the target level for a file being ingested.
// If suggestSplit is true, it accounts for ingest-time splitting as part of
// its target level calculation, and if a split candidate is found, that file
// is returned as the splitFile.
func ingestTargetLevel(
    newIters tableNewIters,
    newRangeKeyIter keyspan.TableNewSpanIter,
    iterOps IterOptions,
    comparer *Comparer,
    v *version,
    baseLevel int,
    compactions map[*compaction]struct{},
    meta *fileMetadata,
    suggestSplit bool,
) (targetLevel int, splitFile *fileMetadata, err error) {
    // Find the lowest level which does not have any files which overlap meta. We
    // search from L0 to L6 looking for whether there are any files in the level
    // which overlap meta. We want the "lowest" level (where lower means
    // increasing level number) in order to reduce write amplification.
    //
    // There are 2 kinds of overlap we need to check for: file boundary overlap
    // and data overlap. Data overlap implies file boundary overlap. Note that it
    // is always possible to ingest into L0.
    //
    // To place meta at level i where i > 0:
    // - there must not be any data overlap with levels <= i, since that will
    //   violate the sequence number invariant.
    // - no file boundary overlap with level i, since that will violate the
    //   invariant that files do not overlap in levels i > 0.
    // - if there is only a file overlap at a given level, and no data overlap,
    //   we can still slot a file at that level. We return the fileMetadata with
    //   which we have file boundary overlap (must be only one file, as sstable
    //   bounds are usually tight on user keys) and the caller is expected to
    //   split that sstable into two virtual sstables, allowing this file to go
    //   into that level. Note that if we have file boundary overlap with two
    //   files, which should only happen on rare occasions, we treat it as data
    //   overlap and don't use this optimization.
    //
    // The file boundary overlap check is simpler to conceptualize. Consider the
    // following example, in which the ingested file lies completely before or
    // after the file being considered.
    //
    //   |--|           |--|  ingested file: [a,b] or [f,g]
    //         |-----|        existing file: [c,e]
    //  _____________________
    //   a  b  c  d  e  f  g
    //
    // In both cases the ingested file can move to considering the next level.
    //
    // File boundary overlap does not necessarily imply data overlap. The check
    // for data overlap is a little more nuanced. Consider the following examples:
    //
    // 1. No data overlap:
    //
    //          |-|   |--|    ingested file: [cc-d] or [ee-ff]
    //  |*--*--*----*------*| existing file: [a-g], points: [a, b, c, dd, g]
    //  _____________________
    //   a  b  c  d  e  f  g
    //
    // In this case the ingested files can "fall through" this level. The checks
    // continue at the next level.
    //
    // 2. Data overlap:
    //
    //            |--|        ingested file: [d-e]
    //  |*--*--*----*------*| existing file: [a-g], points: [a, b, c, dd, g]
    //  _____________________
    //   a  b  c  d  e  f  g
    //
    // In this case the file cannot be ingested into this level as the point 'dd'
    // is in the way.
    //
    // It is worth noting that the check for data overlap is only approximate. In
    // the previous example, the ingested table [d-e] could contain only the
    // points 'd' and 'e', in which case the table would be eligible for
    // considering lower levels. However, such a fine-grained check would need to
    // be exhaustive (comparing points and ranges in both the ingested and
    // existing tables) and such a check is prohibitively expensive. Thus Pebble
    // treats any existing point that falls within the ingested table bounds as
    // being "data overlap".

    // This assertion implicitly checks that we have the current version of
    // the metadata.
    if v.L0Sublevels == nil {
        return 0, nil, errors.AssertionFailedf("could not read L0 sublevels")
    }
    // Check for overlap over the keys of L0 by iterating over the sublevels.
    for subLevel := 0; subLevel < len(v.L0SublevelFiles); subLevel++ {
        iter := newLevelIter(iterOps, comparer, newIters,
            v.L0Sublevels.Levels[subLevel].Iter(), manifest.Level(0), internalIterOpts{})

        var rangeDelIter keyspan.FragmentIterator
        // Pass in a non-nil pointer to rangeDelIter so that levelIter.findFileGE
        // sets it up for the target file.
        iter.initRangeDel(&rangeDelIter)

        levelIter := keyspan.LevelIter{}
        levelIter.Init(
            keyspan.SpanIterOptions{}, comparer.Compare, newRangeKeyIter,
            v.L0Sublevels.Levels[subLevel].Iter(), manifest.Level(0), manifest.KeyTypeRange,
        )

        kr := internalKeyRange{
            smallest: meta.Smallest,
            largest:  meta.Largest,
        }
        overlap := overlapWithIterator(iter, &rangeDelIter, &levelIter, kr, comparer.Compare)
        err := iter.Close() // Closes range del iter as well.
        err = firstError(err, levelIter.Close())
        if err != nil {
            return 0, nil, err
        }
        if overlap {
            return targetLevel, nil, nil
        }
    }

    level := baseLevel
    for ; level < numLevels; level++ {
        levelIter := newLevelIter(iterOps, comparer, newIters,
            v.Levels[level].Iter(), manifest.Level(level), internalIterOpts{})
        var rangeDelIter keyspan.FragmentIterator
        // Pass in a non-nil pointer to rangeDelIter so that levelIter.findFileGE
        // sets it up for the target file.
        levelIter.initRangeDel(&rangeDelIter)

        rkeyLevelIter := &keyspan.LevelIter{}
        rkeyLevelIter.Init(
            keyspan.SpanIterOptions{}, comparer.Compare, newRangeKeyIter,
            v.Levels[level].Iter(), manifest.Level(level), manifest.KeyTypeRange,
        )

        kr := internalKeyRange{
            smallest: meta.Smallest,
            largest:  meta.Largest,
        }
        overlap := overlapWithIterator(levelIter, &rangeDelIter, rkeyLevelIter, kr, comparer.Compare)
        err := levelIter.Close() // Closes range del iter as well.
        err = firstError(err, rkeyLevelIter.Close())
        if err != nil {
            return 0, nil, err
        }
        if overlap {
            return targetLevel, splitFile, nil
        }

        // Check boundary overlap.
        var candidateSplitFile *fileMetadata
        boundaryOverlaps := v.Overlaps(level, comparer.Compare, meta.Smallest.UserKey,
            meta.Largest.UserKey, meta.Largest.IsExclusiveSentinel())
        if !boundaryOverlaps.Empty() {
            // We are already guaranteed to not have any data overlaps with files
            // in boundaryOverlaps, otherwise we'd have returned in the above if
            // statements. Use this, plus boundaryOverlaps.Len() == 1, to detect
            // the case where we can slot this file into the current level despite
            // a boundary overlap, by splitting one existing file into two virtual
            // sstables.
            if suggestSplit && boundaryOverlaps.Len() == 1 {
                iter := boundaryOverlaps.Iter()
                candidateSplitFile = iter.First()
            } else {
                // We either don't want to suggest ingest-time splits (i.e.
                // !suggestSplit), or we boundary-overlapped with more than one file.
                continue
            }
        }

        // Check boundary overlap with any ongoing compactions. We consider an
        // overlapping compaction that's writing files to an output level as
        // equivalent to boundary overlap with files in that output level.
        //
        // We cannot check for data overlap with the new SSTs compaction will
        // produce since compaction hasn't been done yet. However, there's no need
        // to check since all keys in them will be from levels in [c.startLevel,
        // c.outputLevel], and all those levels have already had their data overlap
        // tested negative (else we'd have returned earlier).
        //
        // An alternative approach would be to cancel these compactions and proceed
        // with an ingest-time split on this level if necessary. However, compaction
        // cancellation can result in significant wasted effort and is best avoided
        // unless necessary.
        overlaps := false
        for c := range compactions {
            if c.outputLevel == nil || level != c.outputLevel.level {
                continue
            }
            if comparer.Compare(meta.Smallest.UserKey, c.largest.UserKey) <= 0 &&
                comparer.Compare(meta.Largest.UserKey, c.smallest.UserKey) >= 0 {
                overlaps = true
                break
            }
        }
        if !overlaps {
            targetLevel = level
            splitFile = candidateSplitFile
        }
    }
    return targetLevel, splitFile, nil
}
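// For example (sketch): with suggestSplit enabled, ingesting a file spanning
// [j, k] into an LSM where L6 holds a single file [a, z] whose points all lie
// outside [j, k] yields boundary overlap but no data overlap. ingestTargetLevel
// then returns L6 with the [a, z] file as splitFile, and the caller splits it
// into two virtual sstables on either side of [j, k].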
// Ingest ingests a set of sstables into the DB. Ingestion of the files is
// atomic and semantically equivalent to creating a single batch containing all
// of the mutations in the sstables. Ingestion may require the memtable to be
// flushed. The ingested sstable files are moved into the DB and must reside on
// the same filesystem as the DB. Sstables can be created for ingestion using
// sstable.Writer. On success, Ingest removes the input paths.
//
// Two types of sstables are accepted for ingestion: sstables present in the
// instance's vfs.FS, which can be referenced locally; and sstables present in
// remote.Storage, referred to as shared or foreign sstables. These shared
// sstables can be linked through objstorageprovider.Provider, and do not need
// to already be present on the local vfs.FS. Foreign sstables must all fit in
// an excise span, and are destined for a level specified in SharedSSTMeta.
//
// All sstables *must* be Sync()'d by the caller after all bytes are written
// and before their file handles are closed; failure to do so could violate
// durability or lead to corrupted on-disk state. This method cannot, in a
// platform-and-FS-agnostic way, ensure that all sstables in the input are
// properly synced to disk. Opening new file handles and Sync()-ing them
// does not always guarantee durability; see the discussion here on that:
// https://github.com/cockroachdb/pebble/pull/835#issuecomment-663075379
//
// Ingestion loads each sstable into the lowest level of the LSM which it
// doesn't overlap (see ingestTargetLevel). If an sstable overlaps a memtable,
// ingestion forces the memtable to flush, and then waits for the flush to
// occur. In some cases, such as with no foreign sstables and no excise span,
// ingestion that gets blocked on a memtable can join the flushable queue and
// finish even before the memtable has been flushed.
//
// The steps for ingestion are:
//
//  1. Allocate file numbers for every sstable being ingested.
//  2. Load the metadata for all sstables being ingested.
//  3. Sort the sstables by smallest key, verifying non overlap (for local
//     sstables).
//  4. Hard link (or copy) the local sstables into the DB directory.
//  5. Allocate a sequence number to use for all of the entries in the
//     local sstables. This is the step where overlap with memtables is
//     determined. If there is overlap, we remember the most recent memtable
//     that overlaps.
//  6. Update the sequence number in the ingested local sstables. (Remote
//     sstables get fixed sequence numbers that were determined at load time.)
//  7. Wait for the most recent memtable that overlaps to flush (if any).
//  8. Add the ingested sstables to the version (DB.ingestApply).
//     8.1. If an excise span was specified, figure out what sstables in the
//     current version overlap with the excise span, and create new virtual
//     sstables out of those sstables that exclude the excised span (DB.excise).
//  9. Publish the ingestion sequence number.
//
// Note that if the mutable memtable overlaps with ingestion, a flush of the
// memtable is forced, equivalent to DB.Flush. Additionally, subsequent
// mutations that get sequence numbers larger than the ingestion sequence
// number get queued up behind the ingestion waiting for it to complete. This
// can produce a noticeable hiccup in performance. See
// https://github.com/cockroachdb/pebble/issues/25 for an idea for how to fix
// this hiccup.
func (d *DB) Ingest(paths []string) error {
    if err := d.closed.Load(); err != nil {
        panic(err)
    }
    if d.opts.ReadOnly {
        return ErrReadOnly
    }
    _, err := d.ingest(paths, ingestTargetLevel, nil /* shared */, KeyRange{}, nil /* external */)
    return err
}

// IngestOperationStats provides some information about where in the LSM the
// bytes were ingested.
type IngestOperationStats struct {
    // Bytes is the total bytes in the ingested sstables.
    Bytes uint64
    // ApproxIngestedIntoL0Bytes is the approximate number of bytes ingested
    // into L0. This value is approximate when flushable ingests are active and
    // an ingest overlaps an entry in the flushable queue. Currently, this
    // approximation is very rough, only including tables that overlapped the
    // memtable. This estimate may be improved with #2112.
    ApproxIngestedIntoL0Bytes uint64
    // MemtableOverlappingFiles is the count of ingested sstables
    // that overlapped keys in the memtables.
    MemtableOverlappingFiles int
}

// ExternalFile describes an external sstable that can be referenced through
// objprovider and ingested as a remote file that will not be refcounted or
// cleaned up. For use with online restore. Note that the underlying sstable
// could contain keys outside the [Smallest,Largest) bounds; however Pebble
// is expected to only read the keys within those bounds.
type ExternalFile struct {
    // Locator is the remote.Locator that can be used with objProvider to
    // resolve a reference to this external sstable.
    Locator remote.Locator
    // ObjName is the unique name of this sstable on Locator.
    ObjName string
    // Size of the referenced portion of the virtualized sstable. An estimate
    // is acceptable in lieu of the backing file size.
    Size uint64
    // SmallestUserKey and LargestUserKey are the [smallest, largest) user key
    // bounds of the sstable. Both of these bounds are loose, i.e. it's possible
    // for the sstable to not span the entirety of this range. However, multiple
    // ExternalFiles in one ingestion must all have non-overlapping
    // [smallest, largest) spans. Note that this Largest bound is exclusive.
    SmallestUserKey, LargestUserKey []byte
    // HasPointKey and HasRangeKey denote whether this file contains point keys
    // or range keys. If both fields are false, an error is returned during
    // ingestion.
    HasPointKey, HasRangeKey bool
}

// IngestWithStats does the same as Ingest, and additionally returns
// IngestOperationStats.
func (d *DB) IngestWithStats(paths []string) (IngestOperationStats, error) {
    if err := d.closed.Load(); err != nil {
        panic(err)
    }
    if d.opts.ReadOnly {
        return IngestOperationStats{}, ErrReadOnly
    }
    return d.ingest(paths, ingestTargetLevel, nil /* shared */, KeyRange{}, nil /* external */)
}

// IngestExternalFiles does the same as IngestWithStats, and additionally
// accepts external files (with locator info that can be resolved using
// d.opts.SharedStorage). These files must also be non-overlapping with
// each other, and must be resolvable through d.objProvider.
func (d *DB) IngestExternalFiles(external []ExternalFile) (IngestOperationStats, error) {
    if err := d.closed.Load(); err != nil {
        panic(err)
    }

    if d.opts.ReadOnly {
        return IngestOperationStats{}, ErrReadOnly
    }
    if d.opts.Experimental.RemoteStorage == nil {
        return IngestOperationStats{}, errors.New("pebble: cannot ingest external files without shared storage configured")
    }
    return d.ingest(nil, ingestTargetLevel, nil /* shared */, KeyRange{}, external)
}

// IngestAndExcise does the same as IngestWithStats, and additionally accepts a
// list of shared files to ingest that can be read from a remote.Storage through
// a Provider. All the shared files must live within exciseSpan, and any existing
// keys in exciseSpan are deleted by turning existing sstables into virtual
// sstables (if not virtual already) and shrinking their spans to exclude
// exciseSpan. See the comment at Ingest for a more complete picture of the
// ingestion process.
//
// Panics if this DB instance was not instantiated with a remote.Storage and
// shared sstables are present.
func (d *DB) IngestAndExcise(
    paths []string, shared []SharedSSTMeta, exciseSpan KeyRange,
) (IngestOperationStats, error) {
    if err := d.closed.Load(); err != nil {
        panic(err)
    }
    if d.opts.ReadOnly {
        return IngestOperationStats{}, ErrReadOnly
    }
    return d.ingest(paths, ingestTargetLevel, shared, exciseSpan, nil /* external */)
}
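// A usage sketch (illustrative only; the path, keys, writer options, writable,
// and locator below are hypothetical, and error handling is elided). A local
// sstable is built with sstable.Writer, made durable via Close, and then
// ingested:
//
//    w := sstable.NewWriter(writable, writerOpts) // writable wraps a file on the DB's vfs.FS
//    _ = w.Set([]byte("a"), []byte("v"))
//    _ = w.Close()
//    _ = db.Ingest([]string{"/tmp/ingest.sst"})
//
// An external file is referenced in place rather than copied:
//
//    stats, _ := db.IngestExternalFiles([]ExternalFile{{
//        Locator:         "bucket",     // hypothetical remote.Locator
//        ObjName:         "000042.sst", // hypothetical object name
//        Size:            1 << 20,
//        SmallestUserKey: []byte("a"),
//        LargestUserKey:  []byte("z"), // exclusive
//        HasPointKey:     true,
//    }})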
// Both DB.mu and commitPipeline.mu must be held while this is called.
func (d *DB) newIngestedFlushableEntry(
    meta []*fileMetadata, seqNum uint64, logNum FileNum,
) (*flushableEntry, error) {
    // Update the sequence number for all of the sstables in the
    // metadata. Writing the metadata to the manifest when the
    // version edit is applied is the mechanism that persists the
    // sequence number. The sstables themselves are left unmodified.
    // In this case, a version edit will only be written to the manifest
    // when the flushable is eventually flushed. If Pebble restarts in that
    // time, then we'll lose the ingest sequence number information. But this
    // information will also be reconstructed on node restart.
    if err := ingestUpdateSeqNum(
        d.cmp, d.opts.Comparer.FormatKey, seqNum, ingestLoadResult{localMeta: meta},
    ); err != nil {
        return nil, err
    }

    f := newIngestedFlushable(meta, d.opts.Comparer, d.newIters, d.tableNewRangeKeyIter)

    // NB: The logNum/seqNum are the WAL number which we're writing this entry
    // to and the sequence number within the WAL which we'll write this entry
    // to.
    entry := d.newFlushableEntry(f, logNum, seqNum)
    // The flushable entry starts off with a single reader ref, so increment
    // the FileMetadata.Refs.
    for _, file := range f.files {
        file.Ref()
    }
    entry.unrefFiles = func() []*fileBacking {
        var obsolete []*fileBacking
        for _, file := range f.files {
            if file.Unref() == 0 {
                obsolete = append(obsolete, file.FileMetadata.FileBacking)
            }
        }
        return obsolete
    }

    entry.flushForced = true
    entry.releaseMemAccounting = func() {}
    return entry, nil
}
// Both DB.mu and commitPipeline.mu must be held while this is called. Since
// we're holding both locks, the order in which we rotate the memtable or
// recycle the WAL in this function is irrelevant as long as the correct log
// numbers are assigned to the appropriate flushable.
func (d *DB) handleIngestAsFlushable(meta []*fileMetadata, seqNum uint64) error {
    b := d.NewBatch()
    for _, m := range meta {
        b.ingestSST(m.FileNum)
    }
    b.setSeqNum(seqNum)

    // If the WAL is disabled, then the logNum used to create the flushable
    // entry doesn't matter. We just use the logNum assigned to the current
    // mutable memtable. If the WAL is enabled, then this logNum will be
    // overwritten by the logNum of the log which will contain the log entry
    // for the ingestedFlushable.
    logNum := d.mu.mem.queue[len(d.mu.mem.queue)-1].logNum
    if !d.opts.DisableWAL {
        // We create a new WAL for the flushable instead of reusing the end of
        // the previous WAL. This simplifies the increment of the minimum
        // unflushed log number, and also simplifies WAL replay.
        var prevLogSize uint64
        logNum, prevLogSize = d.rotateWAL()
        // As the rotator of the WAL, we're responsible for updating the
        // previous flushable queue tail's log size.
        d.mu.mem.queue[len(d.mu.mem.queue)-1].logSize = prevLogSize

        d.mu.Unlock()
        err := d.commit.directWrite(b)
        if err != nil {
            d.opts.Logger.Fatalf("%v", err)
        }
        d.mu.Lock()
    }

    entry, err := d.newIngestedFlushableEntry(meta, seqNum, logNum)
    if err != nil {
        return err
    }
    nextSeqNum := seqNum + uint64(b.Count())

    // Set newLogNum to the logNum of the previous flushable. This value is
    // irrelevant if the WAL is disabled. If the WAL is enabled, then we set
    // the appropriate value below.
    newLogNum := d.mu.mem.queue[len(d.mu.mem.queue)-1].logNum
    if !d.opts.DisableWAL {
        // newLogNum will be the WAL num of the next mutable memtable which
        // comes after the ingestedFlushable in the flushable queue. The mutable
        // memtable will be created below.
        //
        // The prevLogSize returned by rotateWAL is the size of the WAL to which
        // the flushable ingest keys were appended. This intermediary WAL is only
        // used to record the flushable ingest and nothing else.
        newLogNum, entry.logSize = d.rotateWAL()
    }

    currMem := d.mu.mem.mutable
    // NB: Placing ingested sstables above the current memtables
    // requires rotating of the existing memtables/WAL. There is
    // some concern of churning through tiny memtables due to
    // ingested sstables being placed on top of them, but those
    // memtables would have to be flushed anyways.
    d.mu.mem.queue = append(d.mu.mem.queue, entry)
    d.rotateMemtable(newLogNum, nextSeqNum, currMem)
    d.updateReadStateLocked(d.opts.DebugCheck)
    d.maybeScheduleFlush()
    return nil
}
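// A sketch of the resulting flushable queue when the WAL is enabled (the WAL
// numbers are arbitrary): starting from "... | memtable (WAL 7, mutable)",
// handleIngestAsFlushable produces
//
//    ... | memtable (WAL 7) | ingestedFlushable (WAL 8) | memtable (WAL 9, mutable)
//
// where WAL 8 records only the ingestSST batch entries for the flushable
// ingest, and subsequent writes go to WAL 9.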
// See comment at Ingest() for details on how this works.
func (d *DB) ingest(
    paths []string,
    targetLevelFunc ingestTargetLevelFunc,
    shared []SharedSSTMeta,
    exciseSpan KeyRange,
    external []ExternalFile,
) (IngestOperationStats, error) {
    if len(shared) > 0 && d.opts.Experimental.RemoteStorage == nil {
        panic("cannot ingest shared sstables with nil SharedStorage")
    }
    if (exciseSpan.Valid() || len(shared) > 0 || len(external) > 0) && d.FormatMajorVersion() < FormatVirtualSSTables {
        return IngestOperationStats{}, errors.New("pebble: format major version too old for excise, shared or external sstable ingestion")
    }
    // Allocate file numbers for all of the files being ingested and mark them as
    // pending in order to prevent them from being deleted. Note that this causes
    // the file number ordering to be out of alignment with sequence number
    // ordering. The sorting of L0 tables by sequence number avoids relying on
    // that (busted) invariant.
    d.mu.Lock()
    pendingOutputs := make([]base.DiskFileNum, len(paths)+len(shared)+len(external))
    for i := 0; i < len(paths)+len(shared)+len(external); i++ {
        pendingOutputs[i] = d.mu.versions.getNextFileNum().DiskFileNum()
    }

    jobID := d.mu.nextJobID
    d.mu.nextJobID++
    d.mu.Unlock()

    // Load the metadata for all the files being ingested. This step detects
    // and elides empty sstables.
    loadResult, err := ingestLoad(d.opts, d.FormatMajorVersion(), paths, shared, external, d.cacheID, pendingOutputs, d.objProvider, jobID)
    if err != nil {
        return IngestOperationStats{}, err
    }

    if loadResult.fileCount == 0 {
        // All of the sstables to be ingested were empty. Nothing to do.
        return IngestOperationStats{}, nil
    }

    // Verify the sstables do not overlap.
    if err := ingestSortAndVerify(d.cmp, loadResult, exciseSpan); err != nil {
        return IngestOperationStats{}, err
    }

    // Hard link the sstables into the DB directory. Since the sstables aren't
    // referenced by a version, they won't be used. If the hard linking fails
    // (e.g. because the files reside on a different filesystem), ingestLink will
    // fall back to copying, and if that fails we undo our work and return an
    // error.
    if err := ingestLink(jobID, d.opts, d.objProvider, loadResult, shared); err != nil {
        return IngestOperationStats{}, err
    }

    // Make the new tables durable. We need to do this at some point before we
    // update the MANIFEST (via logAndApply), otherwise a crash can have the
    // tables referenced in the MANIFEST, but not present in the provider.
    if err := d.objProvider.Sync(); err != nil {
        return IngestOperationStats{}, err
    }

    // metaFlushableOverlaps is a slice parallel to meta indicating which of the
    // ingested sstables overlap some table in the flushable queue. It's used to
    // approximate ingest-into-L0 stats when using flushable ingests.
    metaFlushableOverlaps := make([]bool, loadResult.fileCount)
    var mem *flushableEntry
    var mut *memTable
    // asFlushable indicates whether the sstable was ingested as a flushable.
    var asFlushable bool
    prepare := func(seqNum uint64) {
        // Note that d.commit.mu is held by commitPipeline when calling prepare.

        d.mu.Lock()
        defer d.mu.Unlock()

        // Check to see if any files overlap with any of the memtables. The queue
        // is ordered from oldest to newest with the mutable memtable being the
        // last element in the slice. We want to wait for the newest table that
        // overlaps.

        for i := len(d.mu.mem.queue) - 1; i >= 0; i-- {
            m := d.mu.mem.queue[i]
            iter := m.newIter(nil)
            rangeDelIter := m.newRangeDelIter(nil)
            rkeyIter := m.newRangeKeyIter(nil)

            checkForOverlap := func(i int, meta *fileMetadata) {
                if metaFlushableOverlaps[i] {
                    // This table already overlapped a more recent flushable.
                    return
                }
                kr := internalKeyRange{
                    smallest: meta.Smallest,
                    largest:  meta.Largest,
                }
                if overlapWithIterator(iter, &rangeDelIter, rkeyIter, kr, d.cmp) {
                    // If this is the first table to overlap a flushable, save
                    // the flushable. This ingest must be ingested or flushed
                    // after it.
                    if mem == nil {
                        mem = m
                    }
                    metaFlushableOverlaps[i] = true
                }
            }
            for i := range loadResult.localMeta {
                checkForOverlap(i, loadResult.localMeta[i])
            }
            for i := range loadResult.sharedMeta {
                checkForOverlap(len(loadResult.localMeta)+i, loadResult.sharedMeta[i])
            }
            for i := range loadResult.externalMeta {
                checkForOverlap(len(loadResult.localMeta)+len(loadResult.sharedMeta)+i, loadResult.externalMeta[i])
            }
            if exciseSpan.Valid() {
                kr := internalKeyRange{
                    smallest: base.MakeInternalKey(exciseSpan.Start, InternalKeySeqNumMax, InternalKeyKindMax),
                    largest:  base.MakeExclusiveSentinelKey(InternalKeyKindRangeDelete, exciseSpan.End),
                }
                if overlapWithIterator(iter, &rangeDelIter, rkeyIter, kr, d.cmp) {
                    if mem == nil {
                        mem = m
                    }
                }
            }
            err := iter.Close()
            if rangeDelIter != nil {
                err = firstError(err, rangeDelIter.Close())
            }
            if rkeyIter != nil {
                err = firstError(err, rkeyIter.Close())
            }
            if err != nil {
                d.opts.Logger.Infof("ingest error reading flushable for log %s: %s", m.logNum, err)
            }
        }

        if mem == nil {
            // No overlap with any of the queued flushables, so no need to queue
            // after them.

            // New writes with higher sequence numbers may be concurrently
            // committed. We must ensure they don't flush before this ingest
            // completes. To do that, we ref the mutable memtable as a writer,
            // preventing its flushing (and the flushing of all subsequent
            // flushables in the queue). Once we've acquired the manifest lock
            // to add the ingested sstables to the LSM, we can unref as we're
            // guaranteed that the flush won't edit the LSM before this ingest.
            mut = d.mu.mem.mutable
            mut.writerRef()
            return
        }
        // The ingestion overlaps with some entry in the flushable queue.
        if d.FormatMajorVersion() < FormatFlushableIngest ||
            d.opts.Experimental.DisableIngestAsFlushable() ||
            len(shared) > 0 || exciseSpan.Valid() || len(external) > 0 ||
            (len(d.mu.mem.queue) > d.opts.MemTableStopWritesThreshold-1) {
            // We're not able to ingest as a flushable, so we must synchronously
            // flush.
            //
            // TODO(bilal): Currently, if any of the files being ingested are shared or
            // there's an excise span present, we cannot use flushable ingests and need
            // to wait synchronously. Either remove this caveat by fleshing out
            // flushable ingest logic to also account for these cases, or remove this
            // comment. Tracking issue: https://github.com/cockroachdb/pebble/issues/2676
            if mem.flushable == d.mu.mem.mutable {
                err = d.makeRoomForWrite(nil)
            }
            // New writes with higher sequence numbers may be concurrently
            // committed. We must ensure they don't flush before this ingest
            // completes. To do that, we ref the mutable memtable as a writer,
            // preventing its flushing (and the flushing of all subsequent
            // flushables in the queue). Once we've acquired the manifest lock
            // to add the ingested sstables to the LSM, we can unref as we're
            // guaranteed that the flush won't edit the LSM before this ingest.
            mut = d.mu.mem.mutable
            mut.writerRef()
            mem.flushForced = true
            d.maybeScheduleFlush()
            return
        }
        // Since there aren't too many memtables already queued up, we can
        // slide the ingested sstables on top of the existing memtables.
        asFlushable = true
        err = d.handleIngestAsFlushable(loadResult.localMeta, seqNum)
    }

    var ve *versionEdit
    apply := func(seqNum uint64) {
        if err != nil || asFlushable {
            // An error occurred during prepare.
            if mut != nil {
                if mut.writerUnref() {
                    d.mu.Lock()
                    d.maybeScheduleFlush()
                    d.mu.Unlock()
                }
            }
            return
        }

        // Update the sequence numbers for all ingested sstables'
        // metadata. When the version edit is applied, the metadata is
        // written to the manifest, persisting the sequence number.
        // The sstables themselves are left unmodified.
        if err = ingestUpdateSeqNum(
            d.cmp, d.opts.Comparer.FormatKey, seqNum, loadResult,
        ); err != nil {
            if mut != nil {
                if mut.writerUnref() {
                    d.mu.Lock()
                    d.maybeScheduleFlush()
                    d.mu.Unlock()
                }
            }
            return
        }

        // If we overlapped with a memtable in prepare, wait for the flush to
        // finish.
        if mem != nil {
            <-mem.flushed
        }

        // Assign the sstables to the correct level in the LSM and apply the
        // version edit.
        ve, err = d.ingestApply(jobID, loadResult, targetLevelFunc, mut, exciseSpan)
    }

    // Only one ingest can occur at a time because if not, one would block waiting
    // for the other to finish applying. This blocking would happen while holding
    // the commit mutex which would prevent unrelated batches from writing their
    // changes to the WAL and memtable. This will cause a bigger commit hiccup
    // during ingestion.
    d.commit.ingestSem <- struct{}{}
    d.commit.AllocateSeqNum(loadResult.fileCount, prepare, apply)
    <-d.commit.ingestSem

    if err != nil {
        if err2 := ingestCleanup(d.objProvider, loadResult.localMeta); err2 != nil {
            d.opts.Logger.Infof("ingest cleanup failed: %v", err2)
        }
    } else {
        // Since we either created a hard link to the ingesting files or copied
        // them over, it is safe to remove the original paths.
1520 for _, path := range loadResult.localPaths { 1521 if err2 := d.opts.FS.Remove(path); err2 != nil { 1522 d.opts.Logger.Infof("ingest failed to remove original file: %s", err2) 1523 } 1524 } 1525 } 1526 1527 if invariants.Enabled { 1528 for _, sharedMeta := range loadResult.sharedMeta { 1529 d.checkVirtualBounds(sharedMeta) 1530 } 1531 } 1532 1533 info := TableIngestInfo{ 1534 JobID: jobID, 1535 Err: err, 1536 flushable: asFlushable, 1537 } 1538 if len(loadResult.localMeta) > 0 { 1539 info.GlobalSeqNum = loadResult.localMeta[0].SmallestSeqNum 1540 } else if len(loadResult.sharedMeta) > 0 { 1541 info.GlobalSeqNum = loadResult.sharedMeta[0].SmallestSeqNum 1542 } else { 1543 info.GlobalSeqNum = loadResult.externalMeta[0].SmallestSeqNum 1544 } 1545 var stats IngestOperationStats 1546 if ve != nil { 1547 info.Tables = make([]struct { 1548 TableInfo 1549 Level int 1550 }, len(ve.NewFiles)) 1551 for i := range ve.NewFiles { 1552 e := &ve.NewFiles[i] 1553 info.Tables[i].Level = e.Level 1554 info.Tables[i].TableInfo = e.Meta.TableInfo() 1555 stats.Bytes += e.Meta.Size 1556 if e.Level == 0 { 1557 stats.ApproxIngestedIntoL0Bytes += e.Meta.Size 1558 } 1559 if i < len(metaFlushableOverlaps) && metaFlushableOverlaps[i] { 1560 stats.MemtableOverlappingFiles++ 1561 } 1562 } 1563 } else if asFlushable { 1564 // NB: If asFlushable == true, there are no shared sstables. 1565 info.Tables = make([]struct { 1566 TableInfo 1567 Level int 1568 }, len(loadResult.localMeta)) 1569 for i, f := range loadResult.localMeta { 1570 info.Tables[i].Level = -1 1571 info.Tables[i].TableInfo = f.TableInfo() 1572 stats.Bytes += f.Size 1573 // We don't have exact stats on which files will be ingested into 1574 // L0, because actual ingestion into the LSM has been deferred until 1575 // flush time. Instead, we infer based on memtable overlap. 1576 // 1577 // TODO(jackson): If we optimistically compute data overlap (#2112) 1578 // before entering the commit pipeline, we can use that overlap to 1579 // improve our approximation by incorporating overlap with L0, not 1580 // just memtables. 1581 if metaFlushableOverlaps[i] { 1582 stats.ApproxIngestedIntoL0Bytes += f.Size 1583 stats.MemtableOverlappingFiles++ 1584 } 1585 } 1586 } 1587 d.opts.EventListener.TableIngested(info) 1588 1589 return stats, err 1590 } 1591 1592 // excise updates ve to include a replacement of the file m with new virtual 1593 // sstables that exclude exciseSpan, returning a slice of newly-created files if 1594 // any. If the entirety of m is deleted by exciseSpan, no new sstables are added 1595 // and m is deleted. Note that ve is updated in-place. 1596 // 1597 // The manifest lock must be held when calling this method. 1598 func (d *DB) excise( 1599 exciseSpan KeyRange, m *fileMetadata, ve *versionEdit, level int, 1600 ) ([]manifest.NewFileEntry, error) { 1601 numCreatedFiles := 0 1602 // Check if there's actually an overlap between m and exciseSpan. 1603 if !exciseSpan.Overlaps(d.cmp, m) { 1604 return nil, nil 1605 } 1606 ve.DeletedFiles[deletedFileEntry{ 1607 Level: level, 1608 FileNum: m.FileNum, 1609 }] = m 1610 // Fast path: m sits entirely within the exciseSpan, so just delete it. 1611 if exciseSpan.Contains(d.cmp, m.Smallest) && exciseSpan.Contains(d.cmp, m.Largest) { 1612 return nil, nil 1613 } 1614 var iter internalIterator 1615 var rangeDelIter keyspan.FragmentIterator 1616 var rangeKeyIter keyspan.FragmentIterator 1617 needsBacking := false 1618 // Create a file to the left of the excise span, if necessary. 
1619 // The bounds of this file will be [m.Smallest, lastKeyBefore(exciseSpan.Start)]. 1620 // 1621 // We create bounds that are tight on user keys, and we make the effort to find 1622 // the last key in the original sstable that's smaller than exciseSpan.Start 1623 // even though it requires some sstable reads. We could choose to create 1624 // virtual sstables on loose userKey bounds, in which case we could just set 1625 // leftFile.Largest to an exclusive sentinel at exciseSpan.Start. The biggest 1626 // issue with that approach would be that it'd lead to lots of small virtual 1627 // sstables in the LSM that have no guarantee on containing even a single user 1628 // key within the file bounds. This has the potential to increase both read and 1629 // write-amp as we will be opening up these sstables only to find no relevant 1630 // keys in the read path, and compacting sstables on top of them instead of 1631 // directly into the space occupied by them. We choose to incur the cost of 1632 // calculating tight bounds at this time instead of creating more work in the 1633 // future. 1634 // 1635 // TODO(bilal): Some of this work can happen without grabbing the manifest 1636 // lock; we could grab one currentVersion, release the lock, calculate excised 1637 // files, then grab the lock again and recalculate for just the files that 1638 // have changed since our previous calculation. Do this optimization as part of 1639 // https://github.com/cockroachdb/pebble/issues/2112. 1640 if d.cmp(m.Smallest.UserKey, exciseSpan.Start) < 0 { 1641 leftFile := &fileMetadata{ 1642 Virtual: true, 1643 FileBacking: m.FileBacking, 1644 FileNum: d.mu.versions.getNextFileNum(), 1645 // Note that these are loose bounds for smallest/largest seqnums, but they're 1646 // sufficient for maintaining correctness. 1647 SmallestSeqNum: m.SmallestSeqNum, 1648 LargestSeqNum: m.LargestSeqNum, 1649 } 1650 if m.HasPointKeys && !exciseSpan.Contains(d.cmp, m.SmallestPointKey) { 1651 // This file will contain point keys 1652 smallestPointKey := m.SmallestPointKey 1653 var err error 1654 iter, rangeDelIter, err = d.newIters(context.TODO(), m, &IterOptions{level: manifest.Level(level)}, internalIterOpts{}) 1655 if err != nil { 1656 return nil, err 1657 } 1658 var key *InternalKey 1659 if iter != nil { 1660 defer iter.Close() 1661 key, _ = iter.SeekLT(exciseSpan.Start, base.SeekLTFlagsNone) 1662 } else { 1663 iter = emptyIter 1664 } 1665 if key != nil { 1666 leftFile.ExtendPointKeyBounds(d.cmp, smallestPointKey, key.Clone()) 1667 } 1668 // Store the min of (exciseSpan.Start, rdel.End) in lastRangeDel. This 1669 // needs to be a copy if the key is owned by the range del iter. 1670 var lastRangeDel []byte 1671 if rangeDelIter != nil { 1672 defer rangeDelIter.Close() 1673 rdel := rangeDelIter.SeekLT(exciseSpan.Start) 1674 if rdel != nil { 1675 lastRangeDel = append(lastRangeDel[:0], rdel.End...)
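// rdel.End was copied above because it may be owned by rangeDelIter and
// invalidated by later iterator operations; the clamp below can assign
// exciseSpan.Start directly, as the caller's span remains stable for the
// duration of the excise.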
1676 if d.cmp(lastRangeDel, exciseSpan.Start) > 0 { 1677 lastRangeDel = exciseSpan.Start 1678 } 1679 } 1680 } else { 1681 rangeDelIter = emptyKeyspanIter 1682 } 1683 if lastRangeDel != nil { 1684 leftFile.ExtendPointKeyBounds(d.cmp, smallestPointKey, base.MakeExclusiveSentinelKey(InternalKeyKindRangeDelete, lastRangeDel)) 1685 } 1686 } 1687 if m.HasRangeKeys && !exciseSpan.Contains(d.cmp, m.SmallestRangeKey) { 1688 // This file will contain range keys 1689 var err error 1690 smallestRangeKey := m.SmallestRangeKey 1691 rangeKeyIter, err = d.tableNewRangeKeyIter(m, keyspan.SpanIterOptions{}) 1692 if err != nil { 1693 return nil, err 1694 } 1695 // Store the min of (exciseSpan.Start, rkey.End) in lastRangeKey. This 1696 // needs to be a copy if the key is owned by the range key iter. 1697 var lastRangeKey []byte 1698 var lastRangeKeyKind InternalKeyKind 1699 defer rangeKeyIter.Close() 1700 rkey := rangeKeyIter.SeekLT(exciseSpan.Start) 1701 if rkey != nil { 1702 lastRangeKey = append(lastRangeKey[:0], rkey.End...) 1703 if d.cmp(lastRangeKey, exciseSpan.Start) > 0 { 1704 lastRangeKey = exciseSpan.Start 1705 } 1706 lastRangeKeyKind = rkey.Keys[0].Kind() 1707 } 1708 if lastRangeKey != nil { 1709 leftFile.ExtendRangeKeyBounds(d.cmp, smallestRangeKey, base.MakeExclusiveSentinelKey(lastRangeKeyKind, lastRangeKey)) 1710 } 1711 } 1712 if leftFile.HasRangeKeys || leftFile.HasPointKeys { 1713 var err error 1714 leftFile.Size, err = d.tableCache.estimateSize(m, leftFile.Smallest.UserKey, leftFile.Largest.UserKey) 1715 if err != nil { 1716 return nil, err 1717 } 1718 if leftFile.Size == 0 { 1719 // On occasion, estimateSize gives us a low estimate, i.e. a 0 file size, 1720 // such as if the excised file only has range keys/dels and no point 1721 // keys. This can cause panics in places where we divide by file sizes. 1722 // Correct for it here. 1723 leftFile.Size = 1 1724 } 1725 if err := leftFile.Validate(d.cmp, d.opts.Comparer.FormatKey); err != nil { 1726 return nil, err 1727 } 1728 leftFile.ValidateVirtual(m) 1729 d.checkVirtualBounds(leftFile) 1730 ve.NewFiles = append(ve.NewFiles, newFileEntry{Level: level, Meta: leftFile}) 1731 needsBacking = true 1732 numCreatedFiles++ 1733 } 1734 } 1735 // Create a file to the right, if necessary. 1736 if exciseSpan.Contains(d.cmp, m.Largest) { 1737 // No key exists to the right of the excise span in this file. 1738 if needsBacking && !m.Virtual { 1739 // If m is virtual, then its file backing is already known to the manifest. 1740 // We don't need to create another file backing. Note that there must be 1741 // only one CreatedBackingTables entry per backing sstable. This is 1742 // indicated by the VersionEdit.CreatedBackingTables invariant. 1743 ve.CreatedBackingTables = append(ve.CreatedBackingTables, m.FileBacking) 1744 } 1745 return ve.NewFiles[len(ve.NewFiles)-numCreatedFiles:], nil 1746 } 1747 // Create a new file, rightFile, between [firstKeyAfter(exciseSpan.End), m.Largest]. 1748 // 1749 // See comment before the definition of leftFile for the motivation behind 1750 // calculating tight user-key bounds. 1751 rightFile := &fileMetadata{ 1752 Virtual: true, 1753 FileBacking: m.FileBacking, 1754 FileNum: d.mu.versions.getNextFileNum(), 1755 // Note that these are loose bounds for smallest/largest seqnums, but they're 1756 // sufficient for maintaining correctness. 
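// "Loose" here means that every key in this virtual file carries a seqnum
// within [m.SmallestSeqNum, m.LargestSeqNum], though that interval may be
// wider than the seqnums the file's keys actually span.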
1757 SmallestSeqNum: m.SmallestSeqNum, 1758 LargestSeqNum: m.LargestSeqNum, 1759 } 1760 if m.HasPointKeys && !exciseSpan.Contains(d.cmp, m.LargestPointKey) { 1761 // This file will contain point keys 1762 largestPointKey := m.LargestPointKey 1763 var err error 1764 if iter == nil && rangeDelIter == nil { 1765 iter, rangeDelIter, err = d.newIters(context.TODO(), m, &IterOptions{level: manifest.Level(level)}, internalIterOpts{}) 1766 if err != nil { 1767 return nil, err 1768 } 1769 if iter != nil { 1770 defer iter.Close() 1771 } else { 1772 iter = emptyIter 1773 } 1774 if rangeDelIter != nil { 1775 defer rangeDelIter.Close() 1776 } else { 1777 rangeDelIter = emptyKeyspanIter 1778 } 1779 } 1780 key, _ := iter.SeekGE(exciseSpan.End, base.SeekGEFlagsNone) 1781 if key != nil { 1782 rightFile.ExtendPointKeyBounds(d.cmp, key.Clone(), largestPointKey) 1783 } 1784 // Store the max of (exciseSpan.End, rdel.Start) in firstRangeDel. This 1785 // needs to be a copy if the key is owned by the range del iter. 1786 var firstRangeDel []byte 1787 rdel := rangeDelIter.SeekGE(exciseSpan.End) 1788 if rdel != nil { 1789 firstRangeDel = append(firstRangeDel[:0], rdel.Start...) 1790 if d.cmp(firstRangeDel, exciseSpan.End) < 0 { 1791 firstRangeDel = exciseSpan.End 1792 } 1793 } 1794 if firstRangeDel != nil { 1795 smallestPointKey := rdel.SmallestKey() 1796 smallestPointKey.UserKey = firstRangeDel 1797 rightFile.ExtendPointKeyBounds(d.cmp, smallestPointKey, largestPointKey) 1798 } 1799 } 1800 if m.HasRangeKeys && !exciseSpan.Contains(d.cmp, m.LargestRangeKey) { 1801 // This file will contain range keys. 1802 largestRangeKey := m.LargestRangeKey 1803 if rangeKeyIter == nil { 1804 var err error 1805 rangeKeyIter, err = d.tableNewRangeKeyIter(m, keyspan.SpanIterOptions{}) 1806 if err != nil { 1807 return nil, err 1808 } 1809 defer rangeKeyIter.Close() 1810 } 1811 // Store the max of (exciseSpan.End, rkey.Start) in firstRangeKey. This 1812 // needs to be a copy if the key is owned by the range key iter. 1813 var firstRangeKey []byte 1814 rkey := rangeKeyIter.SeekGE(exciseSpan.End) 1815 if rkey != nil { 1816 firstRangeKey = append(firstRangeKey[:0], rkey.Start...) 1817 if d.cmp(firstRangeKey, exciseSpan.End) < 0 { 1818 firstRangeKey = exciseSpan.End 1819 } 1820 } 1821 if firstRangeKey != nil { 1822 smallestRangeKey := rkey.SmallestKey() 1823 smallestRangeKey.UserKey = firstRangeKey 1824 // We call ExtendRangeKeyBounds so any internal boundType fields are 1825 // set correctly. Note that this is mildly wasteful as we'll be comparing 1826 // rightFile.{Smallest,Largest}RangeKey with themselves, which can be 1827 // avoided if we exported ExtendOverallKeyBounds or so. 1828 rightFile.ExtendRangeKeyBounds(d.cmp, smallestRangeKey, largestRangeKey) 1829 } 1830 } 1831 if rightFile.HasRangeKeys || rightFile.HasPointKeys { 1832 var err error 1833 rightFile.Size, err = d.tableCache.estimateSize(m, rightFile.Smallest.UserKey, rightFile.Largest.UserKey) 1834 if err != nil { 1835 return nil, err 1836 } 1837 if rightFile.Size == 0 { 1838 // On occasion, estimateSize gives us a low estimate, i.e. a 0 file size, 1839 // such as if the excised file only has range keys/dels and no point keys. 1840 // This can cause panics in places where we divide by file sizes. Correct 1841 // for it here. 
1842 rightFile.Size = 1 1843 } 1844 rightFile.ValidateVirtual(m) 1845 d.checkVirtualBounds(rightFile) 1846 ve.NewFiles = append(ve.NewFiles, newFileEntry{Level: level, Meta: rightFile}) 1847 needsBacking = true 1848 numCreatedFiles++ 1849 } 1850 1851 if needsBacking && !m.Virtual { 1852 // If m is virtual, then its file backing is already known to the manifest. 1853 // We don't need to create another file backing. Note that there must be 1854 // only one CreatedBackingTables entry per backing sstable. This is 1855 // indicated by the VersionEdit.CreatedBackingTables invariant. 1856 ve.CreatedBackingTables = append(ve.CreatedBackingTables, m.FileBacking) 1857 } 1858 1859 if err := rightFile.Validate(d.cmp, d.opts.Comparer.FormatKey); err != nil { 1860 return nil, err 1861 } 1862 return ve.NewFiles[len(ve.NewFiles)-numCreatedFiles:], nil 1863 } 1864 1865 type ingestTargetLevelFunc func( 1866 newIters tableNewIters, 1867 newRangeKeyIter keyspan.TableNewSpanIter, 1868 iterOps IterOptions, 1869 comparer *Comparer, 1870 v *version, 1871 baseLevel int, 1872 compactions map[*compaction]struct{}, 1873 meta *fileMetadata, 1874 suggestSplit bool, 1875 ) (int, *fileMetadata, error) 1876 1877 type ingestSplitFile struct { 1878 // ingestFile is the file being ingested. 1879 ingestFile *fileMetadata 1880 // splitFile is the file that needs to be split to allow ingestFile to slot 1881 // into `level` level. 1882 splitFile *fileMetadata 1883 // The level where ingestFile will go (and where splitFile already is). 1884 level int 1885 } 1886 1887 // ingestSplit splits files specified in `files` and updates ve in-place to 1888 // account for existing files getting split into two virtual sstables. The map 1889 // `replacedFiles` contains an in-progress map of all files that have been 1890 // replaced with new virtual sstables in this version edit so far, which is also 1891 // updated in-place. 1892 // 1893 // d.mu as well as the manifest lock must be held when calling this method. 1894 func (d *DB) ingestSplit( 1895 ve *versionEdit, 1896 updateMetrics func(*fileMetadata, int, []newFileEntry), 1897 files []ingestSplitFile, 1898 replacedFiles map[base.FileNum][]newFileEntry, 1899 ) error { 1900 for _, s := range files { 1901 // replacedFiles can be thought of as a tree, where we start iterating with 1902 // s.splitFile and run its fileNum through replacedFiles, then find which of 1903 // the replaced files overlaps with s.ingestFile, which becomes the new 1904 // splitFile, then we check splitFile's replacements in replacedFiles again 1905 // for overlap with s.ingestFile, and so on until we either can't find the 1906 // current splitFile in replacedFiles (i.e. that's the file that now needs to 1907 // be split), or we don't find a file that overlaps with s.ingestFile, which 1908 // means a prior ingest split already produced enough room for s.ingestFile 1909 // to go into this level without necessitating another ingest split. 1910 splitFile := s.splitFile 1911 for splitFile != nil { 1912 replaced, ok := replacedFiles[splitFile.FileNum] 1913 if !ok { 1914 break 1915 } 1916 updatedSplitFile := false 1917 for i := range replaced { 1918 if replaced[i].Meta.Overlaps(d.cmp, s.ingestFile.Smallest.UserKey, s.ingestFile.Largest.UserKey, s.ingestFile.Largest.IsExclusiveSentinel()) { 1919 if updatedSplitFile { 1920 // This should never happen because the earlier ingestTargetLevel 1921 // function only finds split file candidates that are guaranteed to 1922 // have no data overlap, only boundary overlap. 
See the comments 1923 // in that method to see the definitions of data vs boundary 1924 // overlap. That, plus the fact that files in `replaced` are 1925 // guaranteed to have file bounds that are tight on user keys 1926 // (as that's what `d.excise` produces), means that the only case 1927 // where we overlap with two or more files in `replaced` is if we 1928 // actually had data overlap all along, or if the ingestion files 1929 // were overlapping, either of which is an invariant violation. 1930 panic("updated with two files in ingestSplit") 1931 } 1932 splitFile = replaced[i].Meta 1933 updatedSplitFile = true 1934 } 1935 } 1936 if !updatedSplitFile { 1937 // None of the replaced files overlapped with the file being ingested. 1938 // This can happen if we've already excised a span overlapping with 1939 // this file, or if we have consecutive ingested files that can slide 1940 // within the same gap between keys in an existing file. For instance, 1941 // if an existing file has keys a and g and we're ingesting b-c, d-e, 1942 // the first loop iteration will split the existing file into one that 1943 // ends in a and another that starts at g, and the second iteration will 1944 // fall into this case and require no splitting. 1945 // 1946 // No splitting necessary. 1947 splitFile = nil 1948 } 1949 } 1950 if splitFile == nil { 1951 continue 1952 } 1953 // NB: excise operates on [start, end). We're splitting at [start, end] 1954 // (assuming !s.ingestFile.Largest.IsExclusiveSentinel()). The conflation 1955 // of exclusive vs inclusive end bounds should not make a difference here 1956 // as we're guaranteed to not have any data overlap between splitFile and 1957 // s.ingestFile, so panic if we do see a newly added file with an endKey 1958 // equalling s.ingestFile.Largest, and !s.ingestFile.Largest.IsExclusiveSentinel() 1959 added, err := d.excise(KeyRange{Start: s.ingestFile.Smallest.UserKey, End: s.ingestFile.Largest.UserKey}, splitFile, ve, s.level) 1960 if err != nil { 1961 return err 1962 } 1963 if _, ok := ve.DeletedFiles[deletedFileEntry{ 1964 Level: s.level, 1965 FileNum: splitFile.FileNum, 1966 }]; !ok { 1967 panic("did not split file that was expected to be split") 1968 } 1969 replacedFiles[splitFile.FileNum] = added 1970 for i := range added { 1971 if s.ingestFile.Overlaps(d.cmp, added[i].Meta.Smallest.UserKey, added[i].Meta.Largest.UserKey, added[i].Meta.Largest.IsExclusiveSentinel()) { 1972 panic("ingest-time split produced a file that overlaps with ingested file") 1973 } 1974 } 1975 updateMetrics(splitFile, s.level, added) 1976 } 1977 // Flatten the version edit by removing any entries from ve.NewFiles that 1978 // are also in ve.DeletedFiles. 
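// A file can end up in both sets when it was created earlier within this same
// version edit (e.g. by an excise) and then split again for a later ingested
// file; without this pass, ve would both add and delete that file.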
1979 newNewFiles := ve.NewFiles[:0] 1980 for i := range ve.NewFiles { 1981 fn := ve.NewFiles[i].Meta.FileNum 1982 deEntry := deletedFileEntry{Level: ve.NewFiles[i].Level, FileNum: fn} 1983 if _, ok := ve.DeletedFiles[deEntry]; ok { 1984 delete(ve.DeletedFiles, deEntry) 1985 } else { 1986 newNewFiles = append(newNewFiles, ve.NewFiles[i]) 1987 } 1988 } 1989 ve.NewFiles = newNewFiles 1990 return nil 1991 } 1992 1993 func (d *DB) ingestApply( 1994 jobID int, 1995 lr ingestLoadResult, 1996 findTargetLevel ingestTargetLevelFunc, 1997 mut *memTable, 1998 exciseSpan KeyRange, 1999 ) (*versionEdit, error) { 2000 d.mu.Lock() 2001 defer d.mu.Unlock() 2002 2003 ve := &versionEdit{ 2004 NewFiles: make([]newFileEntry, lr.fileCount), 2005 } 2006 if exciseSpan.Valid() || (d.opts.Experimental.IngestSplit != nil && d.opts.Experimental.IngestSplit()) { 2007 ve.DeletedFiles = map[manifest.DeletedFileEntry]*manifest.FileMetadata{} 2008 } 2009 metrics := make(map[int]*LevelMetrics) 2010 2011 // Lock the manifest for writing before we use the current version to 2012 // determine the target level. This prevents two concurrent ingestion jobs 2013 // from using the same version to determine the target level, and also 2014 // provides serialization with concurrent compaction and flush jobs. 2015 // logAndApply unconditionally releases the manifest lock, but any earlier 2016 // returns must unlock the manifest. 2017 d.mu.versions.logLock() 2018 2019 if mut != nil { 2020 // Unref the mutable memtable to allow its flush to proceed. Now that we've 2021 // acquired the manifest lock, we can be certain that if the mutable 2022 // memtable has received more recent conflicting writes, the flush won't 2023 // beat us to applying to the manifest, resulting in sequence number 2024 // inversion. Even though we call maybeScheduleFlush right now, this flush 2025 // will apply after our ingestion. 2026 if mut.writerUnref() { 2027 d.maybeScheduleFlush() 2028 } 2029 } 2030 2031 shouldIngestSplit := d.opts.Experimental.IngestSplit != nil && 2032 d.opts.Experimental.IngestSplit() && d.FormatMajorVersion() >= FormatVirtualSSTables 2033 current := d.mu.versions.currentVersion() 2034 baseLevel := d.mu.versions.picker.getBaseLevel() 2035 iterOps := IterOptions{logger: d.opts.Logger} 2036 // filesToSplit is a list where each element is a pair consisting of a file 2037 // being ingested and a file being split to make room for an ingestion into 2038 // that level. Each ingested file will appear at most once in this list. It 2039 // is possible for split files to appear twice in this list. 2040 filesToSplit := make([]ingestSplitFile, 0) 2041 checkCompactions := false 2042 for i := 0; i < lr.fileCount; i++ { 2043 // Determine the lowest level in the LSM for which the sstable doesn't 2044 // overlap any existing files in the level. 2045 var m *fileMetadata 2046 sharedIdx := -1 2047 sharedLevel := -1 2048 externalFile := false 2049 if i < len(lr.localMeta) { 2050 // local file. 2051 m = lr.localMeta[i] 2052 } else if (i - len(lr.localMeta)) < len(lr.sharedMeta) { 2053 // shared file. 2054 sharedIdx = i - len(lr.localMeta) 2055 m = lr.sharedMeta[sharedIdx] 2056 sharedLevel = int(lr.sharedLevels[sharedIdx]) 2057 } else { 2058 // external file.
2059 externalFile = true 2060 m = lr.externalMeta[i-(len(lr.localMeta)+len(lr.sharedMeta))] 2061 } 2062 f := &ve.NewFiles[i] 2063 var err error 2064 if sharedIdx >= 0 { 2065 f.Level = sharedLevel 2066 if f.Level < sharedLevelsStart { 2067 panic("cannot slot a shared file higher than the highest shared level") 2068 } 2069 ve.CreatedBackingTables = append(ve.CreatedBackingTables, m.FileBacking) 2070 } else { 2071 if externalFile { 2072 ve.CreatedBackingTables = append(ve.CreatedBackingTables, m.FileBacking) 2073 } 2074 var splitFile *fileMetadata 2075 if exciseSpan.Valid() && exciseSpan.Contains(d.cmp, m.Smallest) && exciseSpan.Contains(d.cmp, m.Largest) { 2076 // This file fits perfectly within the excise span. We can slot it at 2077 // L6, or sharedLevelsStart - 1 if we have shared files. 2078 if len(lr.sharedMeta) > 0 { 2079 f.Level = sharedLevelsStart - 1 2080 if baseLevel > f.Level { 2081 f.Level = 0 2082 } 2083 } else { 2084 f.Level = 6 2085 } 2086 } else { 2087 // TODO(bilal): findTargetLevel does disk IO (reading files for data 2088 // overlap) even though we're holding onto d.mu. Consider unlocking 2089 // d.mu while we do this. We already hold versions.logLock so we should 2090 // not see any version applications while we're at this. The one 2091 // complication here would be pulling out the mu.compact.inProgress 2092 // check from findTargetLevel, as that requires d.mu to be held. 2093 f.Level, splitFile, err = findTargetLevel( 2094 d.newIters, d.tableNewRangeKeyIter, iterOps, d.opts.Comparer, current, baseLevel, d.mu.compact.inProgress, m, shouldIngestSplit) 2095 } 2096 2097 if splitFile != nil { 2098 if invariants.Enabled { 2099 if lf := current.Levels[f.Level].Find(d.cmp, splitFile); lf == nil { 2100 panic("splitFile returned is not in level it should be") 2101 } 2102 } 2103 // We take advantage of the fact that we won't drop the db mutex 2104 // between now and the call to logAndApply. So, no files should 2105 // get added to a new in-progress compaction at this point. We can 2106 // avoid having to iterate on in-progress compactions to cancel them 2107 // if none of the files being split have a compacting state. 2108 if splitFile.IsCompacting() { 2109 checkCompactions = true 2110 } 2111 filesToSplit = append(filesToSplit, ingestSplitFile{ingestFile: m, splitFile: splitFile, level: f.Level}) 2112 } 2113 } 2114 if err != nil { 2115 d.mu.versions.logUnlock() 2116 return nil, err 2117 } 2118 f.Meta = m 2119 levelMetrics := metrics[f.Level] 2120 if levelMetrics == nil { 2121 levelMetrics = &LevelMetrics{} 2122 metrics[f.Level] = levelMetrics 2123 } 2124 levelMetrics.NumFiles++ 2125 levelMetrics.Size += int64(m.Size) 2126 levelMetrics.BytesIngested += m.Size 2127 levelMetrics.TablesIngested++ 2128 } 2129 // replacedFiles maps files excised due to exciseSpan (or splitFiles returned 2130 // by ingestTargetLevel), to files that were created to replace it. This map 2131 // is used to resolve references to split files in filesToSplit, as it is 2132 // possible for a file that we want to split to no longer exist or have a 2133 // newer fileMetadata due to a split induced by another ingestion file, or an 2134 // excise. 
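// For example (hypothetical file numbers and metadata): if file 7 is excised
// into two virtual files backed by metadata f9 and f10,
//
//	replacedFiles[7] = []newFileEntry{{Level: 5, Meta: f9}, {Level: 5, Meta: f10}}
//
// and a later ingest-time split of f10 then populates
//
//	replacedFiles[10] = []newFileEntry{{Level: 5, Meta: f11}, {Level: 5, Meta: f12}}
//
// so the lookup loop in ingestSplit chases 7 -> 10 -> ... until it reaches a
// file number with no entry in the map.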
2135 replacedFiles := make(map[base.FileNum][]newFileEntry) 2136 updateLevelMetricsOnExcise := func(m *fileMetadata, level int, added []newFileEntry) { 2137 levelMetrics := metrics[level] 2138 if levelMetrics == nil { 2139 levelMetrics = &LevelMetrics{} 2140 metrics[level] = levelMetrics 2141 } 2142 levelMetrics.NumFiles-- 2143 levelMetrics.Size -= int64(m.Size) 2144 for i := range added { 2145 levelMetrics.NumFiles++ 2146 levelMetrics.Size += int64(added[i].Meta.Size) 2147 } 2148 } 2149 if exciseSpan.Valid() { 2150 // Iterate through all levels and find files that intersect with exciseSpan. 2151 // 2152 // TODO(bilal): We could drop the DB mutex here as we don't need it for 2153 // excises; we only need to hold the version lock which we already are 2154 // holding. However releasing the DB mutex could mess with the 2155 // ingestTargetLevel calculation that happened above, as it assumed that it 2156 // had a complete view of in-progress compactions that wouldn't change 2157 // until logAndApply is called. If we were to drop the mutex now, we could 2158 // schedule another in-progress compaction that would go into the chosen target 2159 // level and lead to file overlap within level (which would panic in 2160 // logAndApply). We should drop the db mutex here, do the excise, then 2161 // re-grab the DB mutex and rerun just the in-progress compaction check to 2162 // see if any new compactions are conflicting with our chosen target levels 2163 // for files, and if they are, we should signal those compactions to error 2164 // out. 2165 for level := range current.Levels { 2166 overlaps := current.Overlaps(level, d.cmp, exciseSpan.Start, exciseSpan.End, true /* exclusiveEnd */) 2167 iter := overlaps.Iter() 2168 2169 for m := iter.First(); m != nil; m = iter.Next() { 2170 newFiles, err := d.excise(exciseSpan, m, ve, level) 2171 if err != nil { 2172 return nil, err 2173 } 2174 2175 if _, ok := ve.DeletedFiles[deletedFileEntry{ 2176 Level: level, 2177 FileNum: m.FileNum, 2178 }]; !ok { 2179 // We did not excise this file. 2180 continue 2181 } 2182 replacedFiles[m.FileNum] = newFiles 2183 updateLevelMetricsOnExcise(m, level, newFiles) 2184 } 2185 } 2186 } 2187 if len(filesToSplit) > 0 { 2188 // For the same reasons as the above call to excise, we hold the db mutex 2189 // while calling this method. 2190 if err := d.ingestSplit(ve, updateLevelMetricsOnExcise, filesToSplit, replacedFiles); err != nil { 2191 return nil, err 2192 } 2193 } 2194 if len(filesToSplit) > 0 || exciseSpan.Valid() { 2195 for c := range d.mu.compact.inProgress { 2196 if c.versionEditApplied { 2197 continue 2198 } 2199 // Check if this compaction overlaps with the excise span. Note that just 2200 // checking if the inputs individually overlap with the excise span 2201 // isn't sufficient; for instance, a compaction could have [a,b] and [e,f] 2202 // as inputs and write it all out as [a,b,e,f] in one sstable. If we're 2203 // doing a [c,d) excise at the same time as this compaction, we will have 2204 // to error out the whole compaction as we can't guarantee it hasn't/won't 2205 // write a file overlapping with the excise span. 2206 if exciseSpan.OverlapsInternalKeyRange(d.cmp, c.smallest, c.largest) { 2207 c.cancel.Store(true) 2208 } 2209 // Check if this compaction's inputs have been replaced due to an 2210 // ingest-time split. In that case, cancel the compaction as a newly picked 2211 // compaction would need to include any new files that slid in between 2212 // previously-existing files. 
Note that we cancel any compaction that has a 2213 // file that was ingest-split as an input, even if it started before this 2214 // ingestion. 2215 if checkCompactions { 2216 for i := range c.inputs { 2217 iter := c.inputs[i].files.Iter() 2218 for f := iter.First(); f != nil; f = iter.Next() { 2219 if _, ok := replacedFiles[f.FileNum]; ok { 2220 c.cancel.Store(true) 2221 break 2222 } 2223 } 2224 } 2225 } 2226 } 2227 // Check for any EventuallyFileOnlySnapshots that could be watching for 2228 // an excise on this span. 2229 if exciseSpan.Valid() { 2230 for s := d.mu.snapshots.root.next; s != &d.mu.snapshots.root; s = s.next { 2231 if s.efos == nil { 2232 continue 2233 } 2234 efos := s.efos 2235 // TODO(bilal): We can make this faster by taking advantage of the sorted 2236 // nature of protectedRanges to do a sort.Search, or even maintaining a 2237 // global list of all protected ranges instead of having to peer into every 2238 // snapshot. 2239 for i := range efos.protectedRanges { 2240 if efos.protectedRanges[i].OverlapsKeyRange(d.cmp, exciseSpan) { 2241 efos.excised.Store(true) 2242 break 2243 } 2244 } 2245 } 2246 } 2247 } 2248 if err := d.mu.versions.logAndApply(jobID, ve, metrics, false /* forceRotation */, func() []compactionInfo { 2249 return d.getInProgressCompactionInfoLocked(nil) 2250 }); err != nil { 2251 return nil, err 2252 } 2253 2254 d.mu.versions.metrics.Ingest.Count++ 2255 2256 d.updateReadStateLocked(d.opts.DebugCheck) 2257 // updateReadStateLocked could have generated obsolete tables; schedule a 2258 // cleanup job if necessary. 2259 d.deleteObsoleteFiles(jobID) 2260 d.updateTableStatsLocked(ve.NewFiles) 2261 // The ingestion may have pushed a level over the threshold for compaction, 2262 // so check to see if one is necessary and schedule it. 2263 d.maybeScheduleCompaction() 2264 var toValidate []manifest.NewFileEntry 2265 dedup := make(map[base.DiskFileNum]struct{}) 2266 for _, entry := range ve.NewFiles { 2267 if _, ok := dedup[entry.Meta.FileBacking.DiskFileNum]; !ok { 2268 toValidate = append(toValidate, entry) 2269 dedup[entry.Meta.FileBacking.DiskFileNum] = struct{}{} 2270 } 2271 } 2272 d.maybeValidateSSTablesLocked(toValidate) 2273 return ve, nil 2274 } 2275 2276 // maybeValidateSSTablesLocked adds the slice of newFileEntrys to the pending 2277 // queue of files to be validated when the feature is enabled. 2278 // 2279 // Note that if two entries with the same backing file are added, then the 2280 // block checksums for the backing file will be validated twice. 2281 // 2282 // DB.mu must be locked when calling. 2283 func (d *DB) maybeValidateSSTablesLocked(newFiles []newFileEntry) { 2284 // Only add to the validation queue when the feature is enabled. 2285 if !d.opts.Experimental.ValidateOnIngest { 2286 return 2287 } 2288 2289 d.mu.tableValidation.pending = append(d.mu.tableValidation.pending, newFiles...) 2290 if d.shouldValidateSSTablesLocked() { 2291 go d.validateSSTables() 2292 } 2293 } 2294 2295 // shouldValidateSSTablesLocked returns true if SSTable validation should run. 2296 // DB.mu must be locked when calling. 2297 func (d *DB) shouldValidateSSTablesLocked() bool { 2298 return !d.mu.tableValidation.validating && 2299 d.closed.Load() == nil && 2300 d.opts.Experimental.ValidateOnIngest && 2301 len(d.mu.tableValidation.pending) > 0 2302 } 2303 2304 // validateSSTables runs a round of validation on the tables in the pending 2305 // queue.
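// Validation is self-rescheduling: if more files are queued while a round is
// in flight, the tail of this function spawns another round in a fresh
// goroutine.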
2306 func (d *DB) validateSSTables() { 2307 d.mu.Lock() 2308 if !d.shouldValidateSSTablesLocked() { 2309 d.mu.Unlock() 2310 return 2311 } 2312 2313 pending := d.mu.tableValidation.pending 2314 d.mu.tableValidation.pending = nil 2315 d.mu.tableValidation.validating = true 2316 jobID := d.mu.nextJobID 2317 d.mu.nextJobID++ 2318 rs := d.loadReadState() 2319 2320 // Drop DB.mu before performing IO. 2321 d.mu.Unlock() 2322 2323 // Validate all tables in the pending queue. This could lead to a situation 2324 // where we are starving IO from other tasks due to having to page through 2325 // all the blocks in all the sstables in the queue. 2326 // TODO(travers): Add some form of pacing to avoid IO starvation. 2327 for _, f := range pending { 2328 // The file may have been moved or deleted since it was ingested, in 2329 // which case we skip. 2330 if !rs.current.Contains(f.Level, d.cmp, f.Meta) { 2331 // Assume the file was moved to a lower level. It is rare enough 2332 // that a table is moved or deleted between the time it was ingested 2333 // and the time the validation routine runs that the overall cost of 2334 // this inner loop is tolerably low, when amortized over all 2335 // ingested tables. 2336 found := false 2337 for i := f.Level + 1; i < numLevels; i++ { 2338 if rs.current.Contains(i, d.cmp, f.Meta) { 2339 found = true 2340 break 2341 } 2342 } 2343 if !found { 2344 continue 2345 } 2346 } 2347 2348 var err error 2349 if f.Meta.Virtual { 2350 err = d.tableCache.withVirtualReader( 2351 f.Meta.VirtualMeta(), func(v sstable.VirtualReader) error { 2352 return v.ValidateBlockChecksumsOnBacking() 2353 }) 2354 } else { 2355 err = d.tableCache.withReader( 2356 f.Meta.PhysicalMeta(), func(r *sstable.Reader) error { 2357 return r.ValidateBlockChecksums() 2358 }) 2359 } 2360 2361 if err != nil { 2362 // TODO(travers): Hook into the corruption reporting pipeline, once 2363 // available. See pebble#1192. 2364 d.opts.Logger.Fatalf("pebble: encountered corruption during ingestion: %s", err) 2365 } 2366 2367 d.opts.EventListener.TableValidated(TableValidatedInfo{ 2368 JobID: jobID, 2369 Meta: f.Meta, 2370 }) 2371 } 2372 rs.unref() 2373 2374 d.mu.Lock() 2375 defer d.mu.Unlock() 2376 d.mu.tableValidation.validating = false 2377 d.mu.tableValidation.cond.Broadcast() 2378 if d.shouldValidateSSTablesLocked() { 2379 go d.validateSSTables() 2380 } 2381 }
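// A minimal caller-side sketch of the validation hook above (illustrative
// only; the store directory and sstable path are placeholders, and error
// handling is elided). With Experimental.ValidateOnIngest set, every
// DB.Ingest call enqueues the newly added tables for the block-checksum
// validation performed by validateSSTables, and each validated table is
// reported through EventListener.TableValidated.
//
//	opts := &pebble.Options{}
//	opts.Experimental.ValidateOnIngest = true
//	db, err := pebble.Open("demo-db", opts)
//	if err != nil {
//		// handle the open failure
//	}
//	// external.sst must contain keys with zero sequence numbers; the global
//	// sequence number is assigned at commit time (see ingestUpdateSeqNum).
//	if err := db.Ingest([]string{"external.sst"}); err != nil {
//		// handle the ingestion failure
//	}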