// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
	"context"
	"slices"
	"sort"
	"time"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/invariants"
	"github.com/cockroachdb/pebble/internal/keyspan"
	"github.com/cockroachdb/pebble/internal/manifest"
	"github.com/cockroachdb/pebble/internal/private"
	"github.com/cockroachdb/pebble/objstorage"
	"github.com/cockroachdb/pebble/objstorage/remote"
	"github.com/cockroachdb/pebble/sstable"
)

func sstableKeyCompare(userCmp Compare, a, b InternalKey) int {
	c := userCmp(a.UserKey, b.UserKey)
	if c != 0 {
		return c
	}
	if a.IsExclusiveSentinel() {
		if !b.IsExclusiveSentinel() {
			return -1
		}
	} else if b.IsExclusiveSentinel() {
		return +1
	}
	return 0
}

// KeyRange encodes a key range in user key space. A KeyRange's Start is
// inclusive while its End is exclusive.
type KeyRange struct {
	Start, End []byte
}

// Valid returns true if the KeyRange is defined.
func (k *KeyRange) Valid() bool {
	return k.Start != nil && k.End != nil
}

// Contains returns whether the specified key exists in the KeyRange.
func (k *KeyRange) Contains(cmp base.Compare, key InternalKey) bool {
	v := cmp(key.UserKey, k.End)
	return (v < 0 || (v == 0 && key.IsExclusiveSentinel())) && cmp(k.Start, key.UserKey) <= 0
}

// OverlapsInternalKeyRange checks if the specified internal key range has an
// overlap with the KeyRange. Note that we aren't checking for full containment
// of smallest-largest within k, rather just that there's some intersection
// between the two ranges.
func (k *KeyRange) OverlapsInternalKeyRange(cmp base.Compare, smallest, largest InternalKey) bool {
	v := cmp(k.Start, largest.UserKey)
	return v <= 0 && !(largest.IsExclusiveSentinel() && v == 0) &&
		cmp(k.End, smallest.UserKey) > 0
}

// Overlaps checks if the specified file has an overlap with the KeyRange.
// Note that we aren't checking for full containment of m within k, rather just
// that there's some intersection between m and k's bounds.
func (k *KeyRange) Overlaps(cmp base.Compare, m *fileMetadata) bool {
	return k.OverlapsInternalKeyRange(cmp, m.Smallest, m.Largest)
}

// OverlapsKeyRange checks if this span overlaps with the provided KeyRange.
// Note that we aren't checking for full containment of either span in the other,
// just that there's a key x that is in both key ranges.
func (k *KeyRange) OverlapsKeyRange(cmp Compare, span KeyRange) bool {
	return cmp(k.Start, span.End) < 0 && cmp(k.End, span.Start) > 0
}

func ingestValidateKey(opts *Options, key *InternalKey) error {
	if key.Kind() == InternalKeyKindInvalid {
		return base.CorruptionErrorf("pebble: external sstable has corrupted key: %s",
			key.Pretty(opts.Comparer.FormatKey))
	}
	if key.SeqNum() != 0 {
		return base.CorruptionErrorf("pebble: external sstable has non-zero seqnum: %s",
			key.Pretty(opts.Comparer.FormatKey))
	}
	return nil
}
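
// For example (an illustrative sketch, assuming the default comparer): an
// exclusive sentinel sorts immediately before a point key with the same user
// key, so a range deletion ending at "k" does not cover "k" itself:
//
//	cmp := base.DefaultComparer.Compare
//	set := base.MakeInternalKey([]byte("k"), 1, base.InternalKeyKindSet)
//	sentinel := base.MakeRangeDeleteSentinelKey([]byte("k"))
//	sstableKeyCompare(cmp, sentinel, set) // -1
//	sstableKeyCompare(cmp, set, sentinel) // +1
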
// ingestSynthesizeShared constructs a fileMetadata for one shared sstable
// owned or shared by another node.
func ingestSynthesizeShared(
	opts *Options, sm SharedSSTMeta, fileNum base.DiskFileNum,
) (*fileMetadata, error) {
	if sm.Size == 0 {
		// Disallow 0 file sizes
		return nil, errors.New("pebble: cannot ingest shared file with size 0")
	}
	// Don't load table stats. Doing a round trip to shared storage, one SST
	// at a time is not worth it as it slows down ingestion.
	meta := &fileMetadata{
		FileNum:      fileNum.FileNum(),
		CreationTime: time.Now().Unix(),
		Virtual:      true,
		Size:         sm.Size,
	}
	meta.InitProviderBacking(fileNum)
	// Set the underlying FileBacking's size to the same size as the virtualized
	// view of the sstable. This ensures that we don't over-prioritize this
	// sstable for compaction just yet, as we do not have a clear sense of what
	// parts of this sstable are referenced by other nodes.
	meta.FileBacking.Size = sm.Size
	if sm.LargestRangeKey.Valid() && sm.LargestRangeKey.UserKey != nil {
		// Initialize meta.{HasRangeKeys,Smallest,Largest}, etc.
		//
		// NB: We create new internal keys and pass them into ExtendRangeKeyBounds
		// so that we can sub a zero sequence number into the bounds. We can set
		// the sequence number to anything here; it'll be reset in ingestUpdateSeqNum
		// anyway. However we do need to use the same sequence number across all
		// bound keys at this step so that we end up with bounds that are consistent
		// across point/range keys.
		smallestRangeKey := base.MakeInternalKey(sm.SmallestRangeKey.UserKey, 0, sm.SmallestRangeKey.Kind())
		largestRangeKey := base.MakeExclusiveSentinelKey(sm.LargestRangeKey.Kind(), sm.LargestRangeKey.UserKey)
		meta.ExtendRangeKeyBounds(opts.Comparer.Compare, smallestRangeKey, largestRangeKey)
	}
	if sm.LargestPointKey.Valid() && sm.LargestPointKey.UserKey != nil {
		// Initialize meta.{HasPointKeys,Smallest,Largest}, etc.
		//
		// See point above in the ExtendRangeKeyBounds call on why we use a zero
		// sequence number here.
		smallestPointKey := base.MakeInternalKey(sm.SmallestPointKey.UserKey, 0, sm.SmallestPointKey.Kind())
		largestPointKey := base.MakeInternalKey(sm.LargestPointKey.UserKey, 0, sm.LargestPointKey.Kind())
		if sm.LargestPointKey.IsExclusiveSentinel() {
			largestPointKey = base.MakeRangeDeleteSentinelKey(sm.LargestPointKey.UserKey)
		}
		meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallestPointKey, largestPointKey)
	}
	if err := meta.Validate(opts.Comparer.Compare, opts.Comparer.FormatKey); err != nil {
		return nil, err
	}
	return meta, nil
}
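
// To make the bounds synthesis above concrete, a sketch with illustrative
// values: for a shared file whose point keys span [a, c), with the largest
// point key an exclusive sentinel at c, the synthesized bounds are
//
//	SmallestPointKey = a#0,SET
//	LargestPointKey  = c#max,RANGEDEL (exclusive sentinel)
//
// The zero sequence number is a placeholder; ingestUpdateSeqNum overwrites it
// with the real sequence number once one has been allocated.
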
// ingestLoad1External loads the fileMetadata for one external sstable.
// Sequence number and target level calculation happens during prepare/apply.
func ingestLoad1External(
	opts *Options,
	e ExternalFile,
	fileNum base.DiskFileNum,
	objprovider objstorage.Provider,
	jobID int,
) (*fileMetadata, error) {
	if e.Size == 0 {
		// Disallow 0 file sizes
		return nil, errors.New("pebble: cannot ingest external file with size 0")
	}
	if !e.HasRangeKey && !e.HasPointKey {
		return nil, errors.New("pebble: cannot ingest external file with no point or range keys")
	}
	// Don't load table stats. Doing a round trip to shared storage, one SST
	// at a time is not worth it as it slows down ingestion.
	meta := &fileMetadata{}
	meta.FileNum = fileNum.FileNum()
	meta.CreationTime = time.Now().Unix()
	meta.Virtual = true
	meta.Size = e.Size
	meta.InitProviderBacking(fileNum)

	// Try to resolve a reference to the external file.
	backing, err := objprovider.CreateExternalObjectBacking(e.Locator, e.ObjName)
	if err != nil {
		return nil, err
	}
	metas, err := objprovider.AttachRemoteObjects([]objstorage.RemoteObjectToAttach{{
		FileNum:  fileNum,
		FileType: fileTypeTable,
		Backing:  backing,
	}})
	if err != nil {
		return nil, err
	}
	if opts.EventListener.TableCreated != nil {
		opts.EventListener.TableCreated(TableCreateInfo{
			JobID:   jobID,
			Reason:  "ingesting",
			Path:    objprovider.Path(metas[0]),
			FileNum: fileNum.FileNum(),
		})
	}
	// In the name of keeping this ingestion as fast as possible, we avoid
	// *all* existence checks and synthesize a file metadata with smallest/largest
	// keys that overlap whatever the passed-in span was.
	smallestCopy := make([]byte, len(e.SmallestUserKey))
	copy(smallestCopy, e.SmallestUserKey)
	largestCopy := make([]byte, len(e.LargestUserKey))
	copy(largestCopy, e.LargestUserKey)
	if e.HasPointKey {
		meta.ExtendPointKeyBounds(opts.Comparer.Compare, base.MakeInternalKey(smallestCopy, 0, InternalKeyKindMax),
			base.MakeRangeDeleteSentinelKey(largestCopy))
	}
	if e.HasRangeKey {
		meta.ExtendRangeKeyBounds(opts.Comparer.Compare, base.MakeInternalKey(smallestCopy, 0, InternalKeyKindRangeKeySet),
			base.MakeExclusiveSentinelKey(InternalKeyKindRangeKeyDelete, largestCopy))
	}

	// Set the underlying FileBacking's size to the same size as the virtualized
	// view of the sstable. This ensures that we don't over-prioritize this
	// sstable for compaction just yet, as we do not have a clear sense of
	// what parts of this sstable are referenced by other nodes.
	meta.FileBacking.Size = e.Size

	if err := meta.Validate(opts.Comparer.Compare, opts.Comparer.FormatKey); err != nil {
		return nil, err
	}
	return meta, nil
}
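
// The synthesized bounds above are intentionally loose. As a sketch
// (illustrative values): an ExternalFile with user-key bounds [b, f) gets
// point-key bounds
//
//	smallest = b#0,MAX         (inclusive lower bound)
//	largest  = f#max,RANGEDEL  (exclusive sentinel upper bound)
//
// which overlap every possible internal key in [b, f), regardless of which
// keys the backing sstable actually contains.
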
// ingestLoad1 creates the FileMetadata for one file. This file will be owned
// by this store.
func ingestLoad1(
	opts *Options,
	fmv FormatMajorVersion,
	readable objstorage.Readable,
	cacheID uint64,
	fileNum base.DiskFileNum,
) (*fileMetadata, error) {
	cacheOpts := private.SSTableCacheOpts(cacheID, fileNum).(sstable.ReaderOption)
	r, err := sstable.NewReader(readable, opts.MakeReaderOptions(), cacheOpts)
	if err != nil {
		return nil, err
	}
	defer r.Close()

	// Avoid ingesting tables with format versions this DB doesn't support.
	tf, err := r.TableFormat()
	if err != nil {
		return nil, err
	}
	if tf < fmv.MinTableFormat() || tf > fmv.MaxTableFormat() {
		return nil, errors.Newf(
			"pebble: table format %s is not within range supported at DB format major version %d, (%s,%s)",
			tf, fmv, fmv.MinTableFormat(), fmv.MaxTableFormat(),
		)
	}

	meta := &fileMetadata{}
	meta.FileNum = fileNum.FileNum()
	meta.Size = uint64(readable.Size())
	meta.CreationTime = time.Now().Unix()
	meta.InitPhysicalBacking()

	// Avoid loading into the table cache for collecting stats if we
	// don't need to. If there are no range deletions, we have all the
	// information to compute the stats here.
	//
	// This is helpful in tests for avoiding awkwardness around deletion of
	// ingested files from MemFS. MemFS implements the Windows semantics of
	// disallowing removal of an open file. Under MemFS, if we don't populate
	// meta.Stats here, the file will be loaded into the table cache for
	// calculating stats before we can remove the original link.
	maybeSetStatsFromProperties(meta.PhysicalMeta(), &r.Properties)

	{
		iter, err := r.NewIter(nil /* lower */, nil /* upper */)
		if err != nil {
			return nil, err
		}
		defer iter.Close()
		var smallest InternalKey
		if key, _ := iter.First(); key != nil {
			if err := ingestValidateKey(opts, key); err != nil {
				return nil, err
			}
			smallest = (*key).Clone()
		}
		if err := iter.Error(); err != nil {
			return nil, err
		}
		if key, _ := iter.Last(); key != nil {
			if err := ingestValidateKey(opts, key); err != nil {
				return nil, err
			}
			meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallest, key.Clone())
		}
		if err := iter.Error(); err != nil {
			return nil, err
		}
	}

	iter, err := r.NewRawRangeDelIter()
	if err != nil {
		return nil, err
	}
	if iter != nil {
		defer iter.Close()
		var smallest InternalKey
		if s := iter.First(); s != nil {
			key := s.SmallestKey()
			if err := ingestValidateKey(opts, &key); err != nil {
				return nil, err
			}
			smallest = key.Clone()
		}
		if err := iter.Error(); err != nil {
			return nil, err
		}
		if s := iter.Last(); s != nil {
			k := s.SmallestKey()
			if err := ingestValidateKey(opts, &k); err != nil {
				return nil, err
			}
			largest := s.LargestKey().Clone()
			meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallest, largest)
		}
	}

	// Update the range-key bounds for the table.
	{
		iter, err := r.NewRawRangeKeyIter()
		if err != nil {
			return nil, err
		}
		if iter != nil {
			defer iter.Close()
			var smallest InternalKey
			if s := iter.First(); s != nil {
				key := s.SmallestKey()
				if err := ingestValidateKey(opts, &key); err != nil {
					return nil, err
				}
				smallest = key.Clone()
			}
			if err := iter.Error(); err != nil {
				return nil, err
			}
			if s := iter.Last(); s != nil {
				k := s.SmallestKey()
				if err := ingestValidateKey(opts, &k); err != nil {
					return nil, err
				}
				// As range keys are fragmented, the end key of the last range key in
				// the table provides the upper bound for the table.
				largest := s.LargestKey().Clone()
				meta.ExtendRangeKeyBounds(opts.Comparer.Compare, smallest, largest)
			}
			if err := iter.Error(); err != nil {
				return nil, err
			}
		}
	}

	if !meta.HasPointKeys && !meta.HasRangeKeys {
		return nil, nil
	}

	// Sanity check that the various bounds on the file were set consistently.
	if err := meta.Validate(opts.Comparer.Compare, opts.Comparer.FormatKey); err != nil {
		return nil, err
	}

	return meta, nil
}

type ingestLoadResult struct {
	localMeta, sharedMeta []*fileMetadata
	externalMeta          []*fileMetadata
	localPaths            []string
	sharedLevels          []uint8
	fileCount             int
}

func ingestLoad(
	opts *Options,
	fmv FormatMajorVersion,
	paths []string,
	shared []SharedSSTMeta,
	external []ExternalFile,
	cacheID uint64,
	pending []base.DiskFileNum,
	objProvider objstorage.Provider,
	jobID int,
) (ingestLoadResult, error) {
	meta := make([]*fileMetadata, 0, len(paths))
	newPaths := make([]string, 0, len(paths))
	for i := range paths {
		f, err := opts.FS.Open(paths[i])
		if err != nil {
			return ingestLoadResult{}, err
		}

		readable, err := sstable.NewSimpleReadable(f)
		if err != nil {
			return ingestLoadResult{}, err
		}
		m, err := ingestLoad1(opts, fmv, readable, cacheID, pending[i])
		if err != nil {
			return ingestLoadResult{}, err
		}
		if m != nil {
			meta = append(meta, m)
			newPaths = append(newPaths, paths[i])
		}
	}
	if len(shared) == 0 && len(external) == 0 {
		return ingestLoadResult{localMeta: meta, localPaths: newPaths, fileCount: len(meta)}, nil
	}

	// Sort the shared files according to level.
	sort.Sort(sharedByLevel(shared))

	sharedMeta := make([]*fileMetadata, 0, len(shared))
	levels := make([]uint8, 0, len(shared))
	for i := range shared {
		m, err := ingestSynthesizeShared(opts, shared[i], pending[len(paths)+i])
		if err != nil {
			return ingestLoadResult{}, err
		}
		if shared[i].Level < sharedLevelsStart {
			return ingestLoadResult{}, errors.New("cannot ingest shared file in level below sharedLevelsStart")
		}
		sharedMeta = append(sharedMeta, m)
		levels = append(levels, shared[i].Level)
	}
	externalMeta := make([]*fileMetadata, 0, len(external))
	for i := range external {
		m, err := ingestLoad1External(opts, external[i], pending[len(paths)+len(shared)+i], objProvider, jobID)
		if err != nil {
			return ingestLoadResult{}, err
		}
		externalMeta = append(externalMeta, m)
	}
	result := ingestLoadResult{
		localMeta:    meta,
		sharedMeta:   sharedMeta,
		externalMeta: externalMeta,
		localPaths:   newPaths,
		sharedLevels: levels,
		fileCount:    len(meta) + len(sharedMeta) + len(externalMeta),
	}
	return result, nil
}

// Helper for sorting file metadata by smallest user key, while ensuring the
// matching path also gets swapped to the same index. For use in
// ingestSortAndVerify.
type metaAndPaths struct {
	meta  []*fileMetadata
	paths []string
	cmp   Compare
}

func (m metaAndPaths) Len() int {
	return len(m.meta)
}

func (m metaAndPaths) Less(i, j int) bool {
	return m.cmp(m.meta[i].Smallest.UserKey, m.meta[j].Smallest.UserKey) < 0
}

func (m metaAndPaths) Swap(i, j int) {
	m.meta[i], m.meta[j] = m.meta[j], m.meta[i]
	if m.paths != nil {
		m.paths[i], m.paths[j] = m.paths[j], m.paths[i]
	}
}
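
// For example (illustrative): given two out-of-order local files, sorting
// keeps each path aligned with its metadata:
//
//	// before: meta = [fB, fA], paths = ["b.sst", "a.sst"]
//	sort.Sort(&metaAndPaths{meta: meta, paths: paths, cmp: cmp})
//	// after:  meta = [fA, fB], paths = ["a.sst", "b.sst"]
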
func ingestSortAndVerify(cmp Compare, lr ingestLoadResult, exciseSpan KeyRange) error {
	// Verify that all the shared files (i.e. files in sharedMeta)
	// fit within the exciseSpan.
	for i := range lr.sharedMeta {
		f := lr.sharedMeta[i]
		if !exciseSpan.Contains(cmp, f.Smallest) || !exciseSpan.Contains(cmp, f.Largest) {
			return errors.AssertionFailedf("pebble: shared file outside of excise span, span [%s-%s), file = %s", exciseSpan.Start, exciseSpan.End, f.String())
		}
	}
	if len(lr.externalMeta) > 0 {
		if len(lr.localMeta) > 0 || len(lr.sharedMeta) > 0 {
			// Currently we only support external ingests on their own. If external
			// files are present alongside local/shared files, return an error.
			return errors.AssertionFailedf("pebble: external files cannot be ingested atomically alongside other types of files")
		}
		sort.Sort(&metaAndPaths{
			meta: lr.externalMeta,
			cmp:  cmp,
		})
		for i := 1; i < len(lr.externalMeta); i++ {
			if sstableKeyCompare(cmp, lr.externalMeta[i-1].Largest, lr.externalMeta[i].Smallest) >= 0 {
				return errors.AssertionFailedf("pebble: external sstables have overlapping ranges")
			}
		}
		return nil
	}
	if len(lr.localMeta) <= 1 || len(lr.localPaths) <= 1 {
		return nil
	}

	sort.Sort(&metaAndPaths{
		meta:  lr.localMeta,
		paths: lr.localPaths,
		cmp:   cmp,
	})

	for i := 1; i < len(lr.localPaths); i++ {
		if sstableKeyCompare(cmp, lr.localMeta[i-1].Largest, lr.localMeta[i].Smallest) >= 0 {
			return errors.AssertionFailedf("pebble: local ingestion sstables have overlapping ranges")
		}
	}
	if len(lr.sharedMeta) == 0 {
		return nil
	}
	filesInLevel := make([]*fileMetadata, 0, len(lr.sharedMeta))
	for l := sharedLevelsStart; l < numLevels; l++ {
		filesInLevel = filesInLevel[:0]
		for i := range lr.sharedMeta {
			if lr.sharedLevels[i] == uint8(l) {
				filesInLevel = append(filesInLevel, lr.sharedMeta[i])
			}
		}
		slices.SortFunc(filesInLevel, func(a, b *fileMetadata) int {
			return cmp(a.Smallest.UserKey, b.Smallest.UserKey)
		})
		for i := 1; i < len(filesInLevel); i++ {
			if sstableKeyCompare(cmp, filesInLevel[i-1].Largest, filesInLevel[i].Smallest) >= 0 {
				return errors.AssertionFailedf("pebble: external shared sstables have overlapping ranges")
			}
		}
	}
	return nil
}

func ingestCleanup(objProvider objstorage.Provider, meta []*fileMetadata) error {
	var firstErr error
	for i := range meta {
		if err := objProvider.Remove(fileTypeTable, meta[i].FileBacking.DiskFileNum); err != nil {
			firstErr = firstError(firstErr, err)
		}
	}
	return firstErr
}

// ingestLink creates new objects which are backed by either hardlinks to or
// copies of the ingested files. It also attaches shared objects to the provider.
func ingestLink(
	jobID int,
	opts *Options,
	objProvider objstorage.Provider,
	lr ingestLoadResult,
	shared []SharedSSTMeta,
) error {
	for i := range lr.localPaths {
		objMeta, err := objProvider.LinkOrCopyFromLocal(
			context.TODO(), opts.FS, lr.localPaths[i], fileTypeTable, lr.localMeta[i].FileBacking.DiskFileNum,
			objstorage.CreateOptions{PreferSharedStorage: true},
		)
		if err != nil {
			if err2 := ingestCleanup(objProvider, lr.localMeta[:i]); err2 != nil {
				opts.Logger.Errorf("ingest cleanup failed: %v", err2)
			}
			return err
		}
		if opts.EventListener.TableCreated != nil {
			opts.EventListener.TableCreated(TableCreateInfo{
				JobID:   jobID,
				Reason:  "ingesting",
				Path:    objProvider.Path(objMeta),
				FileNum: lr.localMeta[i].FileNum,
			})
		}
	}
	sharedObjs := make([]objstorage.RemoteObjectToAttach, 0, len(shared))
	for i := range shared {
		backing, err := shared[i].Backing.Get()
		if err != nil {
			return err
		}
		sharedObjs = append(sharedObjs, objstorage.RemoteObjectToAttach{
			FileNum:  lr.sharedMeta[i].FileBacking.DiskFileNum,
			FileType: fileTypeTable,
			Backing:  backing,
		})
	}
	sharedObjMetas, err := objProvider.AttachRemoteObjects(sharedObjs)
	if err != nil {
		return err
	}
	for i := range sharedObjMetas {
		// One corner case around file sizes we need to be mindful of, is that
		// if one of the sharedObjs was initially created by us (and has boomeranged
		// back from another node), we'll need to update the FileBacking's size
		// to be the true underlying size. Otherwise, we could hit errors when we
		// open the db again after a crash/restart (see checkConsistency in open.go),
		// plus it more accurately allows us to prioritize compactions of files
		// that were originally created by us.
		if sharedObjMetas[i].IsShared() && !objProvider.IsSharedForeign(sharedObjMetas[i]) {
			size, err := objProvider.Size(sharedObjMetas[i])
			if err != nil {
				return err
			}
			lr.sharedMeta[i].FileBacking.Size = uint64(size)
		}
		if opts.EventListener.TableCreated != nil {
			opts.EventListener.TableCreated(TableCreateInfo{
				JobID:   jobID,
				Reason:  "ingesting",
				Path:    objProvider.Path(sharedObjMetas[i]),
				FileNum: lr.sharedMeta[i].FileNum,
			})
		}
	}
	// We do not need to do anything about lr.externalMetas. Those were already
	// linked in ingestLoad.

	return nil
}

func ingestMemtableOverlaps(cmp Compare, mem flushable, keyRanges []internalKeyRange) bool {
	iter := mem.newIter(nil)
	rangeDelIter := mem.newRangeDelIter(nil)
	rkeyIter := mem.newRangeKeyIter(nil)

	closeIters := func() error {
		err := iter.Close()
		if rangeDelIter != nil {
			err = firstError(err, rangeDelIter.Close())
		}
		if rkeyIter != nil {
			err = firstError(err, rkeyIter.Close())
		}
		return err
	}

	for _, kr := range keyRanges {
		if overlapWithIterator(iter, &rangeDelIter, rkeyIter, kr, cmp) {
			closeIters()
			return true
		}
	}

	// Assume overlap if any iterator errored out.
	return closeIters() != nil
}
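
// To illustrate the assignment order in ingestUpdateSeqNum below, consider a
// hypothetical ingest of two local files plus two shared files destined for
// L5 and L6, with base sequence number n. Each file consumes one sequence
// number:
//
//	shared file at L6 -> n     (deeper levels get lower seqnums)
//	shared file at L5 -> n+1
//	local file #1     -> n+2
//	local file #2     -> n+3
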
func ingestUpdateSeqNum(
	cmp Compare, format base.FormatKey, seqNum uint64, loadResult ingestLoadResult,
) error {
	setSeqFn := func(k base.InternalKey) base.InternalKey {
		return base.MakeInternalKey(k.UserKey, seqNum, k.Kind())
	}
	updateMetadata := func(m *fileMetadata) error {
		// NB: we set the fields directly here, rather than via their Extend*
		// methods, as we are updating sequence numbers.
		if m.HasPointKeys {
			m.SmallestPointKey = setSeqFn(m.SmallestPointKey)
		}
		if m.HasRangeKeys {
			m.SmallestRangeKey = setSeqFn(m.SmallestRangeKey)
		}
		m.Smallest = setSeqFn(m.Smallest)
		// Only update the seqnum for the largest key if that key is not an
		// "exclusive sentinel" (i.e. a range deletion sentinel or a range key
		// boundary), as doing so effectively drops the exclusive sentinel (by
		// lowering the seqnum from the max value), and extends the bounds of the
		// table.
		// NB: as the largest range key is always an exclusive sentinel, it is never
		// updated.
		if m.HasPointKeys && !m.LargestPointKey.IsExclusiveSentinel() {
			m.LargestPointKey = setSeqFn(m.LargestPointKey)
		}
		if !m.Largest.IsExclusiveSentinel() {
			m.Largest = setSeqFn(m.Largest)
		}
		// Setting smallestSeqNum == largestSeqNum triggers the setting of
		// Properties.GlobalSeqNum when an sstable is loaded.
		m.SmallestSeqNum = seqNum
		m.LargestSeqNum = seqNum
		// Ensure the new bounds are consistent.
		if err := m.Validate(cmp, format); err != nil {
			return err
		}
		seqNum++
		return nil
	}

	// Shared sstables are required to be sorted by level ascending. We then
	// iterate the shared sstables in reverse, assigning the lower sequence
	// numbers to the shared sstables that will be ingested into the lower
	// (larger numbered) levels first. This ensures sequence number shadowing is
	// correct.
	for i := len(loadResult.sharedMeta) - 1; i >= 0; i-- {
		if i-1 >= 0 && loadResult.sharedLevels[i-1] > loadResult.sharedLevels[i] {
			panic(errors.AssertionFailedf("shared files %s, %s out of order", loadResult.sharedMeta[i-1], loadResult.sharedMeta[i]))
		}
		if err := updateMetadata(loadResult.sharedMeta[i]); err != nil {
			return err
		}
	}
	for i := range loadResult.localMeta {
		if err := updateMetadata(loadResult.localMeta[i]); err != nil {
			return err
		}
	}
	for i := range loadResult.externalMeta {
		if err := updateMetadata(loadResult.externalMeta[i]); err != nil {
			return err
		}
	}
	return nil
}

// Denotes an internal key range. Smallest and largest are both inclusive.
type internalKeyRange struct {
	smallest, largest InternalKey
}

func overlapWithIterator(
	iter internalIterator,
	rangeDelIter *keyspan.FragmentIterator,
	rkeyIter keyspan.FragmentIterator,
	keyRange internalKeyRange,
	cmp Compare,
) bool {
	// Check overlap with point operations.
	//
	// When using levelIter, it seeks to the SST whose boundaries
	// contain keyRange.smallest.UserKey(S).
	// It then tries to find a point in that SST that is >= S.
	// If there's no such point it means the SST ends in a tombstone in which case
	// levelIter.SeekGE generates a boundary range del sentinel.
	// The comparison of this boundary with keyRange.largest(L) below
	// is subtle but maintains correctness.
	// 1) boundary < L,
	//    since boundary is also > S (initial seek), whatever the boundary's start
	//    key may be, we're always overlapping.
	// 2) boundary > L,
	//    overlap with boundary cannot be determined since we don't know boundary's
	//    start key. We require checking for overlap with rangeDelIter.
	// 3) boundary == L and L is not sentinel,
	//    means boundary < L and hence is similar to 1).
	// 4) boundary == L and L is sentinel,
	//    we'll always overlap since for any values of i,j ranges [i, k) and [j, k)
	//    always overlap.
	key, _ := iter.SeekGE(keyRange.smallest.UserKey, base.SeekGEFlagsNone)
	if key != nil {
		c := sstableKeyCompare(cmp, *key, keyRange.largest)
		if c <= 0 {
			return true
		}
	}
	// Assume overlap if iterator errored.
	if err := iter.Error(); err != nil {
		return true
	}

	computeOverlapWithSpans := func(rIter keyspan.FragmentIterator) bool {
		// NB: The spans surfaced by the fragment iterator are non-overlapping.
		span := rIter.SeekLT(keyRange.smallest.UserKey)
		if span == nil {
			span = rIter.Next()
		}
		for ; span != nil; span = rIter.Next() {
			if span.Empty() {
				continue
			}
			key := span.SmallestKey()
			c := sstableKeyCompare(cmp, key, keyRange.largest)
			if c > 0 {
				// The start of the span is after the largest key in the
				// ingested table.
				return false
			}
			if cmp(span.End, keyRange.smallest.UserKey) > 0 {
				// The end of the span is greater than the smallest in the
				// table. Note that the span end key is exclusive, thus ">0"
				// instead of ">=0".
				return true
			}
		}
		// Assume overlap if iterator errored.
		if err := rIter.Error(); err != nil {
			return true
		}
		return false
	}

	// rkeyIter is either a range key level iter, or a range key iterator
	// over a single file.
	if rkeyIter != nil {
		if computeOverlapWithSpans(rkeyIter) {
			return true
		}
	}

	// Check overlap with range deletions.
	if rangeDelIter == nil || *rangeDelIter == nil {
		return false
	}
	return computeOverlapWithSpans(*rangeDelIter)
}

// ingestTargetLevel returns the target level for a file being ingested.
// If suggestSplit is true, it accounts for ingest-time splitting as part of
// its target level calculation, and if a split candidate is found, that file
// is returned as the splitFile.
func ingestTargetLevel(
	newIters tableNewIters,
	newRangeKeyIter keyspan.TableNewSpanIter,
	iterOps IterOptions,
	comparer *Comparer,
	v *version,
	baseLevel int,
	compactions map[*compaction]struct{},
	meta *fileMetadata,
	suggestSplit bool,
) (targetLevel int, splitFile *fileMetadata, err error) {
	// Find the lowest level which does not have any files which overlap meta. We
	// search from L0 to L6 looking for whether there are any files in the level
	// which overlap meta. We want the "lowest" level (where lower means
	// increasing level number) in order to reduce write amplification.
	//
	// There are 2 kinds of overlap we need to check for: file boundary overlap
	// and data overlap. Data overlap implies file boundary overlap. Note that it
	// is always possible to ingest into L0.
	//
	// To place meta at level i where i > 0:
	// - there must not be any data overlap with levels <= i, since that will
	//   violate the sequence number invariant.
	// - no file boundary overlap with level i, since that will violate the
	//   invariant that files do not overlap in levels i > 0.
	// - if there is only a file overlap at a given level, and no data overlap,
	//   we can still slot a file at that level. We return the fileMetadata with
	//   which we have file boundary overlap (must be only one file, as sstable
	//   bounds are usually tight on user keys) and the caller is expected to split
	//   that sstable into two virtual sstables, allowing this file to go into that
	//   level. Note that if we have file boundary overlap with two files, which
	//   should only happen on rare occasions, we treat it as data overlap and
	//   don't use this optimization.
	//
	// The file boundary overlap check is simpler to conceptualize. Consider the
	// following example, in which the ingested file lies completely before or
	// after the file being considered.
	//
	//   |--|           |--|  ingested file: [a,b] or [f,g]
	//         |-----|        existing file: [c,e]
	//  _____________________
	//   a  b  c  d  e  f  g
	//
	// In both cases the ingested file can move to considering the next level.
	//
	// File boundary overlap does not necessarily imply data overlap. The check
	// for data overlap is a little more nuanced. Consider the following examples:
	//
	// 1. No data overlap:
	//
	//          |-|   |--|    ingested file: [cc-d] or [ee-ff]
	//  |*--*--*----*------*| existing file: [a-g], points: [a, b, c, dd, g]
	//  _____________________
	//   a  b  c  d  e  f  g
	//
	// In this case the ingested files can "fall through" this level. The checks
	// continue at the next level.
	//
	// 2. Data overlap:
	//
	//            |--|        ingested file: [d-e]
	//  |*--*--*----*------*| existing file: [a-g], points: [a, b, c, dd, g]
	//  _____________________
	//   a  b  c  d  e  f  g
	//
	// In this case the file cannot be ingested into this level as the point 'dd'
	// is in the way.
	//
	// It is worth noting that the check for data overlap is only approximate. In
	// the previous example, the ingested table [d-e] could contain only the
	// points 'd' and 'e', in which case the table would be eligible for
	// considering lower levels. However, such a fine-grained check would need to
	// be exhaustive (comparing points and ranges in both the ingested and
	// existing tables) and such a check is prohibitively expensive. Thus Pebble
	// treats any existing point that falls within the ingested table bounds as
	// being "data overlap".

	// This assertion implicitly checks that we have the current version of
	// the metadata.
	if v.L0Sublevels == nil {
		return 0, nil, errors.AssertionFailedf("could not read L0 sublevels")
	}
	iterOps.CategoryAndQoS = sstable.CategoryAndQoS{
		Category: "pebble-ingest",
		QoSLevel: sstable.LatencySensitiveQoSLevel,
	}
	// Check for overlap over the keys of L0 by iterating over the sublevels.
	for subLevel := 0; subLevel < len(v.L0SublevelFiles); subLevel++ {
		iter := newLevelIter(context.Background(),
			iterOps, comparer, newIters, v.L0Sublevels.Levels[subLevel].Iter(), manifest.Level(0), internalIterOpts{})

		var rangeDelIter keyspan.FragmentIterator
		// Pass in a non-nil pointer to rangeDelIter so that levelIter.findFileGE
		// sets it up for the target file.
		iter.initRangeDel(&rangeDelIter)

		levelIter := keyspan.LevelIter{}
		levelIter.Init(
			keyspan.SpanIterOptions{}, comparer.Compare, newRangeKeyIter,
			v.L0Sublevels.Levels[subLevel].Iter(), manifest.Level(0), manifest.KeyTypeRange,
		)

		kr := internalKeyRange{
			smallest: meta.Smallest,
			largest:  meta.Largest,
		}
		overlap := overlapWithIterator(iter, &rangeDelIter, &levelIter, kr, comparer.Compare)
		err := iter.Close() // Closes range del iter as well.
		err = firstError(err, levelIter.Close())
		if err != nil {
			return 0, nil, err
		}
		if overlap {
			return targetLevel, nil, nil
		}
	}

	level := baseLevel
	for ; level < numLevels; level++ {
		levelIter := newLevelIter(context.Background(),
			iterOps, comparer, newIters, v.Levels[level].Iter(), manifest.Level(level), internalIterOpts{})
		var rangeDelIter keyspan.FragmentIterator
		// Pass in a non-nil pointer to rangeDelIter so that levelIter.findFileGE
		// sets it up for the target file.
		levelIter.initRangeDel(&rangeDelIter)

		rkeyLevelIter := &keyspan.LevelIter{}
		rkeyLevelIter.Init(
			keyspan.SpanIterOptions{}, comparer.Compare, newRangeKeyIter,
			v.Levels[level].Iter(), manifest.Level(level), manifest.KeyTypeRange,
		)

		kr := internalKeyRange{
			smallest: meta.Smallest,
			largest:  meta.Largest,
		}
		overlap := overlapWithIterator(levelIter, &rangeDelIter, rkeyLevelIter, kr, comparer.Compare)
		err := levelIter.Close() // Closes range del iter as well.
		err = firstError(err, rkeyLevelIter.Close())
		if err != nil {
			return 0, nil, err
		}
		if overlap {
			return targetLevel, splitFile, nil
		}

		// Check boundary overlap.
		var candidateSplitFile *fileMetadata
		boundaryOverlaps := v.Overlaps(level, comparer.Compare, meta.Smallest.UserKey,
			meta.Largest.UserKey, meta.Largest.IsExclusiveSentinel())
		if !boundaryOverlaps.Empty() {
			// We are already guaranteed to not have any data overlaps with files
			// in boundaryOverlaps, otherwise we'd have returned in the above if
			// statements. Use this, plus boundaryOverlaps.Len() == 1 to detect for
			// the case where we can slot this file into the current level despite
			// a boundary overlap, by splitting one existing file into two virtual
			// sstables.
			if suggestSplit && boundaryOverlaps.Len() == 1 {
				iter := boundaryOverlaps.Iter()
				candidateSplitFile = iter.First()
			} else {
				// We either don't want to suggest ingest-time splits (i.e.
				// !suggestSplit), or we boundary-overlapped with more than one file.
				continue
			}
		}

		// Check boundary overlap with any ongoing compactions. We consider an
		// overlapping compaction that's writing files to an output level as
		// equivalent to boundary overlap with files in that output level.
		//
		// We cannot check for data overlap with the new SSTs compaction will produce
		// since compaction hasn't been done yet. However, there's no need to check
		// since all keys in them will be from levels in [c.startLevel,
		// c.outputLevel], and all those levels have already had their data overlap
		// tested negative (else we'd have returned earlier).
		//
		// An alternative approach would be to cancel these compactions and proceed
		// with an ingest-time split on this level if necessary. However, compaction
		// cancellation can result in significant wasted effort and is best avoided
		// unless necessary.
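		//
		// The interval comparison below is inclusive on both ends. For example
		// (illustrative), an ingested file spanning [d, f] overlaps a
		// compaction writing [a, e] into this level: d <= e and f >= a.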
		overlaps := false
		for c := range compactions {
			if c.outputLevel == nil || level != c.outputLevel.level {
				continue
			}
			if comparer.Compare(meta.Smallest.UserKey, c.largest.UserKey) <= 0 &&
				comparer.Compare(meta.Largest.UserKey, c.smallest.UserKey) >= 0 {
				overlaps = true
				break
			}
		}
		if !overlaps {
			targetLevel = level
			splitFile = candidateSplitFile
		}
	}
	return targetLevel, splitFile, nil
}

// Ingest ingests a set of sstables into the DB. Ingestion of the files is
// atomic and semantically equivalent to creating a single batch containing all
// of the mutations in the sstables. Ingestion may require the memtable to be
// flushed. The ingested sstable files are moved into the DB and must reside on
// the same filesystem as the DB. Sstables can be created for ingestion using
// sstable.Writer. On success, Ingest removes the input paths.
//
// Two types of sstables are accepted for ingestion: sstables present in the
// instance's vfs.FS, which can be referenced locally, and sstables present in
// remote.Storage, referred to as shared or foreign sstables. These shared
// sstables can be linked through objstorageprovider.Provider, and do not need
// to already be present on the local vfs.FS. Foreign sstables must all fit in
// an excise span, and are destined for a level specified in SharedSSTMeta.
//
// All sstables *must* be Sync()'d by the caller after all bytes are written
// and before its file handle is closed; failure to do so could violate
// durability or lead to corrupted on-disk state. This method cannot, in a
// platform-and-FS-agnostic way, ensure that all sstables in the input are
// properly synced to disk. Opening new file handles and Sync()-ing them
// does not always guarantee durability; see the discussion here on that:
// https://github.com/cockroachdb/pebble/pull/835#issuecomment-663075379
//
// Ingestion loads each sstable into the lowest level of the LSM which it
// doesn't overlap (see ingestTargetLevel). If an sstable overlaps a memtable,
// ingestion forces the memtable to flush, and then waits for the flush to
// occur. In some cases, such as with no foreign sstables and no excise span,
// ingestion that gets blocked on a memtable can join the flushable queue and
// finish even before the memtable has been flushed.
//
// The steps for ingestion are:
//
//  1. Allocate file numbers for every sstable being ingested.
//  2. Load the metadata for all sstables being ingested.
//  3. Sort the sstables by smallest key, verifying non overlap (for local
//     sstables).
//  4. Hard link (or copy) the local sstables into the DB directory.
//  5. Allocate a sequence number to use for all of the entries in the
//     local sstables. This is the step where overlap with memtables is
//     determined. If there is overlap, we remember the most recent memtable
//     that overlaps.
//  6. Update the sequence number in the ingested local sstables. (Remote
//     sstables get fixed sequence numbers that were determined at load time.)
//  7. Wait for the most recent memtable that overlaps to flush (if any).
//  8. Add the ingested sstables to the version (DB.ingestApply).
//     8.1. If an excise span was specified, figure out what sstables in the
//     current version overlap with the excise span, and create new virtual
//     sstables out of those sstables that exclude the excised span (DB.excise).
//  9. Publish the ingestion sequence number.
//
// Note that if the mutable memtable overlaps with ingestion, a flush of the
// memtable is forced, equivalent to DB.Flush. Additionally, subsequent
// mutations that get sequence numbers larger than the ingestion sequence
// number get queued up behind the ingestion waiting for it to complete. This
// can produce a noticeable hiccup in performance. See
// https://github.com/cockroachdb/pebble/issues/25 for an idea for how to fix
// this hiccup.
func (d *DB) Ingest(paths []string) error {
	if err := d.closed.Load(); err != nil {
		panic(err)
	}
	if d.opts.ReadOnly {
		return ErrReadOnly
	}
	_, err := d.ingest(paths, ingestTargetLevel, nil /* shared */, KeyRange{}, nil /* external */)
	return err
}

// IngestOperationStats provides some information about where in the LSM the
// bytes were ingested.
type IngestOperationStats struct {
	// Bytes is the total bytes in the ingested sstables.
	Bytes uint64
	// ApproxIngestedIntoL0Bytes is the approximate number of bytes ingested
	// into L0. This value is approximate when flushable ingests are active and
	// an ingest overlaps an entry in the flushable queue. Currently, this
	// approximation is very rough, only including tables that overlapped the
	// memtable. This estimate may be improved with #2112.
	ApproxIngestedIntoL0Bytes uint64
	// MemtableOverlappingFiles is the count of ingested sstables
	// that overlapped keys in the memtables.
	MemtableOverlappingFiles int
}

// ExternalFile are external sstables that can be referenced through
// objprovider and ingested as remote files that will not be refcounted or
// cleaned up. For use with online restore. Note that the underlying sstable
// could contain keys outside the [Smallest,Largest) bounds; however Pebble
// is expected to only read the keys within those bounds.
type ExternalFile struct {
	// Locator is the shared.Locator that can be used with objProvider to
	// resolve a reference to this external sstable.
	Locator remote.Locator
	// ObjName is the unique name of this sstable on Locator.
	ObjName string
	// Size of the referenced proportion of the virtualized sstable. An estimate
	// is acceptable in lieu of the backing file size.
	Size uint64
	// SmallestUserKey and LargestUserKey are the [smallest, largest) user key
	// bounds of the sstable. Both these bounds are loose i.e. it's possible for
	// the sstable to not span the entirety of this range. However, multiple
	// ExternalFiles in one ingestion must all have non-overlapping
	// [smallest, largest) spans. Note that this Largest bound is exclusive.
	SmallestUserKey, LargestUserKey []byte
	// HasPointKey and HasRangeKey denote whether this file contains point keys
	// or range keys. If both fields are false, an error is returned during
	// ingestion.
	HasPointKey, HasRangeKey bool
}
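
// Example usage of Ingest/IngestWithStats (an illustrative sketch; the path,
// options, and sstable names are assumptions, and error handling is elided):
//
//	db, _ := pebble.Open("demo", &pebble.Options{})
//	// Build "1.sst" and "2.sst" with sstable.Writer, Sync them, then:
//	stats, err := db.IngestWithStats([]string{"1.sst", "2.sst"})
//	if err == nil {
//		fmt.Printf("ingested %d bytes\n", stats.Bytes)
//	}
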
// IngestWithStats does the same as Ingest, and additionally returns
// IngestOperationStats.
func (d *DB) IngestWithStats(paths []string) (IngestOperationStats, error) {
	if err := d.closed.Load(); err != nil {
		panic(err)
	}
	if d.opts.ReadOnly {
		return IngestOperationStats{}, ErrReadOnly
	}
	return d.ingest(paths, ingestTargetLevel, nil /* shared */, KeyRange{}, nil /* external */)
}

// IngestExternalFiles does the same as IngestWithStats, and additionally
// accepts external files (with locator info that can be resolved using
// d.opts.SharedStorage). These files must also be non-overlapping with
// each other, and must be resolvable through d.objProvider.
func (d *DB) IngestExternalFiles(external []ExternalFile) (IngestOperationStats, error) {
	if err := d.closed.Load(); err != nil {
		panic(err)
	}

	if d.opts.ReadOnly {
		return IngestOperationStats{}, ErrReadOnly
	}
	if d.opts.Experimental.RemoteStorage == nil {
		return IngestOperationStats{}, errors.New("pebble: cannot ingest external files without shared storage configured")
	}
	return d.ingest(nil, ingestTargetLevel, nil /* shared */, KeyRange{}, external)
}

// IngestAndExcise does the same as IngestWithStats, and additionally accepts a
// list of shared files to ingest that can be read from a remote.Storage through
// a Provider. All the shared files must live within exciseSpan, and any existing
// keys in exciseSpan are deleted by turning existing sstables into virtual
// sstables (if not virtual already) and shrinking their spans to exclude
// exciseSpan. See the comment at Ingest for a more complete picture of the
// ingestion process.
//
// Panics if this DB instance was not instantiated with a remote.Storage and
// shared sstables are present.
func (d *DB) IngestAndExcise(
	paths []string, shared []SharedSSTMeta, exciseSpan KeyRange,
) (IngestOperationStats, error) {
	if err := d.closed.Load(); err != nil {
		panic(err)
	}
	if d.opts.ReadOnly {
		return IngestOperationStats{}, ErrReadOnly
	}
	if invariants.Enabled && d.opts.Comparer.Split != nil {
		// Excise is only supported on prefix keys.
		if d.opts.Comparer.Split(exciseSpan.Start) != len(exciseSpan.Start) {
			panic("IngestAndExcise called with suffixed start key")
		}
		if d.opts.Comparer.Split(exciseSpan.End) != len(exciseSpan.End) {
			panic("IngestAndExcise called with suffixed end key")
		}
	}
	return d.ingest(paths, ingestTargetLevel, shared, exciseSpan, nil /* external */)
}

// Both DB.mu and commitPipeline.mu must be held while this is called.
func (d *DB) newIngestedFlushableEntry(
	meta []*fileMetadata, seqNum uint64, logNum base.DiskFileNum,
) (*flushableEntry, error) {
	// Update the sequence number for all of the sstables in the
	// metadata. Writing the metadata to the manifest when the
	// version edit is applied is the mechanism that persists the
	// sequence number. The sstables themselves are left unmodified.
	// In this case, a version edit will only be written to the manifest
	// when the flushable is eventually flushed. If Pebble restarts in that
	// time, then we'll lose the ingest sequence number information. But this
	// information will also be reconstructed on node restart.
	if err := ingestUpdateSeqNum(
		d.cmp, d.opts.Comparer.FormatKey, seqNum, ingestLoadResult{localMeta: meta},
	); err != nil {
		return nil, err
	}

	f := newIngestedFlushable(meta, d.opts.Comparer, d.newIters, d.tableNewRangeKeyIter)

	// NB: The logNum/seqNum are the WAL number which we're writing this entry
	// to and the sequence number within the WAL which we'll write this entry
	// to.
	entry := d.newFlushableEntry(f, logNum, seqNum)
	// The flushable entry starts off with a single reader ref, so increment
	// the FileMetadata.Refs.
	for _, file := range f.files {
		file.Ref()
	}
	entry.unrefFiles = func() []*fileBacking {
		var obsolete []*fileBacking
		for _, file := range f.files {
			if file.Unref() == 0 {
				obsolete = append(obsolete, file.FileMetadata.FileBacking)
			}
		}
		return obsolete
	}

	entry.flushForced = true
	entry.releaseMemAccounting = func() {}
	return entry, nil
}

// Both DB.mu and commitPipeline.mu must be held while this is called. Since
// we're holding both locks, the order in which we rotate the memtable or
// recycle the WAL in this function is irrelevant as long as the correct log
// numbers are assigned to the appropriate flushable.
func (d *DB) handleIngestAsFlushable(meta []*fileMetadata, seqNum uint64) error {
	b := d.NewBatch()
	for _, m := range meta {
		b.ingestSST(m.FileNum)
	}
	b.setSeqNum(seqNum)

	// If the WAL is disabled, then the logNum used to create the flushable
	// entry doesn't matter. We just use the logNum assigned to the current
	// mutable memtable. If the WAL is enabled, then this logNum will be
	// overwritten by the logNum of the log which will contain the log entry
	// for the ingestedFlushable.
	logNum := d.mu.mem.queue[len(d.mu.mem.queue)-1].logNum
	if !d.opts.DisableWAL {
		// We create a new WAL for the flushable instead of reusing the end of
		// the previous WAL. This simplifies the increment of the minimum
		// unflushed log number, and also simplifies WAL replay.
		logNum, _ = d.recycleWAL()
		d.mu.Unlock()
		err := d.commit.directWrite(b)
		if err != nil {
			d.opts.Logger.Fatalf("%v", err)
		}
		d.mu.Lock()
	}

	entry, err := d.newIngestedFlushableEntry(meta, seqNum, logNum)
	if err != nil {
		return err
	}
	nextSeqNum := seqNum + uint64(b.Count())

	// Set newLogNum to the logNum of the previous flushable. This value is
	// irrelevant if the WAL is disabled. If the WAL is enabled, then we set
	// the appropriate value below.
	newLogNum := d.mu.mem.queue[len(d.mu.mem.queue)-1].logNum
	if !d.opts.DisableWAL {
		// This is the WAL num of the next mutable memtable which comes after the
		// ingestedFlushable in the flushable queue. The mutable memtable
		// will be created below.
		newLogNum, _ = d.recycleWAL()
	}

	currMem := d.mu.mem.mutable
	// NB: Placing ingested sstables above the current memtables
	// requires rotating of the existing memtables/WAL. There is
	// some concern of churning through tiny memtables due to
	// ingested sstables being placed on top of them, but those
	// memtables would have to be flushed anyways.
	d.mu.mem.queue = append(d.mu.mem.queue, entry)
	d.rotateMemtable(newLogNum, nextSeqNum, currMem)
	d.updateReadStateLocked(d.opts.DebugCheck)
	d.maybeScheduleFlush()
	return nil
}

// See comment at Ingest() for details on how this works.
func (d *DB) ingest(
	paths []string,
	targetLevelFunc ingestTargetLevelFunc,
	shared []SharedSSTMeta,
	exciseSpan KeyRange,
	external []ExternalFile,
) (IngestOperationStats, error) {
	if len(shared) > 0 && d.opts.Experimental.RemoteStorage == nil {
		panic("cannot ingest shared sstables with nil SharedStorage")
	}
	if (exciseSpan.Valid() || len(shared) > 0 || len(external) > 0) && d.FormatMajorVersion() < FormatVirtualSSTables {
		return IngestOperationStats{}, errors.New("pebble: format major version too old for excise, shared or external sstable ingestion")
	}
	// Allocate file numbers for all of the files being ingested and mark them as
	// pending in order to prevent them from being deleted. Note that this causes
	// the file number ordering to be out of alignment with sequence number
	// ordering. The sorting of L0 tables by sequence number avoids relying on
	// that (busted) invariant.
	d.mu.Lock()
	pendingOutputs := make([]base.DiskFileNum, len(paths)+len(shared)+len(external))
	for i := 0; i < len(paths)+len(shared)+len(external); i++ {
		pendingOutputs[i] = d.mu.versions.getNextDiskFileNum()
	}

	jobID := d.mu.nextJobID
	d.mu.nextJobID++
	d.mu.Unlock()

	// Load the metadata for all the files being ingested. This step detects
	// and elides empty sstables.
	loadResult, err := ingestLoad(d.opts, d.FormatMajorVersion(), paths, shared, external, d.cacheID, pendingOutputs, d.objProvider, jobID)
	if err != nil {
		return IngestOperationStats{}, err
	}

	if loadResult.fileCount == 0 {
		// All of the sstables to be ingested were empty. Nothing to do.
		return IngestOperationStats{}, nil
	}

	// Verify the sstables do not overlap.
	if err := ingestSortAndVerify(d.cmp, loadResult, exciseSpan); err != nil {
		return IngestOperationStats{}, err
	}

	// Hard link the sstables into the DB directory. Since the sstables aren't
	// referenced by a version, they won't be used. If the hard linking fails
	// (e.g. because the files reside on a different filesystem), ingestLink will
	// fall back to copying, and if that fails we undo our work and return an
	// error.
	if err := ingestLink(jobID, d.opts, d.objProvider, loadResult, shared); err != nil {
		return IngestOperationStats{}, err
	}

	// Make the new tables durable. We need to do this at some point before we
	// update the MANIFEST (via logAndApply), otherwise a crash can have the
	// tables referenced in the MANIFEST, but not present in the provider.
	if err := d.objProvider.Sync(); err != nil {
		return IngestOperationStats{}, err
	}

	// metaFlushableOverlaps is a slice parallel to meta indicating which of the
	// ingested sstables overlap some table in the flushable queue. It's used to
	// approximate ingest-into-L0 stats when using flushable ingests.
	metaFlushableOverlaps := make([]bool, loadResult.fileCount)
	var mem *flushableEntry
	var mut *memTable
	// asFlushable indicates whether the sstable was ingested as a flushable.
	var asFlushable bool
	iterOps := IterOptions{
		CategoryAndQoS: sstable.CategoryAndQoS{
			Category: "pebble-ingest",
			QoSLevel: sstable.LatencySensitiveQoSLevel,
		},
	}
	prepare := func(seqNum uint64) {
		// Note that d.commit.mu is held by commitPipeline when calling prepare.

		d.mu.Lock()
		defer d.mu.Unlock()

		// Check to see if any files overlap with any of the memtables. The queue
		// is ordered from oldest to newest with the mutable memtable being the
		// last element in the slice. We want to wait for the newest table that
		// overlaps.

		for i := len(d.mu.mem.queue) - 1; i >= 0; i-- {
			m := d.mu.mem.queue[i]
			iter := m.newIter(&iterOps)
			rangeDelIter := m.newRangeDelIter(&iterOps)
			rkeyIter := m.newRangeKeyIter(&iterOps)

			checkForOverlap := func(i int, meta *fileMetadata) {
				if metaFlushableOverlaps[i] {
					// This table already overlapped a more recent flushable.
					return
				}
				kr := internalKeyRange{
					smallest: meta.Smallest,
					largest:  meta.Largest,
				}
				if overlapWithIterator(iter, &rangeDelIter, rkeyIter, kr, d.cmp) {
					// If this is the first table to overlap a flushable, save
					// the flushable. This ingest must be ingested or flushed
					// after it.
					if mem == nil {
						mem = m
					}
					metaFlushableOverlaps[i] = true
				}
			}
			for i := range loadResult.localMeta {
				checkForOverlap(i, loadResult.localMeta[i])
			}
			for i := range loadResult.sharedMeta {
				checkForOverlap(len(loadResult.localMeta)+i, loadResult.sharedMeta[i])
			}
			for i := range loadResult.externalMeta {
				checkForOverlap(len(loadResult.localMeta)+len(loadResult.sharedMeta)+i, loadResult.externalMeta[i])
			}
			if exciseSpan.Valid() {
				kr := internalKeyRange{
					smallest: base.MakeInternalKey(exciseSpan.Start, InternalKeySeqNumMax, InternalKeyKindMax),
					largest:  base.MakeExclusiveSentinelKey(InternalKeyKindRangeDelete, exciseSpan.End),
				}
				if overlapWithIterator(iter, &rangeDelIter, rkeyIter, kr, d.cmp) {
					if mem == nil {
						mem = m
					}
				}
			}
			err := iter.Close()
			if rangeDelIter != nil {
				err = firstError(err, rangeDelIter.Close())
			}
			if rkeyIter != nil {
				err = firstError(err, rkeyIter.Close())
			}
			if err != nil {
				d.opts.Logger.Errorf("ingest error reading flushable for log %s: %s", m.logNum, err)
			}
		}

		if mem == nil {
			// No overlap with any of the queued flushables, so no need to queue
			// after them.

			// New writes with higher sequence numbers may be concurrently
			// committed. We must ensure they don't flush before this ingest
			// completes. To do that, we ref the mutable memtable as a writer,
			// preventing its flushing (and the flushing of all subsequent
			// flushables in the queue). Once we've acquired the manifest lock
			// to add the ingested sstables to the LSM, we can unref as we're
			// guaranteed that the flush won't edit the LSM before this ingest.
			mut = d.mu.mem.mutable
			mut.writerRef()
			return
		}
		// The ingestion overlaps with some entry in the flushable queue.
		if d.FormatMajorVersion() < FormatFlushableIngest ||
			d.opts.Experimental.DisableIngestAsFlushable() ||
			len(shared) > 0 || exciseSpan.Valid() || len(external) > 0 ||
			(len(d.mu.mem.queue) > d.opts.MemTableStopWritesThreshold-1) {
			// We're not able to ingest as a flushable,
			// so we must synchronously flush.
			//
			// TODO(bilal): Currently, if any of the files being ingested are shared or
			// there's an excise span present, we cannot use flushable ingests and need
			// to wait synchronously. Either remove this caveat by fleshing out
			// flushable ingest logic to also account for these cases, or remove this
			// comment. Tracking issue: https://github.com/cockroachdb/pebble/issues/2676
			if mem.flushable == d.mu.mem.mutable {
				err = d.makeRoomForWrite(nil)
			}
			// New writes with higher sequence numbers may be concurrently
			// committed. We must ensure they don't flush before this ingest
			// completes. To do that, we ref the mutable memtable as a writer,
			// preventing its flushing (and the flushing of all subsequent
			// flushables in the queue). Once we've acquired the manifest lock
			// to add the ingested sstables to the LSM, we can unref as we're
			// guaranteed that the flush won't edit the LSM before this ingest.
			mut = d.mu.mem.mutable
			mut.writerRef()
			mem.flushForced = true
			d.maybeScheduleFlush()
			return
		}
		// Since there aren't too many memtables already queued up, we can
		// slide the ingested sstables on top of the existing memtables.
		asFlushable = true
		err = d.handleIngestAsFlushable(loadResult.localMeta, seqNum)
	}

	var ve *versionEdit
	apply := func(seqNum uint64) {
		if err != nil || asFlushable {
			// An error occurred during prepare, or the ingest was already
			// handled as a flushable; either way there is nothing to apply.
			if mut != nil {
				if mut.writerUnref() {
					d.mu.Lock()
					d.maybeScheduleFlush()
					d.mu.Unlock()
				}
			}
			return
		}

		// Update the sequence numbers for all ingested sstables'
		// metadata. When the version edit is applied, the metadata is
		// written to the manifest, persisting the sequence number.
		// The sstables themselves are left unmodified.
		if err = ingestUpdateSeqNum(
			d.cmp, d.opts.Comparer.FormatKey, seqNum, loadResult,
		); err != nil {
			if mut != nil {
				if mut.writerUnref() {
					d.mu.Lock()
					d.maybeScheduleFlush()
					d.mu.Unlock()
				}
			}
			return
		}

		// If we overlapped with a memtable in prepare wait for the flush to
		// finish.
		if mem != nil {
			<-mem.flushed
		}

		// Assign the sstables to the correct level in the LSM and apply the
		// version edit.
		ve, err = d.ingestApply(jobID, loadResult, targetLevelFunc, mut, exciseSpan)
	}
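
	// The commit pipeline invokes the two closures above around sequence
	// number allocation, roughly (illustrative):
	//
	//	prepare(seqNum) // decide: wait for a flush, or ingest as a flushable
	//	apply(seqNum)   // assign final seqnums and apply the version edit
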
1523 d.commit.ingestSem <- struct{}{} 1524 d.commit.AllocateSeqNum(loadResult.fileCount, prepare, apply) 1525 <-d.commit.ingestSem 1526 1527 if err != nil { 1528 if err2 := ingestCleanup(d.objProvider, loadResult.localMeta); err2 != nil { 1529 d.opts.Logger.Errorf("ingest cleanup failed: %v", err2) 1530 } 1531 } else { 1532 // Since we either created a hard link to the ingesting files, or copied 1533 // them over, it is safe to remove the original paths. 1534 for _, path := range loadResult.localPaths { 1535 if err2 := d.opts.FS.Remove(path); err2 != nil { 1536 d.opts.Logger.Errorf("ingest failed to remove original file: %s", err2) 1537 } 1538 } 1539 } 1540 1541 info := TableIngestInfo{ 1542 JobID: jobID, 1543 Err: err, 1544 flushable: asFlushable, 1545 } 1546 if len(loadResult.localMeta) > 0 { 1547 info.GlobalSeqNum = loadResult.localMeta[0].SmallestSeqNum 1548 } else if len(loadResult.sharedMeta) > 0 { 1549 info.GlobalSeqNum = loadResult.sharedMeta[0].SmallestSeqNum 1550 } else { 1551 info.GlobalSeqNum = loadResult.externalMeta[0].SmallestSeqNum 1552 } 1553 var stats IngestOperationStats 1554 if ve != nil { 1555 info.Tables = make([]struct { 1556 TableInfo 1557 Level int 1558 }, len(ve.NewFiles)) 1559 for i := range ve.NewFiles { 1560 e := &ve.NewFiles[i] 1561 info.Tables[i].Level = e.Level 1562 info.Tables[i].TableInfo = e.Meta.TableInfo() 1563 stats.Bytes += e.Meta.Size 1564 if e.Level == 0 { 1565 stats.ApproxIngestedIntoL0Bytes += e.Meta.Size 1566 } 1567 if i < len(metaFlushableOverlaps) && metaFlushableOverlaps[i] { 1568 stats.MemtableOverlappingFiles++ 1569 } 1570 } 1571 } else if asFlushable { 1572 // NB: If asFlushable == true, there are no shared sstables. 1573 info.Tables = make([]struct { 1574 TableInfo 1575 Level int 1576 }, len(loadResult.localMeta)) 1577 for i, f := range loadResult.localMeta { 1578 info.Tables[i].Level = -1 1579 info.Tables[i].TableInfo = f.TableInfo() 1580 stats.Bytes += f.Size 1581 // We don't have exact stats on which files will be ingested into 1582 // L0, because actual ingestion into the LSM has been deferred until 1583 // flush time. Instead, we infer based on memtable overlap. 1584 // 1585 // TODO(jackson): If we optimistically compute data overlap (#2112) 1586 // before entering the commit pipeline, we can use that overlap to 1587 // improve our approximation by incorporating overlap with L0, not 1588 // just memtables. 1589 if metaFlushableOverlaps[i] { 1590 stats.ApproxIngestedIntoL0Bytes += f.Size 1591 stats.MemtableOverlappingFiles++ 1592 } 1593 } 1594 } 1595 d.opts.EventListener.TableIngested(info) 1596 1597 return stats, err 1598 } 1599 1600 // excise updates ve to include a replacement of the file m with new virtual 1601 // sstables that exclude exciseSpan, returning a slice of newly-created files if 1602 // any. If the entirety of m is deleted by exciseSpan, no new sstables are added 1603 // and m is deleted. Note that ve is updated in-place. 1604 // 1605 // The manifest lock must be held when calling this method. 1606 func (d *DB) excise( 1607 exciseSpan KeyRange, m *fileMetadata, ve *versionEdit, level int, 1608 ) ([]manifest.NewFileEntry, error) { 1609 numCreatedFiles := 0 1610 // Check if there's actually an overlap between m and exciseSpan. 1611 if !exciseSpan.Overlaps(d.cmp, m) { 1612 return nil, nil 1613 } 1614 ve.DeletedFiles[deletedFileEntry{ 1615 Level: level, 1616 FileNum: m.FileNum, 1617 }] = m 1618 // Fast path: m sits entirely within the exciseSpan, so just delete it.
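	// For example, with an excise span [b, f): a file spanning [c, d] is
	// contained at both bounds, so the check below returns immediately,
	// leaving only the deletion recorded above; a file spanning [a, d]
	// falls through, and the code further below constructs a leftFile
	// whose bounds are tight on the keys smaller than b.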
1619 if exciseSpan.Contains(d.cmp, m.Smallest) && exciseSpan.Contains(d.cmp, m.Largest) { 1620 return nil, nil 1621 } 1622 var iter internalIterator 1623 var rangeDelIter keyspan.FragmentIterator 1624 var rangeKeyIter keyspan.FragmentIterator 1625 needsBacking := false 1626 // Create a file to the left of the excise span, if necessary. 1627 // The bounds of this file will be [m.Smallest, lastKeyBefore(exciseSpan.Start)]. 1628 // 1629 // We create bounds that are tight on user keys, and we make the effort to find 1630 // the last key in the original sstable that's smaller than exciseSpan.Start 1631 // even though it requires some sstable reads. We could choose to create 1632 // virtual sstables on loose userKey bounds, in which case we could just set 1633 // leftFile.Largest to an exclusive sentinel at exciseSpan.Start. The biggest 1634 // issue with that approach would be that it'd lead to lots of small virtual 1635 // sstables in the LSM that have no guarantee on containing even a single user 1636 // key within the file bounds. This has the potential to increase both read and 1637 // write-amp as we will be opening up these sstables only to find no relevant 1638 // keys in the read path, and compacting sstables on top of them instead of 1639 // directly into the space occupied by them. We choose to incur the cost of 1640 // calculating tight bounds at this time instead of creating more work in the 1641 // future. 1642 // 1643 // TODO(bilal): Some of this work can happen without grabbing the manifest 1644 // lock; we could grab one currentVersion, release the lock, calculate excised 1645 // files, then grab the lock again and recalculate for just the files that 1646 // have changed since our previous calculation. Do this optimization as part of 1647 // https://github.com/cockroachdb/pebble/issues/2112 . 1648 if d.cmp(m.Smallest.UserKey, exciseSpan.Start) < 0 { 1649 leftFile := &fileMetadata{ 1650 Virtual: true, 1651 FileBacking: m.FileBacking, 1652 FileNum: d.mu.versions.getNextFileNum(), 1653 // Note that these are loose bounds for smallest/largest seqnums, but they're 1654 // sufficient for maintaining correctness. 1655 SmallestSeqNum: m.SmallestSeqNum, 1656 LargestSeqNum: m.LargestSeqNum, 1657 } 1658 if m.HasPointKeys && !exciseSpan.Contains(d.cmp, m.SmallestPointKey) { 1659 // This file will contain point keys. 1660 smallestPointKey := m.SmallestPointKey 1661 var err error 1662 iter, rangeDelIter, err = d.newIters(context.TODO(), m, &IterOptions{ 1663 CategoryAndQoS: sstable.CategoryAndQoS{ 1664 Category: "pebble-ingest", 1665 QoSLevel: sstable.LatencySensitiveQoSLevel, 1666 }, 1667 level: manifest.Level(level), 1668 }, internalIterOpts{}) 1669 if err != nil { 1670 return nil, err 1671 } 1672 var key *InternalKey 1673 if iter != nil { 1674 defer iter.Close() 1675 key, _ = iter.SeekLT(exciseSpan.Start, base.SeekLTFlagsNone) 1676 } else { 1677 iter = emptyIter 1678 } 1679 if key != nil { 1680 leftFile.ExtendPointKeyBounds(d.cmp, smallestPointKey, key.Clone()) 1681 } 1682 // Store the min of (exciseSpan.Start, rdel.End) in lastRangeDel. This 1683 // needs to be a copy if the key is owned by the range del iter. 1684 var lastRangeDel []byte 1685 if rangeDelIter != nil { 1686 defer rangeDelIter.Close() 1687 rdel := rangeDelIter.SeekLT(exciseSpan.Start) 1688 if rdel != nil { 1689 lastRangeDel = append(lastRangeDel[:0], rdel.End...)
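				// A range deletion may straddle exciseSpan.Start, so the
				// copied end key is clamped down to the span's start below:
				// leftFile's bounds must never extend into the excised range.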
1690 if d.cmp(lastRangeDel, exciseSpan.Start) > 0 { 1691 lastRangeDel = exciseSpan.Start 1692 } 1693 } 1694 } else { 1695 rangeDelIter = emptyKeyspanIter 1696 } 1697 if lastRangeDel != nil { 1698 leftFile.ExtendPointKeyBounds(d.cmp, smallestPointKey, base.MakeExclusiveSentinelKey(InternalKeyKindRangeDelete, lastRangeDel)) 1699 } 1700 } 1701 if m.HasRangeKeys && !exciseSpan.Contains(d.cmp, m.SmallestRangeKey) { 1702 // This file will contain range keys 1703 var err error 1704 smallestRangeKey := m.SmallestRangeKey 1705 rangeKeyIter, err = d.tableNewRangeKeyIter(m, keyspan.SpanIterOptions{}) 1706 if err != nil { 1707 return nil, err 1708 } 1709 // Store the min of (exciseSpan.Start, rkey.End) in lastRangeKey. This 1710 // needs to be a copy if the key is owned by the range key iter. 1711 var lastRangeKey []byte 1712 var lastRangeKeyKind InternalKeyKind 1713 defer rangeKeyIter.Close() 1714 rkey := rangeKeyIter.SeekLT(exciseSpan.Start) 1715 if rkey != nil { 1716 lastRangeKey = append(lastRangeKey[:0], rkey.End...) 1717 if d.cmp(lastRangeKey, exciseSpan.Start) > 0 { 1718 lastRangeKey = exciseSpan.Start 1719 } 1720 lastRangeKeyKind = rkey.Keys[0].Kind() 1721 } 1722 if lastRangeKey != nil { 1723 leftFile.ExtendRangeKeyBounds(d.cmp, smallestRangeKey, base.MakeExclusiveSentinelKey(lastRangeKeyKind, lastRangeKey)) 1724 } 1725 } 1726 if leftFile.HasRangeKeys || leftFile.HasPointKeys { 1727 var err error 1728 leftFile.Size, err = d.tableCache.estimateSize(m, leftFile.Smallest.UserKey, leftFile.Largest.UserKey) 1729 if err != nil { 1730 return nil, err 1731 } 1732 if leftFile.Size == 0 { 1733 // On occasion, estimateSize gives us a low estimate, i.e. a 0 file size, 1734 // such as if the excised file only has range keys/dels and no point 1735 // keys. This can cause panics in places where we divide by file sizes. 1736 // Correct for it here. 1737 leftFile.Size = 1 1738 } 1739 if err := leftFile.Validate(d.cmp, d.opts.Comparer.FormatKey); err != nil { 1740 return nil, err 1741 } 1742 leftFile.ValidateVirtual(m) 1743 ve.NewFiles = append(ve.NewFiles, newFileEntry{Level: level, Meta: leftFile}) 1744 needsBacking = true 1745 numCreatedFiles++ 1746 } 1747 } 1748 // Create a file to the right, if necessary. 1749 if exciseSpan.Contains(d.cmp, m.Largest) { 1750 // No key exists to the right of the excise span in this file. 1751 if needsBacking && !m.Virtual { 1752 // If m is virtual, then its file backing is already known to the manifest. 1753 // We don't need to create another file backing. Note that there must be 1754 // only one CreatedBackingTables entry per backing sstable. This is 1755 // indicated by the VersionEdit.CreatedBackingTables invariant. 1756 ve.CreatedBackingTables = append(ve.CreatedBackingTables, m.FileBacking) 1757 } 1758 return ve.NewFiles[len(ve.NewFiles)-numCreatedFiles:], nil 1759 } 1760 // Create a new file, rightFile, between [firstKeyAfter(exciseSpan.End), m.Largest]. 1761 // 1762 // See comment before the definition of leftFile for the motivation behind 1763 // calculating tight user-key bounds. 1764 rightFile := &fileMetadata{ 1765 Virtual: true, 1766 FileBacking: m.FileBacking, 1767 FileNum: d.mu.versions.getNextFileNum(), 1768 // Note that these are loose bounds for smallest/largest seqnums, but they're 1769 // sufficient for maintaining correctness. 
1770 SmallestSeqNum: m.SmallestSeqNum, 1771 LargestSeqNum: m.LargestSeqNum, 1772 } 1773 if m.HasPointKeys && !exciseSpan.Contains(d.cmp, m.LargestPointKey) { 1774 // This file will contain point keys 1775 largestPointKey := m.LargestPointKey 1776 var err error 1777 if iter == nil && rangeDelIter == nil { 1778 iter, rangeDelIter, err = d.newIters(context.TODO(), m, &IterOptions{ 1779 CategoryAndQoS: sstable.CategoryAndQoS{ 1780 Category: "pebble-ingest", 1781 QoSLevel: sstable.LatencySensitiveQoSLevel, 1782 }, 1783 level: manifest.Level(level), 1784 }, internalIterOpts{}) 1785 if err != nil { 1786 return nil, err 1787 } 1788 if iter != nil { 1789 defer iter.Close() 1790 } else { 1791 iter = emptyIter 1792 } 1793 if rangeDelIter != nil { 1794 defer rangeDelIter.Close() 1795 } else { 1796 rangeDelIter = emptyKeyspanIter 1797 } 1798 } 1799 key, _ := iter.SeekGE(exciseSpan.End, base.SeekGEFlagsNone) 1800 if key != nil { 1801 rightFile.ExtendPointKeyBounds(d.cmp, key.Clone(), largestPointKey) 1802 } 1803 // Store the max of (exciseSpan.End, rdel.Start) in firstRangeDel. This 1804 // needs to be a copy if the key is owned by the range del iter. 1805 var firstRangeDel []byte 1806 rdel := rangeDelIter.SeekGE(exciseSpan.End) 1807 if rdel != nil { 1808 firstRangeDel = append(firstRangeDel[:0], rdel.Start...) 1809 if d.cmp(firstRangeDel, exciseSpan.End) < 0 { 1810 firstRangeDel = exciseSpan.End 1811 } 1812 } 1813 if firstRangeDel != nil { 1814 smallestPointKey := rdel.SmallestKey() 1815 smallestPointKey.UserKey = firstRangeDel 1816 rightFile.ExtendPointKeyBounds(d.cmp, smallestPointKey, largestPointKey) 1817 } 1818 } 1819 if m.HasRangeKeys && !exciseSpan.Contains(d.cmp, m.LargestRangeKey) { 1820 // This file will contain range keys. 1821 largestRangeKey := m.LargestRangeKey 1822 if rangeKeyIter == nil { 1823 var err error 1824 rangeKeyIter, err = d.tableNewRangeKeyIter(m, keyspan.SpanIterOptions{}) 1825 if err != nil { 1826 return nil, err 1827 } 1828 defer rangeKeyIter.Close() 1829 } 1830 // Store the max of (exciseSpan.End, rkey.Start) in firstRangeKey. This 1831 // needs to be a copy if the key is owned by the range key iter. 1832 var firstRangeKey []byte 1833 rkey := rangeKeyIter.SeekGE(exciseSpan.End) 1834 if rkey != nil { 1835 firstRangeKey = append(firstRangeKey[:0], rkey.Start...) 1836 if d.cmp(firstRangeKey, exciseSpan.End) < 0 { 1837 firstRangeKey = exciseSpan.End 1838 } 1839 } 1840 if firstRangeKey != nil { 1841 smallestRangeKey := rkey.SmallestKey() 1842 smallestRangeKey.UserKey = firstRangeKey 1843 // We call ExtendRangeKeyBounds so any internal boundType fields are 1844 // set correctly. Note that this is mildly wasteful as we'll be comparing 1845 // rightFile.{Smallest,Largest}RangeKey with themselves, which can be 1846 // avoided if we exported ExtendOverallKeyBounds or so. 1847 rightFile.ExtendRangeKeyBounds(d.cmp, smallestRangeKey, largestRangeKey) 1848 } 1849 } 1850 if rightFile.HasRangeKeys || rightFile.HasPointKeys { 1851 var err error 1852 rightFile.Size, err = d.tableCache.estimateSize(m, rightFile.Smallest.UserKey, rightFile.Largest.UserKey) 1853 if err != nil { 1854 return nil, err 1855 } 1856 if rightFile.Size == 0 { 1857 // On occasion, estimateSize gives us a low estimate, i.e. a 0 file size, 1858 // such as if the excised file only has range keys/dels and no point keys. 1859 // This can cause panics in places where we divide by file sizes. Correct 1860 // for it here. 
1861 rightFile.Size = 1 1862 } 1863 rightFile.ValidateVirtual(m) 1864 ve.NewFiles = append(ve.NewFiles, newFileEntry{Level: level, Meta: rightFile}) 1865 needsBacking = true 1866 numCreatedFiles++ 1867 } 1868 1869 if needsBacking && !m.Virtual { 1870 // If m is virtual, then its file backing is already known to the manifest. 1871 // We don't need to create another file backing. Note that there must be 1872 // only one CreatedBackingTables entry per backing sstable. This is 1873 // indicated by the VersionEdit.CreatedBackingTables invariant. 1874 ve.CreatedBackingTables = append(ve.CreatedBackingTables, m.FileBacking) 1875 } 1876 1877 if err := rightFile.Validate(d.cmp, d.opts.Comparer.FormatKey); err != nil { 1878 return nil, err 1879 } 1880 return ve.NewFiles[len(ve.NewFiles)-numCreatedFiles:], nil 1881 } 1882 1883 type ingestTargetLevelFunc func( 1884 newIters tableNewIters, 1885 newRangeKeyIter keyspan.TableNewSpanIter, 1886 iterOps IterOptions, 1887 comparer *Comparer, 1888 v *version, 1889 baseLevel int, 1890 compactions map[*compaction]struct{}, 1891 meta *fileMetadata, 1892 suggestSplit bool, 1893 ) (int, *fileMetadata, error) 1894 1895 type ingestSplitFile struct { 1896 // ingestFile is the file being ingested. 1897 ingestFile *fileMetadata 1898 // splitFile is the file that needs to be split to allow ingestFile to slot 1899 // into `level` level. 1900 splitFile *fileMetadata 1901 // The level where ingestFile will go (and where splitFile already is). 1902 level int 1903 } 1904 1905 // ingestSplit splits files specified in `files` and updates ve in-place to 1906 // account for existing files getting split into two virtual sstables. The map 1907 // `replacedFiles` contains an in-progress map of all files that have been 1908 // replaced with new virtual sstables in this version edit so far, which is also 1909 // updated in-place. 1910 // 1911 // d.mu as well as the manifest lock must be held when calling this method. 1912 func (d *DB) ingestSplit( 1913 ve *versionEdit, 1914 updateMetrics func(*fileMetadata, int, []newFileEntry), 1915 files []ingestSplitFile, 1916 replacedFiles map[base.FileNum][]newFileEntry, 1917 ) error { 1918 for _, s := range files { 1919 // replacedFiles can be thought of as a tree, where we start iterating with 1920 // s.splitFile and run its fileNum through replacedFiles, then find which of 1921 // the replaced files overlaps with s.ingestFile, which becomes the new 1922 // splitFile, then we check splitFile's replacements in replacedFiles again 1923 // for overlap with s.ingestFile, and so on until we either can't find the 1924 // current splitFile in replacedFiles (i.e. that's the file that now needs to 1925 // be split), or we don't find a file that overlaps with s.ingestFile, which 1926 // means a prior ingest split already produced enough room for s.ingestFile 1927 // to go into this level without necessitating another ingest split. 1928 splitFile := s.splitFile 1929 for splitFile != nil { 1930 replaced, ok := replacedFiles[splitFile.FileNum] 1931 if !ok { 1932 break 1933 } 1934 updatedSplitFile := false 1935 for i := range replaced { 1936 if replaced[i].Meta.Overlaps(d.cmp, s.ingestFile.Smallest.UserKey, s.ingestFile.Largest.UserKey, s.ingestFile.Largest.IsExclusiveSentinel()) { 1937 if updatedSplitFile { 1938 // This should never happen because the earlier ingestTargetLevel 1939 // function only finds split file candidates that are guaranteed to 1940 // have no data overlap, only boundary overlap. 
See the comments 1941 // in that method to see the definitions of data vs boundary 1942 // overlap. That, plus the fact that files in `replaced` are 1943 // guaranteed to have file bounds that are tight on user keys 1944 // (as that's what `d.excise` produces), means that the only case 1945 // where we overlap with two or more files in `replaced` is if we 1946 // actually had data overlap all along, or if the ingestion files 1947 // were overlapping, either of which is an invariant violation. 1948 panic("updated with two files in ingestSplit") 1949 } 1950 splitFile = replaced[i].Meta 1951 updatedSplitFile = true 1952 } 1953 } 1954 if !updatedSplitFile { 1955 // None of the replaced files overlapped with the file being ingested. 1956 // This can happen if we've already excised a span overlapping with 1957 // this file, or if we have consecutive ingested files that can slide 1958 // within the same gap between keys in an existing file. For instance, 1959 // if an existing file has keys a and g and we're ingesting b-c, d-e, 1960 // the first loop iteration will split the existing file into one that 1961 // ends in a and another that starts at g, and the second iteration will 1962 // fall into this case and require no splitting. 1963 // 1964 // No splitting necessary. 1965 splitFile = nil 1966 } 1967 } 1968 if splitFile == nil { 1969 continue 1970 } 1971 // NB: excise operates on [start, end). We're splitting at [start, end] 1972 // (assuming !s.ingestFile.Largest.IsExclusiveSentinel()). The conflation 1973 // of exclusive vs inclusive end bounds should not make a difference here 1974 // as we're guaranteed to not have any data overlap between splitFile and 1975 // s.ingestFile, so panic if we do see a newly added file with an endKey 1976 // equalling s.ingestFile.Largest, and !s.ingestFile.Largest.IsExclusiveSentinel() 1977 added, err := d.excise(KeyRange{Start: s.ingestFile.Smallest.UserKey, End: s.ingestFile.Largest.UserKey}, splitFile, ve, s.level) 1978 if err != nil { 1979 return err 1980 } 1981 if _, ok := ve.DeletedFiles[deletedFileEntry{ 1982 Level: s.level, 1983 FileNum: splitFile.FileNum, 1984 }]; !ok { 1985 panic("did not split file that was expected to be split") 1986 } 1987 replacedFiles[splitFile.FileNum] = added 1988 for i := range added { 1989 if s.ingestFile.Overlaps(d.cmp, added[i].Meta.Smallest.UserKey, added[i].Meta.Largest.UserKey, added[i].Meta.Largest.IsExclusiveSentinel()) { 1990 panic("ingest-time split produced a file that overlaps with ingested file") 1991 } 1992 } 1993 updateMetrics(splitFile, s.level, added) 1994 } 1995 // Flatten the version edit by removing any entries from ve.NewFiles that 1996 // are also in ve.DeletedFiles. 
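	// For example, if an earlier excise added virtual file 000012 to
	// ve.NewFiles and a later ingest-time split then deleted that same file,
	// the pair cancels out here: the entry is removed from ve.DeletedFiles
	// and skipped while rebuilding ve.NewFiles, so the final version edit
	// never mentions the file at all.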
1997 newNewFiles := ve.NewFiles[:0] 1998 for i := range ve.NewFiles { 1999 fn := ve.NewFiles[i].Meta.FileNum 2000 deEntry := deletedFileEntry{Level: ve.NewFiles[i].Level, FileNum: fn} 2001 if _, ok := ve.DeletedFiles[deEntry]; ok { 2002 delete(ve.DeletedFiles, deEntry) 2003 } else { 2004 newNewFiles = append(newNewFiles, ve.NewFiles[i]) 2005 } 2006 } 2007 ve.NewFiles = newNewFiles 2008 return nil 2009 } 2010 2011 func (d *DB) ingestApply( 2012 jobID int, 2013 lr ingestLoadResult, 2014 findTargetLevel ingestTargetLevelFunc, 2015 mut *memTable, 2016 exciseSpan KeyRange, 2017 ) (*versionEdit, error) { 2018 d.mu.Lock() 2019 defer d.mu.Unlock() 2020 2021 ve := &versionEdit{ 2022 NewFiles: make([]newFileEntry, lr.fileCount), 2023 } 2024 if exciseSpan.Valid() || (d.opts.Experimental.IngestSplit != nil && d.opts.Experimental.IngestSplit()) { 2025 ve.DeletedFiles = map[manifest.DeletedFileEntry]*manifest.FileMetadata{} 2026 } 2027 metrics := make(map[int]*LevelMetrics) 2028 2029 // Lock the manifest for writing before we use the current version to 2030 // determine the target level. This prevents two concurrent ingestion jobs 2031 // from using the same version to determine the target level, and also 2032 // provides serialization with concurrent compaction and flush jobs. 2033 // logAndApply unconditionally releases the manifest lock, but any earlier 2034 // returns must unlock the manifest. 2035 d.mu.versions.logLock() 2036 2037 if mut != nil { 2038 // Unref the mutable memtable to allow its flush to proceed. Now that we've 2039 // acquired the manifest lock, we can be certain that if the mutable 2040 // memtable has received more recent conflicting writes, the flush won't 2041 // beat us to applying to the manifest, resulting in sequence number 2042 // inversion. Even though we call maybeScheduleFlush right now, this flush 2043 // will apply after our ingestion. 2044 if mut.writerUnref() { 2045 d.maybeScheduleFlush() 2046 } 2047 } 2048 2049 shouldIngestSplit := d.opts.Experimental.IngestSplit != nil && 2050 d.opts.Experimental.IngestSplit() && d.FormatMajorVersion() >= FormatVirtualSSTables 2051 current := d.mu.versions.currentVersion() 2052 baseLevel := d.mu.versions.picker.getBaseLevel() 2053 iterOps := IterOptions{logger: d.opts.Logger} 2054 // filesToSplit is a list where each element is a pair consisting of a file 2055 // being ingested and a file being split to make room for an ingestion into 2056 // that level. Each ingested file will appear at most once in this list. It 2057 // is possible for split files to appear twice in this list. 2058 filesToSplit := make([]ingestSplitFile, 0) 2059 checkCompactions := false 2060 for i := 0; i < lr.fileCount; i++ { 2061 // Determine the lowest level in the LSM for which the sstable doesn't 2062 // overlap any existing files in the level. 2063 var m *fileMetadata 2064 sharedIdx := -1 2065 sharedLevel := -1 2066 externalFile := false 2067 if i < len(lr.localMeta) { 2068 // local file. 2069 m = lr.localMeta[i] 2070 } else if (i - len(lr.localMeta)) < len(lr.sharedMeta) { 2071 // shared file. 2072 sharedIdx = i - len(lr.localMeta) 2073 m = lr.sharedMeta[sharedIdx] 2074 sharedLevel = int(lr.sharedLevels[sharedIdx]) 2075 } else { 2076 // external file.
2077 externalFile = true 2078 m = lr.externalMeta[i-(len(lr.localMeta)+len(lr.sharedMeta))] 2079 } 2080 f := &ve.NewFiles[i] 2081 var err error 2082 if sharedIdx >= 0 { 2083 f.Level = sharedLevel 2084 if f.Level < sharedLevelsStart { 2085 panic("cannot slot a shared file higher than the highest shared level") 2086 } 2087 ve.CreatedBackingTables = append(ve.CreatedBackingTables, m.FileBacking) 2088 } else { 2089 if externalFile { 2090 ve.CreatedBackingTables = append(ve.CreatedBackingTables, m.FileBacking) 2091 } 2092 var splitFile *fileMetadata 2093 if exciseSpan.Valid() && exciseSpan.Contains(d.cmp, m.Smallest) && exciseSpan.Contains(d.cmp, m.Largest) { 2094 // This file fits perfectly within the excise span. We can slot it at 2095 // L6, or sharedLevelsStart - 1 if we have shared files. 2096 if len(lr.sharedMeta) > 0 { 2097 f.Level = sharedLevelsStart - 1 2098 if baseLevel > f.Level { 2099 f.Level = 0 2100 } 2101 } else { 2102 f.Level = 6 2103 } 2104 } else { 2105 // TODO(bilal): findTargetLevel does disk IO (reading files for data 2106 // overlap) even though we're holding onto d.mu. Consider unlocking 2107 // d.mu while we do this. We already hold versions.logLock so we should 2108 // not see any version applications while we're at this. The one 2109 // complication here would be pulling out the mu.compact.inProgress 2110 // check from findTargetLevel, as that requires d.mu to be held. 2111 f.Level, splitFile, err = findTargetLevel( 2112 d.newIters, d.tableNewRangeKeyIter, iterOps, d.opts.Comparer, current, baseLevel, d.mu.compact.inProgress, m, shouldIngestSplit) 2113 } 2114 2115 if splitFile != nil { 2116 if invariants.Enabled { 2117 if lf := current.Levels[f.Level].Find(d.cmp, splitFile); lf == nil { 2118 panic("splitFile returned is not in level it should be") 2119 } 2120 } 2121 // We take advantage of the fact that we won't drop the db mutex 2122 // between now and the call to logAndApply. So, no files should 2123 // get added to a new in-progress compaction at this point. We can 2124 // avoid having to iterate on in-progress compactions to cancel them 2125 // if none of the files being split have a compacting state. 2126 if splitFile.IsCompacting() { 2127 checkCompactions = true 2128 } 2129 filesToSplit = append(filesToSplit, ingestSplitFile{ingestFile: m, splitFile: splitFile, level: f.Level}) 2130 } 2131 } 2132 if err != nil { 2133 d.mu.versions.logUnlock() 2134 return nil, err 2135 } 2136 f.Meta = m 2137 levelMetrics := metrics[f.Level] 2138 if levelMetrics == nil { 2139 levelMetrics = &LevelMetrics{} 2140 metrics[f.Level] = levelMetrics 2141 } 2142 levelMetrics.NumFiles++ 2143 levelMetrics.Size += int64(m.Size) 2144 levelMetrics.BytesIngested += m.Size 2145 levelMetrics.TablesIngested++ 2146 } 2147 // replacedFiles maps files excised due to exciseSpan (or splitFiles returned 2148 // by ingestTargetLevel), to files that were created to replace it. This map 2149 // is used to resolve references to split files in filesToSplit, as it is 2150 // possible for a file that we want to split to no longer exist or have a 2151 // newer fileMetadata due to a split induced by another ingestion file, or an 2152 // excise. 
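	// Both the excise loop below and d.ingestSplit add entries to
	// replacedFiles: a file number maps to the virtual files created in its
	// place. A file reached through this map may itself have been replaced
	// by a later excise or split, which is why ingestSplit walks the map as
	// a chain until it finds a file with no entry.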
2153 replacedFiles := make(map[base.FileNum][]newFileEntry) 2154 updateLevelMetricsOnExcise := func(m *fileMetadata, level int, added []newFileEntry) { 2155 levelMetrics := metrics[level] 2156 if levelMetrics == nil { 2157 levelMetrics = &LevelMetrics{} 2158 metrics[level] = levelMetrics 2159 } 2160 levelMetrics.NumFiles-- 2161 levelMetrics.Size -= int64(m.Size) 2162 for i := range added { 2163 levelMetrics.NumFiles++ 2164 levelMetrics.Size += int64(added[i].Meta.Size) 2165 } 2166 } 2167 if exciseSpan.Valid() { 2168 // Iterate through all levels and find files that intersect with exciseSpan. 2169 // 2170 // TODO(bilal): We could drop the DB mutex here as we don't need it for 2171 // excises; we only need to hold the version lock which we already are 2172 // holding. However releasing the DB mutex could mess with the 2173 // ingestTargetLevel calculation that happened above, as it assumed that it 2174 // had a complete view of in-progress compactions that wouldn't change 2175 // until logAndApply is called. If we were to drop the mutex now, we could 2176 // schedule another in-progress compaction that would go into the chosen target 2177 // level and lead to file overlap within level (which would panic in 2178 // logAndApply). We should drop the db mutex here, do the excise, then 2179 // re-grab the DB mutex and rerun just the in-progress compaction check to 2180 // see if any new compactions are conflicting with our chosen target levels 2181 // for files, and if they are, we should signal those compactions to error 2182 // out. 2183 for level := range current.Levels { 2184 overlaps := current.Overlaps(level, d.cmp, exciseSpan.Start, exciseSpan.End, true /* exclusiveEnd */) 2185 iter := overlaps.Iter() 2186 2187 for m := iter.First(); m != nil; m = iter.Next() { 2188 newFiles, err := d.excise(exciseSpan, m, ve, level) 2189 if err != nil { 2190 return nil, err 2191 } 2192 2193 if _, ok := ve.DeletedFiles[deletedFileEntry{ 2194 Level: level, 2195 FileNum: m.FileNum, 2196 }]; !ok { 2197 // We did not excise this file. 2198 continue 2199 } 2200 replacedFiles[m.FileNum] = newFiles 2201 updateLevelMetricsOnExcise(m, level, newFiles) 2202 } 2203 } 2204 } 2205 if len(filesToSplit) > 0 { 2206 // For the same reasons as the above call to excise, we hold the db mutex 2207 // while calling this method. 2208 if err := d.ingestSplit(ve, updateLevelMetricsOnExcise, filesToSplit, replacedFiles); err != nil { 2209 return nil, err 2210 } 2211 } 2212 if len(filesToSplit) > 0 || exciseSpan.Valid() { 2213 for c := range d.mu.compact.inProgress { 2214 if c.versionEditApplied { 2215 continue 2216 } 2217 // Check if this compaction overlaps with the excise span. Note that just 2218 // checking if the inputs individually overlap with the excise span 2219 // isn't sufficient; for instance, a compaction could have [a,b] and [e,f] 2220 // as inputs and write it all out as [a,b,e,f] in one sstable. If we're 2221 // doing a [c,d) excise at the same time as this compaction, we will have 2222 // to error out the whole compaction as we can't guarantee it hasn't/won't 2223 // write a file overlapping with the excise span. 2224 if exciseSpan.OverlapsInternalKeyRange(d.cmp, c.smallest, c.largest) { 2225 c.cancel.Store(true) 2226 } 2227 // Check if this compaction's inputs have been replaced due to an 2228 // ingest-time split. In that case, cancel the compaction as a newly picked 2229 // compaction would need to include any new files that slid in between 2230 // previously-existing files. 
Note that we cancel any compaction that has a 2231 // file that was ingest-split as an input, even if it started before this 2232 // ingestion. 2233 if checkCompactions { 2234 for i := range c.inputs { 2235 iter := c.inputs[i].files.Iter() 2236 for f := iter.First(); f != nil; f = iter.Next() { 2237 if _, ok := replacedFiles[f.FileNum]; ok { 2238 c.cancel.Store(true) 2239 break 2240 } 2241 } 2242 } 2243 } 2244 } 2245 // Check for any EventuallyFileOnlySnapshots that could be watching for 2246 // an excise on this span. 2247 if exciseSpan.Valid() { 2248 for s := d.mu.snapshots.root.next; s != &d.mu.snapshots.root; s = s.next { 2249 if s.efos == nil { 2250 continue 2251 } 2252 efos := s.efos 2253 // TODO(bilal): We can make this faster by taking advantage of the sorted 2254 // nature of protectedRanges to do a sort.Search, or even maintaining a 2255 // global list of all protected ranges instead of having to peer into every 2256 // snapshot. 2257 for i := range efos.protectedRanges { 2258 if efos.protectedRanges[i].OverlapsKeyRange(d.cmp, exciseSpan) { 2259 efos.excised.Store(true) 2260 break 2261 } 2262 } 2263 } 2264 } 2265 } 2266 if err := d.mu.versions.logAndApply(jobID, ve, metrics, false /* forceRotation */, func() []compactionInfo { 2267 return d.getInProgressCompactionInfoLocked(nil) 2268 }); err != nil { 2269 return nil, err 2270 } 2271 2272 d.mu.versions.metrics.Ingest.Count++ 2273 2274 d.updateReadStateLocked(d.opts.DebugCheck) 2275 // updateReadStateLocked could have generated obsolete tables, schedule a 2276 // cleanup job if necessary. 2277 d.deleteObsoleteFiles(jobID) 2278 d.updateTableStatsLocked(ve.NewFiles) 2279 // The ingestion may have pushed a level over the threshold for compaction, 2280 // so check to see if one is necessary and schedule it. 2281 d.maybeScheduleCompaction() 2282 var toValidate []manifest.NewFileEntry 2283 dedup := make(map[base.DiskFileNum]struct{}) 2284 for _, entry := range ve.NewFiles { 2285 if _, ok := dedup[entry.Meta.FileBacking.DiskFileNum]; !ok { 2286 toValidate = append(toValidate, entry) 2287 dedup[entry.Meta.FileBacking.DiskFileNum] = struct{}{} 2288 } 2289 } 2290 d.maybeValidateSSTablesLocked(toValidate) 2291 return ve, nil 2292 } 2293 2294 // maybeValidateSSTablesLocked adds the slice of newFileEntrys to the pending 2295 // queue of files to be validated, when the feature is enabled. 2296 // 2297 // Note that if two entries with the same backing file are added twice, then the 2298 // block checksums for the backing file will be validated twice. 2299 // 2300 // DB.mu must be locked when calling. 2301 func (d *DB) maybeValidateSSTablesLocked(newFiles []newFileEntry) { 2302 // Only add to the validation queue when the feature is enabled. 2303 if !d.opts.Experimental.ValidateOnIngest { 2304 return 2305 } 2306 2307 d.mu.tableValidation.pending = append(d.mu.tableValidation.pending, newFiles...) 2308 if d.shouldValidateSSTablesLocked() { 2309 go d.validateSSTables() 2310 } 2311 } 2312 2313 // shouldValidateSSTablesLocked returns true if SSTable validation should run. 2314 // DB.mu must be locked when calling. 2315 func (d *DB) shouldValidateSSTablesLocked() bool { 2316 return !d.mu.tableValidation.validating && 2317 d.closed.Load() == nil && 2318 d.opts.Experimental.ValidateOnIngest && 2319 len(d.mu.tableValidation.pending) > 0 2320 } 2321 2322 // validateSSTables runs a round of validation on the tables in the pending 2323 // queue. 
2324 func (d *DB) validateSSTables() { 2325 d.mu.Lock() 2326 if !d.shouldValidateSSTablesLocked() { 2327 d.mu.Unlock() 2328 return 2329 } 2330 2331 pending := d.mu.tableValidation.pending 2332 d.mu.tableValidation.pending = nil 2333 d.mu.tableValidation.validating = true 2334 jobID := d.mu.nextJobID 2335 d.mu.nextJobID++ 2336 rs := d.loadReadState() 2337 2338 // Drop DB.mu before performing IO. 2339 d.mu.Unlock() 2340 2341 // Validate all tables in the pending queue. This could lead to a situation 2342 // where we are starving IO from other tasks due to having to page through 2343 // all the blocks in all the sstables in the queue. 2344 // TODO(travers): Add some form of pacing to avoid IO starvation. 2345 2346 // If we fail to validate any files due to reasons other than uncovered 2347 // corruption, accumulate them and re-queue them for another attempt. 2348 var retry []manifest.NewFileEntry 2349 2350 for _, f := range pending { 2351 // The file may have been moved or deleted since it was ingested, in 2352 // which case we skip. 2353 if !rs.current.Contains(f.Level, d.cmp, f.Meta) { 2354 // Assume the file was moved to a lower level. It is rare enough 2355 // that a table is moved or deleted between the time it was ingested 2356 // and the time the validation routine runs that the overall cost of 2357 // this inner loop is tolerably low, when amortized over all 2358 // ingested tables. 2359 found := false 2360 for i := f.Level + 1; i < numLevels; i++ { 2361 if rs.current.Contains(i, d.cmp, f.Meta) { 2362 found = true 2363 break 2364 } 2365 } 2366 if !found { 2367 continue 2368 } 2369 } 2370 2371 var err error 2372 if f.Meta.Virtual { 2373 err = d.tableCache.withVirtualReader( 2374 f.Meta.VirtualMeta(), func(v sstable.VirtualReader) error { 2375 return v.ValidateBlockChecksumsOnBacking() 2376 }) 2377 } else { 2378 err = d.tableCache.withReader( 2379 f.Meta.PhysicalMeta(), func(r *sstable.Reader) error { 2380 return r.ValidateBlockChecksums() 2381 }) 2382 } 2383 2384 if err != nil { 2385 if IsCorruptionError(err) { 2386 // TODO(travers): Hook into the corruption reporting pipeline, once 2387 // available. See pebble#1192. 2388 d.opts.Logger.Fatalf("pebble: encountered corruption during ingestion: %s", err) 2389 } else { 2390 // If there was some other, possibly transient, error that 2391 // caused table validation to fail inform the EventListener and 2392 // move on. We remember the table so that we can retry it in a 2393 // subsequent table validation job. 2394 // 2395 // TODO(jackson): If the error is not transient, this will retry 2396 // validation indefinitely. While not great, it's the same 2397 // behavior as erroring flushes and compactions. We should 2398 // address this as a part of #270. 2399 d.opts.EventListener.BackgroundError(err) 2400 retry = append(retry, f) 2401 continue 2402 } 2403 } 2404 2405 d.opts.EventListener.TableValidated(TableValidatedInfo{ 2406 JobID: jobID, 2407 Meta: f.Meta, 2408 }) 2409 } 2410 rs.unref() 2411 d.mu.Lock() 2412 defer d.mu.Unlock() 2413 d.mu.tableValidation.pending = append(d.mu.tableValidation.pending, retry...) 2414 d.mu.tableValidation.validating = false 2415 d.mu.tableValidation.cond.Broadcast() 2416 if d.shouldValidateSSTablesLocked() { 2417 go d.validateSSTables() 2418 } 2419 }
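// The body of validateSSTables above follows a drain/process/requeue shape:
// steal the pending queue under the mutex, drop the mutex for the slow IO,
// and re-queue transient failures at the end. A minimal sketch of that
// shape, using hypothetical names rather than pebble's internal types:
//
//	func drainAndProcess[T any](mu *sync.Mutex, pending *[]T, process func(T) error) {
//		mu.Lock()
//		batch := *pending
//		*pending = nil
//		mu.Unlock()
//
//		var retry []T
//		for _, item := range batch {
//			if err := process(item); err != nil {
//				// Transient failure: remember the item for a later pass.
//				retry = append(retry, item)
//			}
//		}
//
//		mu.Lock()
//		*pending = append(*pending, retry...)
//		mu.Unlock()
//	}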