github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/ingest.go

// Copyright 2018 The LevelDB-Go and Pebble Authors and Bitalostored Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package bitalostable

import (
	"sort"
	"time"

	"github.com/cockroachdb/errors"
	"github.com/zuoyebang/bitalostable/internal/base"
	"github.com/zuoyebang/bitalostable/internal/keyspan"
	"github.com/zuoyebang/bitalostable/internal/manifest"
	"github.com/zuoyebang/bitalostable/internal/private"
	"github.com/zuoyebang/bitalostable/sstable"
	"github.com/zuoyebang/bitalostable/vfs"
)

func sstableKeyCompare(userCmp Compare, a, b InternalKey) int {
	c := userCmp(a.UserKey, b.UserKey)
	if c != 0 {
		return c
	}
	if a.Trailer == InternalKeyRangeDeleteSentinel {
		if b.Trailer != InternalKeyRangeDeleteSentinel {
			return -1
		}
	} else if b.Trailer == InternalKeyRangeDeleteSentinel {
		return 1
	}
	return 0
}
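
// For intuition, a minimal sketch of the sentinel ordering this comparison
// implements. The constructor names follow upstream Pebble's internal/base
// and are assumptions here; the keys are hypothetical:
//
//	a := base.MakeRangeDeleteSentinelKey([]byte("k")) // exclusive bound at "k"
//	b := base.MakeInternalKey([]byte("k"), 5, base.InternalKeyKindSet)
//	// sstableKeyCompare(cmp, a, b) == -1: the sentinel sorts before any
//	// other key with the same user key, so a table whose largest key is an
//	// exclusive bound at "k" does not overlap a table starting at "k".
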
func ingestValidateKey(opts *Options, key *InternalKey) error {
	if key.Kind() == InternalKeyKindInvalid {
		return base.CorruptionErrorf("bitalostable: external sstable has corrupted key: %s",
			key.Pretty(opts.Comparer.FormatKey))
	}
	if key.SeqNum() != 0 {
		return base.CorruptionErrorf("bitalostable: external sstable has non-zero seqnum: %s",
			key.Pretty(opts.Comparer.FormatKey))
	}
	return nil
}

func ingestLoad1(
	opts *Options, fmv FormatMajorVersion, path string, cacheID uint64, fileNum FileNum,
) (*fileMetadata, error) {
	stat, err := opts.FS.Stat(path)
	if err != nil {
		return nil, err
	}

	f, err := opts.FS.Open(path)
	if err != nil {
		return nil, err
	}

	cacheOpts := private.SSTableCacheOpts(cacheID, fileNum).(sstable.ReaderOption)
	r, err := sstable.NewReader(f, opts.MakeReaderOptions(), cacheOpts)
	if err != nil {
		return nil, err
	}
	defer r.Close()

	// Avoid ingesting tables with format versions this DB doesn't support.
	tf, err := r.TableFormat()
	if err != nil {
		return nil, err
	}
	if tf < fmv.MinTableFormat() || tf > fmv.MaxTableFormat() {
		return nil, errors.Newf(
			"bitalostable: table format %s is not within range supported at DB format major version %d, (%s,%s)",
			tf, fmv, fmv.MinTableFormat(), fmv.MaxTableFormat(),
		)
	}

	meta := &fileMetadata{}
	meta.FileNum = fileNum
	meta.Size = uint64(stat.Size())
	meta.CreationTime = time.Now().Unix()

	// Avoid loading into the table cache for collecting stats if we
	// don't need to. If there are no range deletions, we have all the
	// information to compute the stats here.
	//
	// This is helpful in tests for avoiding awkwardness around deletion of
	// ingested files from MemFS. MemFS implements the Windows semantics of
	// disallowing removal of an open file. Under MemFS, if we don't populate
	// meta.Stats here, the file will be loaded into the table cache for
	// calculating stats before we can remove the original link.
	maybeSetStatsFromProperties(meta, &r.Properties)

	{
		iter, err := r.NewIter(nil /* lower */, nil /* upper */)
		if err != nil {
			return nil, err
		}
		defer iter.Close()
		var smallest InternalKey
		if key, _ := iter.First(); key != nil {
			if err := ingestValidateKey(opts, key); err != nil {
				return nil, err
			}
			smallest = (*key).Clone()
		}
		if err := iter.Error(); err != nil {
			return nil, err
		}
		if key, _ := iter.Last(); key != nil {
			if err := ingestValidateKey(opts, key); err != nil {
				return nil, err
			}
			meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallest, key.Clone())
		}
		if err := iter.Error(); err != nil {
			return nil, err
		}
	}

	iter, err := r.NewRawRangeDelIter()
	if err != nil {
		return nil, err
	}
	if iter != nil {
		defer iter.Close()
		var smallest InternalKey
		if s := iter.First(); s != nil {
			key := s.SmallestKey()
			if err := ingestValidateKey(opts, &key); err != nil {
				return nil, err
			}
			smallest = key.Clone()
		}
		if err := iter.Error(); err != nil {
			return nil, err
		}
		if s := iter.Last(); s != nil {
			k := s.SmallestKey()
			if err := ingestValidateKey(opts, &k); err != nil {
				return nil, err
			}
			largest := s.LargestKey().Clone()
			meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallest, largest)
		}
	}

	// Update the range-key bounds for the table.
	{
		iter, err := r.NewRawRangeKeyIter()
		if err != nil {
			return nil, err
		}
		if iter != nil {
			defer iter.Close()
			var smallest InternalKey
			if s := iter.First(); s != nil {
				key := s.SmallestKey()
				if err := ingestValidateKey(opts, &key); err != nil {
					return nil, err
				}
				smallest = key.Clone()
			}
			if err := iter.Error(); err != nil {
				return nil, err
			}
			if s := iter.Last(); s != nil {
				k := s.SmallestKey()
				if err := ingestValidateKey(opts, &k); err != nil {
					return nil, err
				}
				// As range keys are fragmented, the end key of the last range key in
				// the table provides the upper bound for the table.
				largest := s.LargestKey().Clone()
				meta.ExtendRangeKeyBounds(opts.Comparer.Compare, smallest, largest)
			}
			if err := iter.Error(); err != nil {
				return nil, err
			}
		}
	}

	if !meta.HasPointKeys && !meta.HasRangeKeys {
		return nil, nil
	}

	// Sanity check that the various bounds on the file were set consistently.
	if err := meta.Validate(opts.Comparer.Compare, opts.Comparer.FormatKey); err != nil {
		return nil, err
	}

	return meta, nil
}

func ingestLoad(
	opts *Options, fmv FormatMajorVersion, paths []string, cacheID uint64, pending []FileNum,
) ([]*fileMetadata, []string, error) {
	meta := make([]*fileMetadata, 0, len(paths))
	newPaths := make([]string, 0, len(paths))
	for i := range paths {
		m, err := ingestLoad1(opts, fmv, paths[i], cacheID, pending[i])
		if err != nil {
			return nil, nil, err
		}
		if m != nil {
			meta = append(meta, m)
			newPaths = append(newPaths, paths[i])
		}
	}
	return meta, newPaths, nil
}
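
// To make ingestLoad1's bound computation concrete, a worked example with
// hypothetical contents: a table holding the point keys a#0,SET and c#0,SET
// plus the range deletion [b, e). The iterator passes above combine into:
//
//	point iterator:     smallest a#0,SET, largest c#0,SET
//	range-del iterator: smallest b#0,RANGEDEL, largest e#inf,RANGEDEL
//	resulting bounds:   [a#0,SET, e#inf,RANGEDEL]
//
// Range deletions extend the point-key bounds (ExtendPointKeyBounds), while
// range keys feed the separate range-key bounds (ExtendRangeKeyBounds);
// meta.Smallest/Largest cover both.
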
// Struct for sorting metadatas by smallest user keys, while ensuring the
// matching path also gets swapped to the same index. For use in
// ingestSortAndVerify.
type metaAndPaths struct {
	meta  []*fileMetadata
	paths []string
	cmp   Compare
}

func (m metaAndPaths) Len() int {
	return len(m.meta)
}

func (m metaAndPaths) Less(i, j int) bool {
	return m.cmp(m.meta[i].Smallest.UserKey, m.meta[j].Smallest.UserKey) < 0
}

func (m metaAndPaths) Swap(i, j int) {
	m.meta[i], m.meta[j] = m.meta[j], m.meta[i]
	m.paths[i], m.paths[j] = m.paths[j], m.paths[i]
}

func ingestSortAndVerify(cmp Compare, meta []*fileMetadata, paths []string) error {
	if len(meta) <= 1 {
		return nil
	}

	sort.Sort(&metaAndPaths{
		meta:  meta,
		paths: paths,
		cmp:   cmp,
	})

	for i := 1; i < len(meta); i++ {
		if sstableKeyCompare(cmp, meta[i-1].Largest, meta[i].Smallest) >= 0 {
			return errors.New("bitalostable: external sstables have overlapping ranges")
		}
	}
	return nil
}

func ingestCleanup(fs vfs.FS, dirname string, meta []*fileMetadata) error {
	var firstErr error
	for i := range meta {
		target := base.MakeFilepath(fs, dirname, fileTypeTable, meta[i].FileNum)
		if err := fs.Remove(target); err != nil {
			firstErr = firstError(firstErr, err)
		}
	}
	return firstErr
}
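
// Note the interaction between the verification loop above and
// sstableKeyCompare: a range-deletion sentinel is an exclusive upper bound,
// so abutting tables are accepted while truly overlapping ones are not.
// With hypothetical bounds:
//
//	t1: [a#0,SET, c#inf,RANGEDEL], t2: [c#0,SET, e#0,SET] // accepted
//	t1: [a#0,SET, c#0,SET],        t2: [c#0,SET, e#0,SET] // rejected
//
// In the first case the sentinel at c compares strictly less than the point
// key c#0,SET, so the >= 0 overlap check does not fire.
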
func ingestLink(
	jobID int, opts *Options, dirname string, paths []string, meta []*fileMetadata,
) error {
	// Wrap the normal filesystem with one which wraps newly created files with
	// vfs.NewSyncingFile.
	fs := syncingFS{
		FS: opts.FS,
		syncOpts: vfs.SyncingFileOptions{
			NoSyncOnClose: opts.NoSyncOnClose,
			BytesPerSync:  opts.BytesPerSync,
		},
	}

	for i := range paths {
		target := base.MakeFilepath(fs, dirname, fileTypeTable, meta[i].FileNum)
		var err error
		if _, ok := opts.FS.(*vfs.MemFS); ok && opts.DebugCheck != nil {
			// The combination of MemFS+Ingest+DebugCheck produces awkwardness around
			// the subsequent deletion of files. The problem is that MemFS implements
			// the Windows semantics of disallowing removal of an open file. This is
			// desirable because it helps catch bugs where we violate the
			// requirements of the Windows semantics. The normal practice for Ingest
			// is for the caller to remove the source files after the ingest
			// completes successfully. Unfortunately, Options.DebugCheck causes
			// ingest to run DB.CheckLevels() before the ingest finishes, and
			// DB.CheckLevels() populates the table cache with the newly ingested
			// files.
			//
			// The combination of MemFS+Ingest+DebugCheck is primarily used in
			// tests. As a workaround, disable hard linking when this combination
			// occurs. See https://github.com/zuoyebang/bitalostable/issues/495.
			err = vfs.Copy(fs, paths[i], target)
		} else {
			err = vfs.LinkOrCopy(fs, paths[i], target)
		}
		if err != nil {
			if err2 := ingestCleanup(fs, dirname, meta[:i]); err2 != nil {
				opts.Logger.Infof("ingest cleanup failed: %v", err2)
			}
			return err
		}
		if opts.EventListener.TableCreated != nil {
			opts.EventListener.TableCreated(TableCreateInfo{
				JobID:   jobID,
				Reason:  "ingesting",
				Path:    target,
				FileNum: meta[i].FileNum,
			})
		}
	}

	return nil
}

func ingestMemtableOverlaps(cmp Compare, mem flushable, meta []*fileMetadata) bool {
	iter := mem.newIter(nil)
	rangeDelIter := mem.newRangeDelIter(nil)
	defer iter.Close()

	if rangeDelIter != nil {
		defer rangeDelIter.Close()
	}

	for _, m := range meta {
		if overlapWithIterator(iter, &rangeDelIter, m, cmp) {
			return true
		}
	}
	return false
}

func ingestUpdateSeqNum(
	cmp Compare, format base.FormatKey, seqNum uint64, meta []*fileMetadata,
) error {
	setSeqFn := func(k base.InternalKey) base.InternalKey {
		return base.MakeInternalKey(k.UserKey, seqNum, k.Kind())
	}
	for _, m := range meta {
		// NB: we set the fields directly here, rather than via their Extend*
		// methods, as we are updating sequence numbers.
		if m.HasPointKeys {
			m.SmallestPointKey = setSeqFn(m.SmallestPointKey)
		}
		if m.HasRangeKeys {
			m.SmallestRangeKey = setSeqFn(m.SmallestRangeKey)
		}
		m.Smallest = setSeqFn(m.Smallest)
		// Only update the seqnum for the largest key if that key is not an
		// "exclusive sentinel" (i.e. a range deletion sentinel or a range key
		// boundary), as doing so effectively drops the exclusive sentinel (by
		// lowering the seqnum from the max value), and extends the bounds of the
		// table.
		// NB: as the largest range key is always an exclusive sentinel, it is never
		// updated.
		if m.HasPointKeys && !m.LargestPointKey.IsExclusiveSentinel() {
			m.LargestPointKey = setSeqFn(m.LargestPointKey)
		}
		if !m.Largest.IsExclusiveSentinel() {
			m.Largest = setSeqFn(m.Largest)
		}
		// Setting smallestSeqNum == largestSeqNum triggers the setting of
		// Properties.GlobalSeqNum when an sstable is loaded.
		m.SmallestSeqNum = seqNum
		m.LargestSeqNum = seqNum
		// Ensure the new bounds are consistent.
		if err := m.Validate(cmp, format); err != nil {
			return err
		}
		seqNum++
	}
	return nil
}
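
// A worked example of the assignment above, with hypothetical tables and an
// ingestion granted seqNum=100 (each table consumes one number):
//
//	table 0: [a#0,SET, b#0,SET]        -> [a#100,SET, b#100,SET]
//	table 1: [c#0,SET, d#inf,RANGEDEL] -> [c#101,SET, d#inf,RANGEDEL]
//
// The sentinel at d keeps its maximal trailer, so the bound stays exclusive.
// Only the metadata changes; readers later observe the sequence number
// through Properties.GlobalSeqNum because SmallestSeqNum == LargestSeqNum.
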
func overlapWithIterator(
	iter internalIterator, rangeDelIter *keyspan.FragmentIterator, meta *fileMetadata, cmp Compare,
) bool {
	// Check overlap with point operations.
	//
	// When using levelIter, it seeks to the SST whose boundaries
	// contain meta.Smallest.UserKey (S).
	// It then tries to find a point in that SST that is >= S.
	// If there's no such point, it means the SST ends in a tombstone, in which
	// case levelIter.SeekGE generates a boundary range del sentinel.
	// The comparison of this boundary with meta.Largest (L) below
	// is subtle but maintains correctness.
	// 1) boundary < L,
	//    since boundary is also > S (initial seek),
	//    whatever the boundary's start key may be, we're always overlapping.
	// 2) boundary > L,
	//    overlap with boundary cannot be determined since we don't know
	//    boundary's start key. We require checking for overlap with
	//    rangeDelIter.
	// 3) boundary == L and L is not a sentinel,
	//    the boundary (being a sentinel) sorts before L, so this reduces
	//    to 1).
	// 4) boundary == L and L is a sentinel,
	//    we'll always overlap since for any values of i,j the ranges [i, k)
	//    and [j, k) always overlap.
	key, _ := iter.SeekGE(meta.Smallest.UserKey, base.SeekGEFlagsNone)
	if key != nil {
		c := sstableKeyCompare(cmp, *key, meta.Largest)
		if c <= 0 {
			return true
		}
	}

	// Check overlap with range deletions.
	if rangeDelIter == nil || *rangeDelIter == nil {
		return false
	}
	rangeDelItr := *rangeDelIter
	rangeDel := rangeDelItr.SeekLT(meta.Smallest.UserKey)
	if rangeDel == nil {
		rangeDel = rangeDelItr.Next()
	}
	for ; rangeDel != nil; rangeDel = rangeDelItr.Next() {
		key := rangeDel.SmallestKey()
		c := sstableKeyCompare(cmp, key, meta.Largest)
		if c > 0 {
			// The start of the tombstone is after the largest key in the
			// ingested table.
			return false
		}
		if cmp(rangeDel.End, meta.Smallest.UserKey) > 0 {
			// The end of the tombstone is greater than the smallest in the
			// table. Note that the tombstone end key is exclusive, thus ">0"
			// instead of ">=0".
			return true
		}
	}
	return false
}
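
// To make the exclusivity of tombstone end keys concrete (shapes
// hypothetical): if the ingested table spans [e, g] and the level holds the
// range deletion [a, e), then
//
//	cmp(rangeDel.End, meta.Smallest.UserKey) == cmp("e", "e") == 0
//
// which is not > 0, so no overlap is reported; the tombstone stops just
// short of e. A tombstone [a, f) over the same table would report overlap.
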
func ingestTargetLevel(
	newIters tableNewIters,
	iterOps IterOptions,
	cmp Compare,
	v *version,
	baseLevel int,
	compactions map[*compaction]struct{},
	meta *fileMetadata,
) (int, error) {
	// Find the lowest level which does not have any files which overlap meta. We
	// search from L0 to L6 looking for whether there are any files in the level
	// which overlap meta. We want the "lowest" level (where lower means
	// increasing level number) in order to reduce write amplification.
	//
	// There are 2 kinds of overlap we need to check for: file boundary overlap
	// and data overlap. Data overlap implies file boundary overlap. Note that it
	// is always possible to ingest into L0.
	//
	// To place meta at level i where i > 0:
	// - there must not be any data overlap with levels <= i, since that will
	//   violate the sequence number invariant.
	// - no file boundary overlap with level i, since that will violate the
	//   invariant that files do not overlap in levels i > 0.
	//
	// The file boundary overlap check is simpler to conceptualize. Consider the
	// following example, in which the ingested file lies completely before or
	// after the file being considered.
	//
	//   |--|           |--|   ingested file: [a,b] or [f,g]
	//         |-----|         existing file: [c,e]
	//  _____________________
	//   a  b  c  d  e  f  g
	//
	// In both cases the ingested file can move to considering the next level.
	//
	// File boundary overlap does not necessarily imply data overlap. The check
	// for data overlap is a little more nuanced. Consider the following examples:
	//
	// 1. No data overlap:
	//
	//          |-|        |--|  ingested file: [cc-d] or [ee-ff]
	//  |*--*--*----*------*|    existing file: [a-g], points: [a, b, c, dd, g]
	//  _____________________
	//   a  b  c  d  e  f  g
	//
	// In this case the ingested files can "fall through" this level. The checks
	// continue at the next level.
	//
	// 2. Data overlap:
	//
	//            |--|           ingested file: [d-e]
	//  |*--*--*----*------*|    existing file: [a-g], points: [a, b, c, dd, g]
	//  _____________________
	//   a  b  c  d  e  f  g
	//
	// In this case the file cannot be ingested into this level as the point 'dd'
	// is in the way.
	//
	// It is worth noting that the check for data overlap is only approximate. In
	// the previous example, the ingested table [d-e] could contain only the
	// points 'd' and 'e', in which case the table would be eligible for
	// considering lower levels. However, such a fine-grained check would need to
	// be exhaustive (comparing points and ranges in both the ingested and
	// existing tables) and such a check is prohibitively expensive. Thus Pebble
	// treats any existing point that falls within the ingested table bounds as
	// being "data overlap".

	targetLevel := 0

	// Do we overlap with keys in L0?
	iter := v.Levels[0].Iter()
	for meta0 := iter.First(); meta0 != nil; meta0 = iter.Next() {
		c1 := sstableKeyCompare(cmp, meta.Smallest, meta0.Largest)
		c2 := sstableKeyCompare(cmp, meta.Largest, meta0.Smallest)
		if c1 > 0 || c2 < 0 {
			continue
		}

		iter, rangeDelIter, err := newIters(iter.Current(), nil, internalIterOpts{})
		if err != nil {
			return 0, err
		}
		overlap := overlapWithIterator(iter, &rangeDelIter, meta, cmp)
		iter.Close()
		if rangeDelIter != nil {
			rangeDelIter.Close()
		}
		if overlap {
			return targetLevel, nil
		}
	}

	level := baseLevel
	for ; level < numLevels; level++ {
		levelIter := newLevelIter(iterOps, cmp, nil /* split */, newIters,
			v.Levels[level].Iter(), manifest.Level(level), nil)
		var rangeDelIter keyspan.FragmentIterator
		// Pass in a non-nil pointer to rangeDelIter so that levelIter.findFileGE
		// sets it up for the target file.
		levelIter.initRangeDel(&rangeDelIter)
		overlap := overlapWithIterator(levelIter, &rangeDelIter, meta, cmp)
		levelIter.Close() // Closes range del iter as well.
		if overlap {
			return targetLevel, nil
		}

		// Check boundary overlap.
		boundaryOverlaps := v.Overlaps(level, cmp, meta.Smallest.UserKey,
			meta.Largest.UserKey, meta.Largest.IsExclusiveSentinel())
		if !boundaryOverlaps.Empty() {
			continue
		}

		// Check boundary overlap with any ongoing compactions.
		//
		// We cannot check for data overlap with the new SSTs compaction will
		// produce since compaction hasn't been done yet. However, there's no need
		// to check since all keys in them will either be from c.startLevel or
		// c.outputLevel, both levels having their data overlap already tested
		// negative (else we'd have returned earlier).
		overlaps := false
		for c := range compactions {
			if c.outputLevel == nil || level != c.outputLevel.level {
				continue
			}
			if cmp(meta.Smallest.UserKey, c.largest.UserKey) <= 0 &&
				cmp(meta.Largest.UserKey, c.smallest.UserKey) >= 0 {
				overlaps = true
				break
			}
		}
		if !overlaps {
			targetLevel = level
		}
	}
	return targetLevel, nil
}
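
// A worked example of the search above (all shapes hypothetical): the
// ingested table spans [d, e]; L0 through L2 are empty; baseLevel is 3; L3
// holds one file [a, g] whose points are {a, b, c, g}; L4 through L6 are
// empty. At L3 the data check passes but the boundary check overlaps, so the
// table falls through (continue). L4 through L6 show neither data nor
// boundary overlap and no conflicting compactions, so targetLevel ends at 6.
// Had L3 contained the point 'dd' inside [d, e], the data check would fail
// and the function would return 0, the last level known to be safe.
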
// Ingest ingests a set of sstables into the DB. Ingestion of the files is
// atomic and semantically equivalent to creating a single batch containing all
// of the mutations in the sstables. Ingestion may require the memtable to be
// flushed. The ingested sstable files are moved into the DB and must reside on
// the same filesystem as the DB. Sstables can be created for ingestion using
// sstable.Writer. On success, Ingest removes the input paths.
//
// All sstables *must* be Sync()'d by the caller after all bytes are written
// and before their file handles are closed; failure to do so could violate
// durability or lead to corrupted on-disk state. This method cannot, in a
// platform-and-FS-agnostic way, ensure that all sstables in the input are
// properly synced to disk. Opening new file handles and Sync()-ing them
// does not always guarantee durability; see the discussion here on that:
// https://github.com/zuoyebang/bitalostable/pull/835#issuecomment-663075379
//
// Ingestion loads each sstable into the lowest level of the LSM which it
// doesn't overlap (see ingestTargetLevel). If an sstable overlaps a memtable,
// ingestion forces the memtable to flush, and then waits for the flush to
// occur.
//
// The steps for ingestion are:
//
// 1. Allocate file numbers for every sstable being ingested.
// 2. Load the metadata for all sstables being ingested.
// 3. Sort the sstables by smallest key, verifying non-overlap.
// 4. Hard link (or copy) the sstables into the DB directory.
// 5. Allocate a sequence number to use for all of the entries in the
//    sstables. This is the step where overlap with memtables is
//    determined. If there is overlap, we remember the most recent memtable
//    that overlaps.
// 6. Update the sequence number in the ingested sstables.
// 7. Wait for the most recent memtable that overlaps to flush (if any).
// 8. Add the ingested sstables to the version (DB.ingestApply).
// 9. Publish the ingestion sequence number.
//
// Note that if the mutable memtable overlaps with ingestion, a flush of the
// memtable is forced, equivalent to DB.Flush. Additionally, subsequent
// mutations that get sequence numbers larger than the ingestion sequence
// number get queued up behind the ingestion waiting for it to complete. This
// can produce a noticeable hiccup in performance. See
// https://github.com/zuoyebang/bitalostable/issues/25 for an idea for how to fix
// this hiccup.
func (d *DB) Ingest(paths []string) error {
	if err := d.closed.Load(); err != nil {
		panic(err)
	}
	if d.opts.ReadOnly {
		return ErrReadOnly
	}
	_, err := d.ingest(paths, ingestTargetLevel)
	return err
}

// IngestOperationStats provides some information about where in the LSM the
// bytes were ingested.
type IngestOperationStats struct {
	// Bytes is the total bytes in the ingested sstables.
	Bytes uint64
	// ApproxIngestedIntoL0Bytes is the approximate number of bytes ingested
	// into L0.
	// Currently, this value is completely accurate, but we are allowing this to
	// be approximate once https://github.com/zuoyebang/bitalostable/issues/25 is
	// implemented.
	ApproxIngestedIntoL0Bytes uint64
}

// IngestWithStats does the same as Ingest, and additionally returns
// IngestOperationStats.
func (d *DB) IngestWithStats(paths []string) (IngestOperationStats, error) {
	if err := d.closed.Load(); err != nil {
		panic(err)
	}
	if d.opts.ReadOnly {
		return IngestOperationStats{}, ErrReadOnly
	}
	return d.ingest(paths, ingestTargetLevel)
}
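
// A minimal caller-side sketch for Ingest. Paths are hypothetical, and the
// sstable.Writer API shown follows upstream Pebble conventions; treat the
// exact signatures as assumptions:
//
//	f, err := vfs.Default.Create("/tmp/ext.sst")
//	if err != nil { /* handle error */ }
//	w := sstable.NewWriter(f, sstable.WriterOptions{})
//	_ = w.Set([]byte("a"), []byte("v1")) // keys are written at seqnum 0
//	_ = w.Set([]byte("b"), []byte("v2"))
//	_ = w.Close() // ensure the file is durably synced, per the contract above
//	if err := db.Ingest([]string{"/tmp/ext.sst"}); err != nil {
//		// On success Ingest removes the source file; on failure it is
//		// left in place for the caller.
//	}
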
func (d *DB) ingest(
	paths []string, targetLevelFunc ingestTargetLevelFunc,
) (IngestOperationStats, error) {
	// Allocate file numbers for all of the files being ingested and mark them as
	// pending in order to prevent them from being deleted. Note that this causes
	// the file number ordering to be out of alignment with sequence number
	// ordering. The sorting of L0 tables by sequence number avoids relying on
	// that (busted) invariant.
	d.mu.Lock()
	pendingOutputs := make([]FileNum, len(paths))
	for i := range paths {
		pendingOutputs[i] = d.mu.versions.getNextFileNum()
	}
	jobID := d.mu.nextJobID
	d.mu.nextJobID++
	d.mu.Unlock()

	// Load the metadata for all of the files being ingested. This step detects
	// and elides empty sstables.
	meta, paths, err := ingestLoad(d.opts, d.FormatMajorVersion(), paths, d.cacheID, pendingOutputs)
	if err != nil {
		return IngestOperationStats{}, err
	}
	if len(meta) == 0 {
		// All of the sstables to be ingested were empty. Nothing to do.
		return IngestOperationStats{}, nil
	}

	// Verify the sstables do not overlap.
	if err := ingestSortAndVerify(d.cmp, meta, paths); err != nil {
		return IngestOperationStats{}, err
	}

	// Hard link the sstables into the DB directory. Since the sstables aren't
	// referenced by a version, they won't be used. If the hard linking fails
	// (e.g. because the files reside on a different filesystem), ingestLink will
	// fall back to copying, and if that fails we undo our work and return an
	// error.
	if err := ingestLink(jobID, d.opts, d.dirname, paths, meta); err != nil {
		return IngestOperationStats{}, err
	}
	// Fsync the directory we added the tables to. We need to do this at some
	// point before we update the MANIFEST (via logAndApply), otherwise a crash
	// can have the tables referenced in the MANIFEST, but not present in the
	// directory.
	if err := d.dataDir.Sync(); err != nil {
		return IngestOperationStats{}, err
	}

	var mem *flushableEntry
	prepare := func() {
		// Note that d.commit.mu is held by commitPipeline when calling prepare.

		d.mu.Lock()
		defer d.mu.Unlock()

		// Check to see if any files overlap with any of the memtables. The queue
		// is ordered from oldest to newest with the mutable memtable being the
		// last element in the slice. We want to wait for the newest table that
		// overlaps.
		for i := len(d.mu.mem.queue) - 1; i >= 0; i-- {
			m := d.mu.mem.queue[i]
			if ingestMemtableOverlaps(d.cmp, m, meta) {
				mem = m
				if mem.flushable == d.mu.mem.mutable {
					err = d.makeRoomForWrite(nil, true)
				}
				mem.flushForced = true
				d.maybeScheduleFlush(true)
				return
			}
		}
	}

	var ve *versionEdit
	apply := func(seqNum uint64) {
		if err != nil {
			// An error occurred during prepare.
			return
		}

		// Update the sequence number for all of the sstables in the
		// metadata. Writing the metadata to the manifest when the
		// version edit is applied is the mechanism that persists the
		// sequence number. The sstables themselves are left unmodified.
		if err = ingestUpdateSeqNum(
			d.cmp, d.opts.Comparer.FormatKey, seqNum, meta,
		); err != nil {
			return
		}

		// If we overlapped with a memtable in prepare wait for the flush to
		// finish.
		if mem != nil {
			<-mem.flushed
		}

		// Assign the sstables to the correct level in the LSM and apply the
		// version edit.
		ve, err = d.ingestApply(jobID, meta, targetLevelFunc)
	}

	d.commit.AllocateSeqNum(len(meta), prepare, apply)

	if err != nil {
		if err2 := ingestCleanup(d.opts.FS, d.dirname, meta); err2 != nil {
			d.opts.Logger.Infof("ingest cleanup failed: %v", err2)
		}
	} else {
		for _, path := range paths {
			if err2 := d.opts.FS.Remove(path); err2 != nil {
				d.opts.Logger.Infof("ingest failed to remove original file: %s", err2)
			}
		}
	}

	info := TableIngestInfo{
		JobID:        jobID,
		GlobalSeqNum: meta[0].SmallestSeqNum,
		Err:          err,
	}
	var stats IngestOperationStats
	if ve != nil {
		info.Tables = make([]struct {
			TableInfo
			Level int
		}, len(ve.NewFiles))
		for i := range ve.NewFiles {
			e := &ve.NewFiles[i]
			info.Tables[i].Level = e.Level
			info.Tables[i].TableInfo = e.Meta.TableInfo()
			stats.Bytes += e.Meta.Size
			if e.Level == 0 {
				stats.ApproxIngestedIntoL0Bytes += e.Meta.Size
			}
		}
	}
	d.opts.EventListener.TableIngested(info)

	return stats, err
}
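
// The prepare/apply pair above follows the commit pipeline's contract:
// AllocateSeqNum reserves len(meta) consecutive sequence numbers, runs
// prepare while the commit mutex is held, and invokes apply(seqNum) once the
// ingestion's turn in the commit order arrives. A rough schematic of that
// contract (not the pipeline's actual implementation):
//
//	seqNum := reserve(count)    // consecutive seqnums seqNum..seqNum+count-1
//	prepare()                   // flush decisions, under the commit mutex
//	waitForEarlierCommits()
//	apply(seqNum)               // seqnum rewrite + version edit
//	publish(seqNum + count - 1) // make the ingested keys visible
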
type ingestTargetLevelFunc func(
	newIters tableNewIters,
	iterOps IterOptions,
	cmp Compare,
	v *version,
	baseLevel int,
	compactions map[*compaction]struct{},
	meta *fileMetadata,
) (int, error)

func (d *DB) ingestApply(
	jobID int, meta []*fileMetadata, findTargetLevel ingestTargetLevelFunc,
) (*versionEdit, error) {
	d.mu.Lock()
	defer d.mu.Unlock()

	ve := &versionEdit{
		NewFiles: make([]newFileEntry, len(meta)),
	}
	metrics := make(map[int]*LevelMetrics)

	// Lock the manifest for writing before we use the current version to
	// determine the target level. This prevents two concurrent ingestion jobs
	// from using the same version to determine the target level, and also
	// provides serialization with concurrent compaction and flush jobs.
	// logAndApply unconditionally releases the manifest lock, but any earlier
	// returns must unlock the manifest.
	d.mu.versions.logLock()
	current := d.mu.versions.currentVersion()
	baseLevel := d.mu.versions.picker.getBaseLevel()
	iterOps := IterOptions{logger: d.opts.Logger}
	for i := range meta {
		// Determine the lowest level in the LSM for which the sstable doesn't
		// overlap any existing files in the level.
		m := meta[i]
		f := &ve.NewFiles[i]
		var err error
		f.Level, err = findTargetLevel(d.newIters, iterOps, d.cmp, current, baseLevel, d.mu.compact.inProgress, m)
		if err != nil {
			d.mu.versions.logUnlock()
			return nil, err
		}
		f.Meta = m
		levelMetrics := metrics[f.Level]
		if levelMetrics == nil {
			levelMetrics = &LevelMetrics{}
			metrics[f.Level] = levelMetrics
		}
		levelMetrics.NumFiles++
		levelMetrics.Size += int64(m.Size)
		levelMetrics.BytesIngested += m.Size
		levelMetrics.TablesIngested++
	}
	if err := d.mu.versions.logAndApply(jobID, ve, metrics, false /* forceRotation */, func() []compactionInfo {
		return d.getInProgressCompactionInfoLocked(nil)
	}); err != nil {
		return nil, err
	}
	d.updateReadStateLocked(d.opts.DebugCheck)
	d.updateTableStatsLocked(ve.NewFiles)
	d.deleteObsoleteFiles(jobID, false /* waitForOngoing */)
	// The ingestion may have pushed a level over the threshold for compaction,
	// so check to see if one is necessary and schedule it.
	d.maybeScheduleCompaction()
	d.maybeValidateSSTablesLocked(ve.NewFiles)
	return ve, nil
}
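
// For a concrete picture of the version edit built above, suppose
// (hypothetically) two tables are ingested and the target-level search
// places them differently:
//
//	ve.NewFiles = []newFileEntry{
//		{Level: 6, Meta: m0}, // no overlap anywhere: sinks to the bottom
//		{Level: 0, Meta: m1}, // data overlap below: pinned to L0
//	}
//
// metrics accumulates one LevelMetrics entry per distinct target level,
// feeding BytesIngested/TablesIngested into the DB metrics.
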
// maybeValidateSSTablesLocked adds the slice of newFileEntrys to the pending
// queue of files to be validated, when the feature is enabled.
// DB.mu must be locked when calling.
func (d *DB) maybeValidateSSTablesLocked(newFiles []newFileEntry) {
	// Only add to the validation queue when the feature is enabled.
	if !d.opts.Experimental.ValidateOnIngest {
		return
	}

	d.mu.tableValidation.pending = append(d.mu.tableValidation.pending, newFiles...)
	if d.shouldValidateSSTablesLocked() {
		go d.validateSSTables()
	}
}

// shouldValidateSSTablesLocked returns true if SSTable validation should run.
// DB.mu must be locked when calling.
func (d *DB) shouldValidateSSTablesLocked() bool {
	return !d.mu.tableValidation.validating &&
		d.closed.Load() == nil &&
		d.opts.Experimental.ValidateOnIngest &&
		len(d.mu.tableValidation.pending) > 0
}

// validateSSTables runs a round of validation on the tables in the pending
// queue.
func (d *DB) validateSSTables() {
	d.mu.Lock()
	if !d.shouldValidateSSTablesLocked() {
		d.mu.Unlock()
		return
	}

	pending := d.mu.tableValidation.pending
	d.mu.tableValidation.pending = nil
	d.mu.tableValidation.validating = true
	jobID := d.mu.nextJobID
	d.mu.nextJobID++
	rs := d.loadReadState()

	// Drop DB.mu before performing IO.
	d.mu.Unlock()

	// Validate all tables in the pending queue. This could lead to a situation
	// where we are starving IO from other tasks due to having to page through
	// all the blocks in all the sstables in the queue.
	// TODO(travers): Add some form of pacing to avoid IO starvation.
	for _, f := range pending {
		// The file may have been moved or deleted since it was ingested, in
		// which case we skip.
		if !rs.current.Contains(f.Level, d.cmp, f.Meta) {
			// Assume the file was moved to a lower level. It is rare enough
			// that a table is moved or deleted between the time it was ingested
			// and the time the validation routine runs that the overall cost of
			// this inner loop is tolerably low, when amortized over all
			// ingested tables.
			found := false
			for i := f.Level + 1; i < numLevels; i++ {
				if rs.current.Contains(i, d.cmp, f.Meta) {
					found = true
					break
				}
			}
			if !found {
				continue
			}
		}

		err := d.tableCache.withReader(f.Meta, func(r *sstable.Reader) error {
			return r.ValidateBlockChecksums()
		})
		if err != nil {
			// TODO(travers): Hook into the corruption reporting pipeline, once
			// available. See bitalostable#1192.
			d.opts.Logger.Fatalf("bitalostable: encountered corruption during ingestion: %s", err)
		}

		d.opts.EventListener.TableValidated(TableValidatedInfo{
			JobID: jobID,
			Meta:  f.Meta,
		})
	}
	rs.unref()

	d.mu.Lock()
	defer d.mu.Unlock()
	d.mu.tableValidation.validating = false
	d.mu.tableValidation.cond.Broadcast()
	if d.shouldValidateSSTablesLocked() {
		go d.validateSSTables()
	}
}
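
// Validation is opt-in. A minimal caller-side sketch of enabling it (the
// option field is the one consulted above; Open's signature follows this
// package's Pebble lineage and is an assumption here):
//
//	opts := &bitalostable.Options{}
//	opts.Experimental.ValidateOnIngest = true
//	db, err := bitalostable.Open("demo-db", opts)
//	if err != nil { /* handle error */ }
//	// After each Ingest, the new tables are queued and a background
//	// goroutine runs ValidateBlockChecksums over them.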