// github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/internal/manifest/version.go

// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package manifest

import (
	"bytes"
	"fmt"
	"sort"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"unicode"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/invariants"
)

// Compare exports the base.Compare type.
type Compare = base.Compare

// InternalKey exports the base.InternalKey type.
type InternalKey = base.InternalKey

// TableInfo contains the common information for table related events.
type TableInfo struct {
	// FileNum is the internal DB identifier for the table.
	FileNum base.FileNum
	// Size is the size of the file in bytes.
	Size uint64
	// Smallest is the smallest internal key in the table.
	Smallest InternalKey
	// Largest is the largest internal key in the table.
	Largest InternalKey
	// SmallestSeqNum is the smallest sequence number in the table.
	SmallestSeqNum uint64
	// LargestSeqNum is the largest sequence number in the table.
	LargestSeqNum uint64
}

// TableStats contains statistics on a table used for compaction heuristics,
// and export via Metrics.
type TableStats struct {
	// NumEntries is the total number of entries in the table.
	NumEntries uint64
	// NumDeletions is the number of point and range deletion entries in the
	// table.
	NumDeletions uint64
	// NumRangeKeySets is the total number of range key sets in the table.
	//
	// NB: If there's a chance that the sstable contains any range key sets,
	// then NumRangeKeySets must be > 0.
	NumRangeKeySets uint64
	// PointDeletionsBytesEstimate is an estimate of the total disk space that
	// may be dropped by this table's point deletions by compacting them.
	PointDeletionsBytesEstimate uint64
	// RangeDeletionsBytesEstimate is an estimate of the total disk space that
	// may be dropped by this table's range deletions by compacting them. This
	// estimate is at data-block granularity and is not updated if compactions
	// beneath the table reduce the amount of reclaimable disk space. It also
	// does not account for overlapping data in L0 and ignores L0 sublevels,
	// but the error that introduces is expected to be small.
	//
	// Tables in the bottommost level of the LSM may have a nonzero estimate if
	// snapshots or move compactions prevented the elision of their range
	// tombstones. A table in the bottommost level that was ingested into L6
	// will have a zero estimate, because the file's sequence numbers indicate
	// that the tombstone cannot drop any data contained within the file itself.
	RangeDeletionsBytesEstimate uint64
	// ValueBlocksSize is the total size of value blocks and value index block.
	ValueBlocksSize uint64
}

// boundType represents the type of key (point or range) present as the smallest
// and largest keys.
type boundType uint8

// The declared bound types start at 1 (iota + 1), so the zero value of
// boundType matches neither constant; boundsMarker reports any value other
// than these two as a corruption error.
const (
	boundTypePointKey boundType = iota + 1
	boundTypeRangeKey
)

// CompactionState is the compaction state of a file.
//
// The following shows the valid state transitions:
//
//	NotCompacting --> Compacting --> Compacted
//	      ^               |
//	      |               |
//	      +-------<-------+
//
// Input files to a compaction transition to Compacting when a compaction is
// picked. A file that has finished compacting typically transitions into the
// Compacted state, at which point it is effectively obsolete ("zombied") and
// will eventually be removed from the LSM. A file that has been move-compacted
// will transition from Compacting back into the NotCompacting state, signaling
// that the file may be selected for a subsequent compaction. A failed
// compaction will result in all input tables transitioning from Compacting to
// NotCompacting.
//
// This state is in-memory only. It is not persisted to the manifest.
type CompactionState uint8

// The CompactionState values, in the order a file normally moves through them.
const (
	CompactionStateNotCompacting CompactionState = iota
	CompactionStateCompacting
	CompactionStateCompacted
)

// String implements fmt.Stringer, returning the state's name without the
// "CompactionState" prefix. It panics if s is not one of the declared states.
func (s CompactionState) String() string {
	// Lookup table keyed by the state constants themselves.
	names := [...]string{
		CompactionStateNotCompacting: "NotCompacting",
		CompactionStateCompacting:    "Compacting",
		CompactionStateCompacted:     "Compacted",
	}
	if int(s) >= len(names) {
		panic(fmt.Sprintf("pebble: unknown compaction state %d", s))
	}
	return names[s]
}

// FileMetadata is maintained for leveled-ssts, i.e., they belong to a level of
// some version. FileMetadata does not contain the actual level of the sst,
// since such leveled-ssts can move across levels in different versions, while
// sharing the same FileMetadata. There are two kinds of leveled-ssts, physical
// and virtual. Underlying both leveled-ssts is a backing-sst, for which the
// only state is FileBacking. A backing-sst is level-less. It is possible for a
// backing-sst to be referred to by a physical sst in one version and by one or
// more virtual ssts in one or more versions. A backing-sst becomes obsolete
// and can be deleted once it is no longer required by any physical or virtual
// sst in any version.
//
// We maintain some invariants:
//
//  1. Each physical and virtual sst will have a unique FileMetadata.FileNum,
//     and there will be exactly one FileMetadata associated with the FileNum.
//
//  2. Within a version, a backing-sst is either only referred to by one
//     physical sst or one or more virtual ssts.
//
//  3. Once a backing-sst is referred to by a virtual sst in the latest version,
//     it cannot go back to being referred to by a physical sst in any future
//     version.
//
// Once a physical sst is no longer needed by any version, we will no longer
// maintain the file metadata associated with it. We will still maintain the
// FileBacking associated with the physical sst if the backing sst is required
// by any virtual ssts in any version.
type FileMetadata struct {
	// AllowedSeeks is used to determine if a file should be picked for
	// a read triggered compaction. It is decremented when read sampling
	// in pebble.Iterator after every positioning operation
	// that returns a user key (eg. Next, Prev, SeekGE, SeekLT, etc).
	AllowedSeeks atomic.Int64

	// statsValid indicates if stats have been loaded for the table. The
	// TableStats structure is populated only if statsValid is true.
	statsValid atomic.Bool

	// FileBacking is the state which backs either a physical or virtual
	// sstables.
	FileBacking *FileBacking

	// InitAllowedSeeks is the initial value of allowed seeks. This is used
	// to re-set allowed seeks on a file once it hits 0.
	InitAllowedSeeks int64
	// FileNum is the file number.
	//
	// INVARIANT: when !FileMetadata.Virtual, FileNum == FileBacking.DiskFileNum.
	FileNum base.FileNum
	// Size is the size of the file, in bytes. Size is an approximate value for
	// virtual sstables.
	//
	// INVARIANTS:
	// - When !FileMetadata.Virtual, Size == FileBacking.Size.
	// - Size should be non-zero. Size 0 virtual sstables must not be created.
	Size uint64
	// File creation time in seconds since the epoch (1970-01-01 00:00:00
	// UTC). For ingested sstables, this corresponds to the time the file was
	// ingested. For virtual sstables, this corresponds to the wall clock time
	// when the FileMetadata for the virtual sstable was first created.
	CreationTime int64
	// Lower and upper bounds for the smallest and largest sequence numbers in
	// the table, across both point and range keys. For physical sstables, these
	// values are tight bounds. For virtual sstables, there is no guarantee that
	// there will be keys with SmallestSeqNum or LargestSeqNum within virtual
	// sstable bounds.
	SmallestSeqNum uint64
	LargestSeqNum  uint64
	// SmallestPointKey and LargestPointKey are the inclusive bounds for the
	// internal point keys stored in the table. This includes RANGEDELs, which
	// alter point keys.
	// NB: these fields should be set using ExtendPointKeyBounds. They are left
	// exported for reads as an optimization.
	SmallestPointKey InternalKey
	LargestPointKey  InternalKey
	// SmallestRangeKey and LargestRangeKey are the inclusive bounds for the
	// internal range keys stored in the table.
	// NB: these fields should be set using ExtendRangeKeyBounds. They are left
	// exported for reads as an optimization.
	SmallestRangeKey InternalKey
	LargestRangeKey  InternalKey
	// Smallest and Largest are the inclusive bounds for the internal keys stored
	// in the table, across both point and range keys.
	// NB: these fields are derived from their point and range key equivalents,
	// and are updated via the Extend{Point,Range}KeyBounds methods.
	Smallest InternalKey
	Largest  InternalKey
	// Stats describe table statistics. Protected by DB.mu.
	//
	// For virtual sstables, set stats upon virtual sstable creation as
	// asynchronous computation of stats is not currently supported.
	//
	// TODO(bananabrick): To support manifest replay for virtual sstables, we
	// probably need to compute virtual sstable stats asynchronously. Otherwise,
	// we'd have to write virtual sstable stats to the version edit.
	Stats TableStats

	// For L0 files only. Protected by DB.mu. Used to generate L0 sublevels and
	// pick L0 compactions. Only accurate for the most recent Version.
	SubLevel         int
	L0Index          int
	minIntervalIndex int
	maxIntervalIndex int

	// NB: the alignment of this struct is 8 bytes. We pack all the bools to
	// ensure an optimal packing.

	// IsIntraL0Compacting is set to True if this file is part of an intra-L0
	// compaction. When it's true, IsCompacting must also return true. If
	// IsCompacting is true and IsIntraL0Compacting is false for an L0 file, the
	// file must be part of a compaction to Lbase.
	IsIntraL0Compacting bool
	// CompactionState is the file's in-memory compaction state. It is changed
	// through SetCompactionState.
	CompactionState CompactionState
	// True if compaction of this file has been explicitly requested.
	// Previously, RocksDB and earlier versions of Pebble allowed this
	// flag to be set by a user table property collector. Some earlier
	// versions of Pebble respected this flag, while other more recent
	// versions ignored this flag.
	//
	// More recently this flag has been repurposed to facilitate the
	// compaction of 'atomic compaction units'. Files marked for
	// compaction are compacted in a rewrite compaction at the lowest
	// possible compaction priority.
	//
	// NB: A count of files marked for compaction is maintained on
	// Version, and compaction picking reads cached annotations
	// determined by this field.
	//
	// Protected by DB.mu.
	MarkedForCompaction bool
	// HasPointKeys tracks whether the table contains point keys (including
	// RANGEDELs). If a table contains only range deletions, HasPointKeys is
	// still true.
	HasPointKeys bool
	// HasRangeKeys tracks whether the table contains any range keys.
	HasRangeKeys bool
	// boundsSet tracks whether the overall bounds (Smallest and Largest) have
	// been set.
	boundsSet bool
	// boundTypeSmallest and boundTypeLargest provide an indication as to which
	// key type (point or range) corresponds to the smallest and largest overall
	// table bounds.
	boundTypeSmallest, boundTypeLargest boundType
	// Virtual is true if the FileMetadata belongs to a virtual sstable.
	Virtual bool
}

// PhysicalFileMeta is used by functions which want a guarantee that their input
// belongs to a physical sst and not a virtual sst.
//
// NB: This type should only be constructed by calling
// FileMetadata.PhysicalMeta.
type PhysicalFileMeta struct {
	*FileMetadata
}

// VirtualFileMeta is used by functions which want a guarantee that their input
// belongs to a virtual sst and not a physical sst.
//
// A VirtualFileMeta inherits all the same fields as a FileMetadata. These
// fields have additional invariants imposed on them, and/or slightly varying
// meanings:
//   - Smallest and Largest (and their counterparts
//     {Smallest, Largest}{Point,Range}Key) remain tight bounds that represent a
//     key at that exact bound. We make the effort to determine the next smallest
//     or largest key in an sstable after virtualizing it, to maintain this
//     tightness. If the largest is a sentinel key (IsExclusiveSentinel()), it
//     could mean that a rangedel or range key ends at that user key, or has been
//     truncated to that user key.
//   - One invariant is that if a rangedel or range key is truncated on its
//     upper bound, the virtual sstable *must* have a rangedel or range key
//     sentinel key as its upper bound. This is because truncation yields
//     an exclusive upper bound for the rangedel/rangekey, and if there are
//     any points at that exclusive upper bound within the same virtual
//     sstable, those could get uncovered by this truncation. We enforce this
//     invariant in calls to keyspan.Truncate.
//   - Size is an estimate of the size of the virtualized portion of this sstable.
303 // The underlying file's size is stored in FileBacking.Size, though it could 304 // also be estimated or could correspond to just the referenced portion of 305 // a file (eg. if the file originated on another node). 306 // - Size must be > 0. 307 // - SmallestSeqNum and LargestSeqNum are loose bounds for virtual sstables. 308 // This means that all keys in the virtual sstable must have seqnums within 309 // [SmallestSeqNum, LargestSeqNum], however there's no guarantee that there's 310 // a key with a seqnum at either of the bounds. Calculating tight seqnum 311 // bounds would be too expensive and deliver little value. 312 // 313 // NB: This type should only be constructed by calling FileMetadata.VirtualMeta. 314 type VirtualFileMeta struct { 315 *FileMetadata 316 } 317 318 // PhysicalMeta should be the only source of creating the PhysicalFileMeta 319 // wrapper type. 320 func (m *FileMetadata) PhysicalMeta() PhysicalFileMeta { 321 if m.Virtual { 322 panic("pebble: file metadata does not belong to a physical sstable") 323 } 324 return PhysicalFileMeta{ 325 m, 326 } 327 } 328 329 // VirtualMeta should be the only source of creating the VirtualFileMeta wrapper 330 // type. 331 func (m *FileMetadata) VirtualMeta() VirtualFileMeta { 332 if !m.Virtual { 333 panic("pebble: file metadata does not belong to a virtual sstable") 334 } 335 return VirtualFileMeta{ 336 m, 337 } 338 } 339 340 // FileBacking either backs a single physical sstable, or one or more virtual 341 // sstables. 342 // 343 // See the comment above the FileMetadata type for sstable terminology. 344 type FileBacking struct { 345 // Reference count for the backing file on disk: incremented when a 346 // physical or virtual sstable which is backed by the FileBacking is 347 // added to a version and decremented when the version is unreferenced. 348 // We ref count in order to determine when it is safe to delete a 349 // backing sst file from disk. 
The backing file is obsolete when the 350 // reference count falls to zero. 351 refs atomic.Int32 352 // latestVersionRefs are the references to the FileBacking in the 353 // latest version. This reference can be through a single physical 354 // sstable in the latest version, or one or more virtual sstables in the 355 // latest version. 356 // 357 // INVARIANT: latestVersionRefs <= refs. 358 latestVersionRefs atomic.Int32 359 // VirtualizedSize is set iff the backing sst is only referred to by 360 // virtual ssts in the latest version. VirtualizedSize is the sum of the 361 // virtual sstable sizes of all of the virtual sstables in the latest 362 // version which are backed by the physical sstable. When a virtual 363 // sstable is removed from the latest version, we will decrement the 364 // VirtualizedSize. During compaction picking, we'll compensate a 365 // virtual sstable file size by 366 // (FileBacking.Size - FileBacking.VirtualizedSize) / latestVersionRefs. 367 // The intuition is that if FileBacking.Size - FileBacking.VirtualizedSize 368 // is high, then the space amplification due to virtual sstables is 369 // high, and we should pick the virtual sstable with a higher priority. 370 // 371 // TODO(bananabrick): Compensate the virtual sstable file size using 372 // the VirtualizedSize during compaction picking and test. 373 VirtualizedSize atomic.Uint64 374 DiskFileNum base.DiskFileNum 375 Size uint64 376 } 377 378 // InitPhysicalBacking allocates and sets the FileBacking which is required by a 379 // physical sstable FileMetadata. 380 // 381 // Ensure that the state required by FileBacking, such as the FileNum, is 382 // already set on the FileMetadata before InitPhysicalBacking is called. 383 // Calling InitPhysicalBacking only after the relevant state has been set in the 384 // FileMetadata is not necessary in tests which don't rely on FileBacking. 
385 func (m *FileMetadata) InitPhysicalBacking() { 386 if m.Virtual { 387 panic("pebble: virtual sstables should use a pre-existing FileBacking") 388 } 389 if m.FileBacking == nil { 390 m.FileBacking = &FileBacking{Size: m.Size, DiskFileNum: m.FileNum.DiskFileNum()} 391 } 392 } 393 394 // InitProviderBacking creates a new FileBacking for a file backed by 395 // an objstorage.Provider. 396 func (m *FileMetadata) InitProviderBacking(fileNum base.DiskFileNum) { 397 if !m.Virtual { 398 panic("pebble: provider-backed sstables must be virtual") 399 } 400 if m.FileBacking == nil { 401 m.FileBacking = &FileBacking{DiskFileNum: fileNum} 402 } 403 } 404 405 // ValidateVirtual should be called once the FileMetadata for a virtual sstable 406 // is created to verify that the fields of the virtual sstable are sound. 407 func (m *FileMetadata) ValidateVirtual(createdFrom *FileMetadata) { 408 if !m.Virtual { 409 panic("pebble: invalid virtual sstable") 410 } 411 412 if createdFrom.SmallestSeqNum != m.SmallestSeqNum { 413 panic("pebble: invalid smallest sequence number for virtual sstable") 414 } 415 416 if createdFrom.LargestSeqNum != m.LargestSeqNum { 417 panic("pebble: invalid largest sequence number for virtual sstable") 418 } 419 420 if createdFrom.FileBacking != nil && createdFrom.FileBacking != m.FileBacking { 421 panic("pebble: invalid physical sstable state for virtual sstable") 422 } 423 424 if m.Size == 0 { 425 panic("pebble: virtual sstable size must be set upon creation") 426 } 427 } 428 429 // Refs returns the refcount of backing sstable. 430 func (m *FileMetadata) Refs() int32 { 431 return m.FileBacking.refs.Load() 432 } 433 434 // Ref increments the ref count associated with the backing sstable. 435 func (m *FileMetadata) Ref() { 436 m.FileBacking.refs.Add(1) 437 } 438 439 // Unref decrements the ref count associated with the backing sstable. 
440 func (m *FileMetadata) Unref() int32 { 441 v := m.FileBacking.refs.Add(-1) 442 if invariants.Enabled && v < 0 { 443 panic("pebble: invalid FileMetadata refcounting") 444 } 445 return v 446 } 447 448 // LatestRef increments the latest ref count associated with the backing 449 // sstable. 450 func (m *FileMetadata) LatestRef() { 451 m.FileBacking.latestVersionRefs.Add(1) 452 453 if m.Virtual { 454 m.FileBacking.VirtualizedSize.Add(m.Size) 455 } 456 } 457 458 // LatestUnref decrements the latest ref count associated with the backing 459 // sstable. 460 func (m *FileMetadata) LatestUnref() int32 { 461 if m.Virtual { 462 m.FileBacking.VirtualizedSize.Add(-m.Size) 463 } 464 465 v := m.FileBacking.latestVersionRefs.Add(-1) 466 if invariants.Enabled && v < 0 { 467 panic("pebble: invalid FileMetadata latest refcounting") 468 } 469 return v 470 } 471 472 // LatestRefs returns the latest ref count associated with the backing sstable. 473 func (m *FileMetadata) LatestRefs() int32 { 474 return m.FileBacking.latestVersionRefs.Load() 475 } 476 477 // SetCompactionState transitions this file's compaction state to the given 478 // state. Protected by DB.mu. 
479 func (m *FileMetadata) SetCompactionState(to CompactionState) { 480 if invariants.Enabled { 481 transitionErr := func() error { 482 return errors.Newf("pebble: invalid compaction state transition: %s -> %s", m.CompactionState, to) 483 } 484 switch m.CompactionState { 485 case CompactionStateNotCompacting: 486 if to != CompactionStateCompacting { 487 panic(transitionErr()) 488 } 489 case CompactionStateCompacting: 490 if to != CompactionStateCompacted && to != CompactionStateNotCompacting { 491 panic(transitionErr()) 492 } 493 case CompactionStateCompacted: 494 panic(transitionErr()) 495 default: 496 panic(fmt.Sprintf("pebble: unknown compaction state: %d", m.CompactionState)) 497 } 498 } 499 m.CompactionState = to 500 } 501 502 // IsCompacting returns true if this file's compaction state is 503 // CompactionStateCompacting. Protected by DB.mu. 504 func (m *FileMetadata) IsCompacting() bool { 505 return m.CompactionState == CompactionStateCompacting 506 } 507 508 // StatsValid returns true if the table stats have been populated. If StatValid 509 // returns true, the Stats field may be read (with or without holding the 510 // database mutex). 511 func (m *FileMetadata) StatsValid() bool { 512 return m.statsValid.Load() 513 } 514 515 // StatsMarkValid marks the TableStats as valid. The caller must hold DB.mu 516 // while populating TableStats and calling StatsMarkValud. Once stats are 517 // populated, they must not be mutated. 518 func (m *FileMetadata) StatsMarkValid() { 519 m.statsValid.Store(true) 520 } 521 522 // ExtendPointKeyBounds attempts to extend the lower and upper point key bounds 523 // and overall table bounds with the given smallest and largest keys. The 524 // smallest and largest bounds may not be extended if the table already has a 525 // bound that is smaller or larger, respectively. The receiver is returned. 
526 // NB: calling this method should be preferred to manually setting the bounds by 527 // manipulating the fields directly, to maintain certain invariants. 528 func (m *FileMetadata) ExtendPointKeyBounds( 529 cmp Compare, smallest, largest InternalKey, 530 ) *FileMetadata { 531 // Update the point key bounds. 532 if !m.HasPointKeys { 533 m.SmallestPointKey, m.LargestPointKey = smallest, largest 534 m.HasPointKeys = true 535 } else { 536 if base.InternalCompare(cmp, smallest, m.SmallestPointKey) < 0 { 537 m.SmallestPointKey = smallest 538 } 539 if base.InternalCompare(cmp, largest, m.LargestPointKey) > 0 { 540 m.LargestPointKey = largest 541 } 542 } 543 // Update the overall bounds. 544 m.extendOverallBounds(cmp, m.SmallestPointKey, m.LargestPointKey, boundTypePointKey) 545 return m 546 } 547 548 // ExtendRangeKeyBounds attempts to extend the lower and upper range key bounds 549 // and overall table bounds with the given smallest and largest keys. The 550 // smallest and largest bounds may not be extended if the table already has a 551 // bound that is smaller or larger, respectively. The receiver is returned. 552 // NB: calling this method should be preferred to manually setting the bounds by 553 // manipulating the fields directly, to maintain certain invariants. 554 func (m *FileMetadata) ExtendRangeKeyBounds( 555 cmp Compare, smallest, largest InternalKey, 556 ) *FileMetadata { 557 // Update the range key bounds. 558 if !m.HasRangeKeys { 559 m.SmallestRangeKey, m.LargestRangeKey = smallest, largest 560 m.HasRangeKeys = true 561 } else { 562 if base.InternalCompare(cmp, smallest, m.SmallestRangeKey) < 0 { 563 m.SmallestRangeKey = smallest 564 } 565 if base.InternalCompare(cmp, largest, m.LargestRangeKey) > 0 { 566 m.LargestRangeKey = largest 567 } 568 } 569 // Update the overall bounds. 
570 m.extendOverallBounds(cmp, m.SmallestRangeKey, m.LargestRangeKey, boundTypeRangeKey) 571 return m 572 } 573 574 // extendOverallBounds attempts to extend the overall table lower and upper 575 // bounds. The given bounds may not be used if a lower or upper bound already 576 // exists that is smaller or larger than the given keys, respectively. The given 577 // boundType will be used if the bounds are updated. 578 func (m *FileMetadata) extendOverallBounds( 579 cmp Compare, smallest, largest InternalKey, bTyp boundType, 580 ) { 581 if !m.boundsSet { 582 m.Smallest, m.Largest = smallest, largest 583 m.boundsSet = true 584 m.boundTypeSmallest, m.boundTypeLargest = bTyp, bTyp 585 } else { 586 if base.InternalCompare(cmp, smallest, m.Smallest) < 0 { 587 m.Smallest = smallest 588 m.boundTypeSmallest = bTyp 589 } 590 if base.InternalCompare(cmp, largest, m.Largest) > 0 { 591 m.Largest = largest 592 m.boundTypeLargest = bTyp 593 } 594 } 595 } 596 597 // Overlaps returns true if the file key range overlaps with the given range. 598 func (m *FileMetadata) Overlaps(cmp Compare, start []byte, end []byte, exclusiveEnd bool) bool { 599 if c := cmp(m.Largest.UserKey, start); c < 0 || (c == 0 && m.Largest.IsExclusiveSentinel()) { 600 // f is completely before the specified range; no overlap. 601 return false 602 } 603 if c := cmp(m.Smallest.UserKey, end); c > 0 || (c == 0 && exclusiveEnd) { 604 // f is completely after the specified range; no overlap. 605 return false 606 } 607 return true 608 } 609 610 // ContainedWithinSpan returns true if the file key range completely overlaps with the 611 // given range ("end" is assumed to exclusive). 
612 func (m *FileMetadata) ContainedWithinSpan(cmp Compare, start, end []byte) bool { 613 lowerCmp, upperCmp := cmp(m.Smallest.UserKey, start), cmp(m.Largest.UserKey, end) 614 return lowerCmp >= 0 && (upperCmp < 0 || (upperCmp == 0 && m.Largest.IsExclusiveSentinel())) 615 } 616 617 // ContainsKeyType returns whether or not the file contains keys of the provided 618 // type. 619 func (m *FileMetadata) ContainsKeyType(kt KeyType) bool { 620 switch kt { 621 case KeyTypePointAndRange: 622 return true 623 case KeyTypePoint: 624 return m.HasPointKeys 625 case KeyTypeRange: 626 return m.HasRangeKeys 627 default: 628 panic("unrecognized key type") 629 } 630 } 631 632 // SmallestBound returns the file's smallest bound of the key type. It returns a 633 // false second return value if the file does not contain any keys of the key 634 // type. 635 func (m *FileMetadata) SmallestBound(kt KeyType) (*InternalKey, bool) { 636 switch kt { 637 case KeyTypePointAndRange: 638 return &m.Smallest, true 639 case KeyTypePoint: 640 return &m.SmallestPointKey, m.HasPointKeys 641 case KeyTypeRange: 642 return &m.SmallestRangeKey, m.HasRangeKeys 643 default: 644 panic("unrecognized key type") 645 } 646 } 647 648 // LargestBound returns the file's largest bound of the key type. It returns a 649 // false second return value if the file does not contain any keys of the key 650 // type. 
651 func (m *FileMetadata) LargestBound(kt KeyType) (*InternalKey, bool) { 652 switch kt { 653 case KeyTypePointAndRange: 654 return &m.Largest, true 655 case KeyTypePoint: 656 return &m.LargestPointKey, m.HasPointKeys 657 case KeyTypeRange: 658 return &m.LargestRangeKey, m.HasRangeKeys 659 default: 660 panic("unrecognized key type") 661 } 662 } 663 664 const ( 665 maskContainsPointKeys = 1 << 0 666 maskSmallest = 1 << 1 667 maskLargest = 1 << 2 668 ) 669 670 // boundsMarker returns a marker byte whose bits encode the following 671 // information (in order from least significant bit): 672 // - if the table contains point keys 673 // - if the table's smallest key is a point key 674 // - if the table's largest key is a point key 675 func (m *FileMetadata) boundsMarker() (sentinel uint8, err error) { 676 if m.HasPointKeys { 677 sentinel |= maskContainsPointKeys 678 } 679 switch m.boundTypeSmallest { 680 case boundTypePointKey: 681 sentinel |= maskSmallest 682 case boundTypeRangeKey: 683 // No op - leave bit unset. 684 default: 685 return 0, base.CorruptionErrorf("file %s has neither point nor range key as smallest key", m.FileNum) 686 } 687 switch m.boundTypeLargest { 688 case boundTypePointKey: 689 sentinel |= maskLargest 690 case boundTypeRangeKey: 691 // No op - leave bit unset. 692 default: 693 return 0, base.CorruptionErrorf("file %s has neither point nor range key as largest key", m.FileNum) 694 } 695 return 696 } 697 698 // String implements fmt.Stringer, printing the file number and the overall 699 // table bounds. 700 func (m *FileMetadata) String() string { 701 return fmt.Sprintf("%s:[%s-%s]", m.FileNum, m.Smallest, m.Largest) 702 } 703 704 // DebugString returns a verbose representation of FileMetadata, typically for 705 // use in tests and debugging, returning the file number and the point, range 706 // and overall bounds for the table. 
func (m *FileMetadata) DebugString(format base.FormatKey, verbose bool) string {
	var b bytes.Buffer
	// The non-verbose form is just "<fileNum>:[<smallest>-<largest>]".
	fmt.Fprintf(&b, "%s:[%s-%s]",
		m.FileNum, m.Smallest.Pretty(format), m.Largest.Pretty(format))
	if !verbose {
		return b.String()
	}
	// The verbose form appends the seqnum bounds, then the point and range
	// key bounds for whichever key types the file has.
	fmt.Fprintf(&b, " seqnums:[%d-%d]", m.SmallestSeqNum, m.LargestSeqNum)
	if m.HasPointKeys {
		fmt.Fprintf(&b, " points:[%s-%s]",
			m.SmallestPointKey.Pretty(format), m.LargestPointKey.Pretty(format))
	}
	if m.HasRangeKeys {
		fmt.Fprintf(&b, " ranges:[%s-%s]",
			m.SmallestRangeKey.Pretty(format), m.LargestRangeKey.Pretty(format))
	}
	return b.String()
}

// ParseFileMetadataDebug parses a FileMetadata from its DebugString
// representation.
//
// The input is split into fields that are consumed in groups of three: a
// prefix ("seqnums", "points", "ranges", or otherwise a file number) followed
// by two values. If the number of fields is not a multiple of three, a nil
// FileMetadata and an error are returned. If a number fails to parse, the
// error is returned together with the partially populated FileMetadata.
func ParseFileMetadataDebug(s string) (*FileMetadata, error) {
	// Split lines of the form:
	//  000000:[a#0,SET-z#0,SET] seqnums:[5-5] points:[...] ranges:[...]
	fields := strings.FieldsFunc(s, func(c rune) bool {
		switch c {
		case ':', '[', '-', ']':
			return true
		default:
			return unicode.IsSpace(c) // NB: also trim whitespace padding.
		}
	})
	if len(fields)%3 != 0 {
		return nil, errors.Newf("malformed input: %s", s)
	}
	m := &FileMetadata{}
	for len(fields) > 0 {
		prefix := fields[0]
		// The seqnums group holds two integers rather than two keys, so it is
		// handled before the key parsing below.
		if prefix == "seqnums" {
			smallestSeqNum, err := strconv.ParseUint(fields[1], 10, 64)
			if err != nil {
				return m, errors.Newf("malformed input: %s: %s", s, err)
			}
			largestSeqNum, err := strconv.ParseUint(fields[2], 10, 64)
			if err != nil {
				return m, errors.Newf("malformed input: %s: %s", s, err)
			}
			m.SmallestSeqNum, m.LargestSeqNum = smallestSeqNum, largestSeqNum
			fields = fields[3:]
			continue
		}
		smallest := base.ParsePrettyInternalKey(fields[1])
		largest := base.ParsePrettyInternalKey(fields[2])
		switch prefix {
		case "points":
			m.SmallestPointKey, m.LargestPointKey = smallest, largest
			m.HasPointKeys = true
		case "ranges":
			m.SmallestRangeKey, m.LargestRangeKey = smallest, largest
			m.HasRangeKeys = true
		default:
			// Any other prefix is taken to be the file number, and the two
			// keys are the overall bounds.
			fileNum, err := strconv.ParseUint(prefix, 10, 64)
			if err != nil {
				return m, errors.Newf("malformed input: %s: %s", s, err)
			}
			m.FileNum = base.FileNum(fileNum)
			m.Smallest, m.Largest = smallest, largest
			m.boundsSet = true
		}
		fields = fields[3:]
	}
	// By default, when the parser sees just the overall bounds, we set the point
	// keys. This preserves backwards compatibility with existing test cases that
	// specify only the overall bounds.
	if !m.HasPointKeys && !m.HasRangeKeys {
		m.SmallestPointKey, m.LargestPointKey = m.Smallest, m.Largest
		m.HasPointKeys = true
	}
	m.InitPhysicalBacking()
	return m, nil
}

// Validate validates the metadata for consistency with itself, returning an
// error if inconsistent.
func (m *FileMetadata) Validate(cmp Compare, formatKey base.FormatKey) error {
	// Combined range and point key validation.

	if !m.HasPointKeys && !m.HasRangeKeys {
		return base.CorruptionErrorf("file %s has neither point nor range keys",
			errors.Safe(m.FileNum))
	}
	if base.InternalCompare(cmp, m.Smallest, m.Largest) > 0 {
		return base.CorruptionErrorf("file %s has inconsistent bounds: %s vs %s",
			errors.Safe(m.FileNum), m.Smallest.Pretty(formatKey),
			m.Largest.Pretty(formatKey))
	}
	if m.SmallestSeqNum > m.LargestSeqNum {
		return base.CorruptionErrorf("file %s has inconsistent seqnum bounds: %d vs %d",
			errors.Safe(m.FileNum), m.SmallestSeqNum, m.LargestSeqNum)
	}

	// Point key validation.

	if m.HasPointKeys {
		if base.InternalCompare(cmp, m.SmallestPointKey, m.LargestPointKey) > 0 {
			return base.CorruptionErrorf("file %s has inconsistent point key bounds: %s vs %s",
				errors.Safe(m.FileNum), m.SmallestPointKey.Pretty(formatKey),
				m.LargestPointKey.Pretty(formatKey))
		}
		// The point key bounds must lie within the overall bounds.
		if base.InternalCompare(cmp, m.SmallestPointKey, m.Smallest) < 0 ||
			base.InternalCompare(cmp, m.LargestPointKey, m.Largest) > 0 {
			return base.CorruptionErrorf(
				"file %s has inconsistent point key bounds relative to overall bounds: "+
					"overall = [%s-%s], point keys = [%s-%s]",
				errors.Safe(m.FileNum),
				m.Smallest.Pretty(formatKey), m.Largest.Pretty(formatKey),
				m.SmallestPointKey.Pretty(formatKey), m.LargestPointKey.Pretty(formatKey),
			)
		}
	}

	// Range key validation.

	if m.HasRangeKeys {
		if base.InternalCompare(cmp, m.SmallestRangeKey, m.LargestRangeKey) > 0 {
			return base.CorruptionErrorf("file %s has inconsistent range key bounds: %s vs %s",
				errors.Safe(m.FileNum), m.SmallestRangeKey.Pretty(formatKey),
				m.LargestRangeKey.Pretty(formatKey))
		}
		// The range key bounds must lie within the overall bounds.
		if base.InternalCompare(cmp, m.SmallestRangeKey, m.Smallest) < 0 ||
			base.InternalCompare(cmp, m.LargestRangeKey, m.Largest) > 0 {
			return base.CorruptionErrorf(
				"file %s has inconsistent range key bounds relative to overall bounds: "+
					"overall = [%s-%s], range keys = [%s-%s]",
				errors.Safe(m.FileNum),
				m.Smallest.Pretty(formatKey), m.Largest.Pretty(formatKey),
				m.SmallestRangeKey.Pretty(formatKey), m.LargestRangeKey.Pretty(formatKey),
			)
		}
	}

	// Ensure that a FileBacking has been attached (InitPhysicalBacking and
	// InitProviderBacking are the methods in this file that set it).
	if m.FileBacking == nil {
		return base.CorruptionErrorf("file metadata FileBacking not set")
	}

	return nil
}

// TableInfo returns a subset of the FileMetadata state formatted as a
// TableInfo.
func (m *FileMetadata) TableInfo() TableInfo {
	return TableInfo{
		FileNum:        m.FileNum,
		Size:           m.Size,
		Smallest:       m.Smallest,
		Largest:        m.Largest,
		SmallestSeqNum: m.SmallestSeqNum,
		LargestSeqNum:  m.LargestSeqNum,
	}
}

// cmpUint64 is a three-way comparison of a and b, returning -1, +1 or 0 when a
// is less than, greater than, or equal to b.
func cmpUint64(a, b uint64) int {
	switch {
	case a < b:
		return -1
	case a > b:
		return +1
	default:
		return 0
	}
}

func (m *FileMetadata) cmpSeqNum(b *FileMetadata) int {
	// NB: This is the same ordering that RocksDB uses for L0 files.

	// Sort first by largest sequence number.
	if m.LargestSeqNum != b.LargestSeqNum {
		return cmpUint64(m.LargestSeqNum, b.LargestSeqNum)
	}
	// Then by smallest sequence number.
	if m.SmallestSeqNum != b.SmallestSeqNum {
		return cmpUint64(m.SmallestSeqNum, b.SmallestSeqNum)
	}
	// Break ties by file number.
892 return cmpUint64(uint64(m.FileNum), uint64(b.FileNum)) 893 } 894 895 func (m *FileMetadata) lessSeqNum(b *FileMetadata) bool { 896 return m.cmpSeqNum(b) < 0 897 } 898 899 func (m *FileMetadata) cmpSmallestKey(b *FileMetadata, cmp Compare) int { 900 return base.InternalCompare(cmp, m.Smallest, b.Smallest) 901 } 902 903 // KeyRange returns the minimum smallest and maximum largest internalKey for 904 // all the FileMetadata in iters. 905 func KeyRange(ucmp Compare, iters ...LevelIterator) (smallest, largest InternalKey) { 906 first := true 907 for _, iter := range iters { 908 for meta := iter.First(); meta != nil; meta = iter.Next() { 909 if first { 910 first = false 911 smallest, largest = meta.Smallest, meta.Largest 912 continue 913 } 914 if base.InternalCompare(ucmp, smallest, meta.Smallest) >= 0 { 915 smallest = meta.Smallest 916 } 917 if base.InternalCompare(ucmp, largest, meta.Largest) <= 0 { 918 largest = meta.Largest 919 } 920 } 921 } 922 return smallest, largest 923 } 924 925 type bySeqNum []*FileMetadata 926 927 func (b bySeqNum) Len() int { return len(b) } 928 func (b bySeqNum) Less(i, j int) bool { 929 return b[i].lessSeqNum(b[j]) 930 } 931 func (b bySeqNum) Swap(i, j int) { b[i], b[j] = b[j], b[i] } 932 933 // SortBySeqNum sorts the specified files by increasing sequence number. 934 func SortBySeqNum(files []*FileMetadata) { 935 sort.Sort(bySeqNum(files)) 936 } 937 938 type bySmallest struct { 939 files []*FileMetadata 940 cmp Compare 941 } 942 943 func (b bySmallest) Len() int { return len(b.files) } 944 func (b bySmallest) Less(i, j int) bool { 945 return b.files[i].cmpSmallestKey(b.files[j], b.cmp) < 0 946 } 947 func (b bySmallest) Swap(i, j int) { b.files[i], b.files[j] = b.files[j], b.files[i] } 948 949 // SortBySmallest sorts the specified files by smallest key using the supplied 950 // comparison function to order user keys. 
func SortBySmallest(files []*FileMetadata, cmp Compare) {
	sort.Sort(bySmallest{files, cmp})
}

// overlaps returns the LevelSlice of files from iter whose user key bounds
// intersect [start, end]; the end bound is treated as exclusive when
// exclusiveEnd is true. It positions one clone of iter on the first
// overlapping file and another on the last, then bounds the returned slice by
// those two positions. iter itself is only cloned, never repositioned.
func overlaps(iter LevelIterator, cmp Compare, start, end []byte, exclusiveEnd bool) LevelSlice {
	startIter := iter.Clone()
	{
		startIterFile := startIter.SeekGE(cmp, start)
		// SeekGE compares user keys. The user key `start` may be equal to the
		// f.Largest because f.Largest is a range deletion sentinel, indicating
		// that the user key `start` is NOT contained within the file f. If
		// that's the case, we can narrow the overlapping bounds to exclude the
		// file with the sentinel.
		if startIterFile != nil && startIterFile.Largest.IsExclusiveSentinel() &&
			cmp(startIterFile.Largest.UserKey, start) == 0 {
			startIterFile = startIter.Next()
		}
		_ = startIterFile // Ignore unused assignment.
	}

	endIter := iter.Clone()
	{
		endIterFile := endIter.SeekGE(cmp, end)

		if !exclusiveEnd {
			// endIter is now pointing at the *first* file with a largest key >= end.
			// If there are multiple files including the user key `end`, we want all
			// of them, so move forward.
			for endIterFile != nil && cmp(endIterFile.Largest.UserKey, end) == 0 {
				endIterFile = endIter.Next()
			}
		}

		// LevelSlice uses inclusive bounds, so if we seeked to the end sentinel
		// or nexted too far because Largest.UserKey equaled `end`, go back.
		//
		// Consider !exclusiveEnd and end = 'f', with the following file bounds:
		//
		//     [b,d] [e, f] [f, f] [g, h]
		//
		// the above for loop will Next until it arrives at [g, h]. We need to
		// observe that g > f, and Prev to the file with bounds [f, f].
		if endIterFile == nil {
			endIterFile = endIter.Prev()
		} else if c := cmp(endIterFile.Smallest.UserKey, end); c > 0 || c == 0 && exclusiveEnd {
			endIterFile = endIter.Prev()
		}
		_ = endIterFile // Ignore unused assignment.
	}
	return newBoundedLevelSlice(startIter.Clone().iter, &startIter.iter, &endIter.iter)
}

// NumLevels is the number of levels a Version contains.
const NumLevels = 7

// NewVersion constructs a new Version with the provided files. It requires
// the provided files are already well-ordered. It's intended for testing.
//
// It panics if initializing the L0 sublevels fails.
func NewVersion(
	cmp Compare, formatKey base.FormatKey, flushSplitBytes int64, files [NumLevels][]*FileMetadata,
) *Version {
	var v Version
	for l := range files {
		// NB: We specifically insert `files` into the B-Tree in the order
		// they appear within `files`. Some tests depend on this behavior in
		// order to test consistency checking, etc. Once we've constructed the
		// initial B-Tree, we swap out the btreeCmp for the correct one.
		// TODO(jackson): Adjust or remove the tests and remove this.
		v.Levels[l].tree, _ = makeBTree(btreeCmpSpecificOrder(files[l]), files[l])
		v.Levels[l].level = l
		if l == 0 {
			v.Levels[l].tree.cmp = btreeCmpSeqNum
		} else {
			v.Levels[l].tree.cmp = btreeCmpSmallestKey(cmp)
		}
		// Accumulate the level's total size from its files.
		for _, f := range files[l] {
			v.Levels[l].totalSize += f.Size
		}
	}
	if err := v.InitL0Sublevels(cmp, formatKey, flushSplitBytes); err != nil {
		panic(err)
	}
	return &v
}

// Version is a collection of file metadata for on-disk tables at various
// levels. In-memory DBs are written to level-0 tables, and compactions
// migrate data from level N to level N+1. The tables map internal keys (which
// are a user key, a delete or set bit, and a sequence number) to user values.
//
// The tables at level 0 are sorted by largest sequence number. Due to file
// ingestion, there may be overlap in the ranges of sequence numbers contained
// in level 0 sstables. In particular, it is valid for one level 0 sstable to
// have the seqnum range [1,100] while an adjacent sstable has the seqnum range
// [50,50]. This occurs when the [50,50] table was ingested and given a global
// seqnum. The ingestion code will have ensured that the [50,50] sstable will
// not have any keys that overlap with the [1,100] in the seqnum range
// [1,49]. The range of internal keys [fileMetadata.smallest,
// fileMetadata.largest] in each level 0 table may overlap.
//
// The tables at any non-0 level are sorted by their internal key range and any
// two tables at the same non-0 level do not overlap.
//
// The internal key ranges of two tables at different levels X and Y may
// overlap, for any X != Y.
//
// Finally, for every internal key in a table at level X, there is no internal
// key in a higher level table that has both the same user key and a higher
// sequence number.
type Version struct {
	refs atomic.Int32

	// The level 0 sstables are organized in a series of sublevels. Similar to
	// the seqnum invariant in normal levels, there is no internal key in a
	// higher level table that has both the same user key and a higher sequence
	// number. Within a sublevel, tables are sorted by their internal key range
	// and any two tables at the same sublevel do not overlap. Unlike the normal
	// levels, sublevel n contains older tables (lower sequence numbers) than
	// sublevel n+1.
	//
	// The L0Sublevels struct is mostly used for compaction picking. As most
	// internal data structures in it are only necessary for compaction picking
	// and not for iterator creation, the reference to L0Sublevels is nil'd
	// after this version becomes the non-newest version, to reduce memory
	// usage.
	//
	// L0Sublevels.Levels contains L0 files ordered by sublevels. All the files
	// in Levels[0] are in L0Sublevels.Levels. L0SublevelFiles is also set to
	// a reference to that slice, as that slice is necessary for iterator
	// creation and needs to outlast L0Sublevels.
	L0Sublevels     *L0Sublevels
	L0SublevelFiles []LevelSlice

	Levels [NumLevels]LevelMetadata

	// RangeKeyLevels holds a subset of the same files as Levels that contain range
	// keys (i.e. fileMeta.HasRangeKeys == true). The memory amplification of this
	// duplication should be minimal, as range keys are expected to be rare.
	RangeKeyLevels [NumLevels]LevelMetadata

	// The callback to invoke when the last reference to a version is
	// removed. Will be called with list.mu held.
	Deleted func(obsolete []*FileBacking)

	// Stats holds aggregated stats about the version maintained from
	// version to version.
	Stats struct {
		// MarkedForCompaction records the count of files marked for
		// compaction within the version.
		MarkedForCompaction int
	}

	// The list the version is linked into.
	list *VersionList

	// The next/prev link for the versionList doubly-linked list of versions.
	prev, next *Version
}

// String implements fmt.Stringer, printing the FileMetadata for each level in
// the Version.
func (v *Version) String() string {
	return v.string(base.DefaultFormatter, false)
}

// DebugString returns an alternative format to String() which includes sequence
// number and kind information for the sstable boundaries.
1117 func (v *Version) DebugString(format base.FormatKey) string { 1118 return v.string(format, true) 1119 } 1120 1121 func describeSublevels(format base.FormatKey, verbose bool, sublevels []LevelSlice) string { 1122 var buf bytes.Buffer 1123 for sublevel := len(sublevels) - 1; sublevel >= 0; sublevel-- { 1124 fmt.Fprintf(&buf, "0.%d:\n", sublevel) 1125 sublevels[sublevel].Each(func(f *FileMetadata) { 1126 fmt.Fprintf(&buf, " %s\n", f.DebugString(format, verbose)) 1127 }) 1128 } 1129 return buf.String() 1130 } 1131 1132 func (v *Version) string(format base.FormatKey, verbose bool) string { 1133 var buf bytes.Buffer 1134 if len(v.L0SublevelFiles) > 0 { 1135 fmt.Fprintf(&buf, "%s", describeSublevels(format, verbose, v.L0SublevelFiles)) 1136 } 1137 for level := 1; level < NumLevels; level++ { 1138 if v.Levels[level].Empty() { 1139 continue 1140 } 1141 fmt.Fprintf(&buf, "%d:\n", level) 1142 iter := v.Levels[level].Iter() 1143 for f := iter.First(); f != nil; f = iter.Next() { 1144 fmt.Fprintf(&buf, " %s\n", f.DebugString(format, verbose)) 1145 } 1146 } 1147 return buf.String() 1148 } 1149 1150 // ParseVersionDebug parses a Version from its DebugString output. 1151 func ParseVersionDebug( 1152 cmp Compare, formatKey base.FormatKey, flushSplitBytes int64, s string, 1153 ) (*Version, error) { 1154 var level int 1155 var files [NumLevels][]*FileMetadata 1156 for _, l := range strings.Split(s, "\n") { 1157 l = strings.TrimSpace(l) 1158 1159 switch l[:2] { 1160 case "0.", "0:", "1:", "2:", "3:", "4:", "5:", "6:": 1161 var err error 1162 level, err = strconv.Atoi(l[:1]) 1163 if err != nil { 1164 return nil, err 1165 } 1166 default: 1167 m, err := ParseFileMetadataDebug(l) 1168 if err != nil { 1169 return nil, err 1170 } 1171 // If we only parsed overall bounds, default to setting the point bounds. 
1172 if !m.HasPointKeys && !m.HasRangeKeys { 1173 m.SmallestPointKey, m.LargestPointKey = m.Smallest, m.Largest 1174 m.HasPointKeys = true 1175 } 1176 files[level] = append(files[level], m) 1177 } 1178 } 1179 // Reverse the order of L0 files. This ensures we construct the same 1180 // sublevels. (They're printed from higher sublevel to lower, which means in 1181 // a partial order that represents newest to oldest). 1182 for i := 0; i < len(files[0])/2; i++ { 1183 files[0][i], files[0][len(files[0])-i-1] = files[0][len(files[0])-i-1], files[0][i] 1184 } 1185 return NewVersion(cmp, formatKey, flushSplitBytes, files), nil 1186 } 1187 1188 // Refs returns the number of references to the version. 1189 func (v *Version) Refs() int32 { 1190 return v.refs.Load() 1191 } 1192 1193 // Ref increments the version refcount. 1194 func (v *Version) Ref() { 1195 v.refs.Add(1) 1196 } 1197 1198 // Unref decrements the version refcount. If the last reference to the version 1199 // was removed, the version is removed from the list of versions and the 1200 // Deleted callback is invoked. Requires that the VersionList mutex is NOT 1201 // locked. 1202 func (v *Version) Unref() { 1203 if v.refs.Add(-1) == 0 { 1204 l := v.list 1205 l.mu.Lock() 1206 l.Remove(v) 1207 v.Deleted(v.unrefFiles()) 1208 l.mu.Unlock() 1209 } 1210 } 1211 1212 // UnrefLocked decrements the version refcount. If the last reference to the 1213 // version was removed, the version is removed from the list of versions and 1214 // the Deleted callback is invoked. Requires that the VersionList mutex is 1215 // already locked. 1216 func (v *Version) UnrefLocked() { 1217 if v.refs.Add(-1) == 0 { 1218 v.list.Remove(v) 1219 v.Deleted(v.unrefFiles()) 1220 } 1221 } 1222 1223 func (v *Version) unrefFiles() []*FileBacking { 1224 var obsolete []*FileBacking 1225 for _, lm := range v.Levels { 1226 obsolete = append(obsolete, lm.release()...) 
1227 } 1228 for _, lm := range v.RangeKeyLevels { 1229 obsolete = append(obsolete, lm.release()...) 1230 } 1231 return obsolete 1232 } 1233 1234 // Next returns the next version in the list of versions. 1235 func (v *Version) Next() *Version { 1236 return v.next 1237 } 1238 1239 // InitL0Sublevels initializes the L0Sublevels 1240 func (v *Version) InitL0Sublevels( 1241 cmp Compare, formatKey base.FormatKey, flushSplitBytes int64, 1242 ) error { 1243 var err error 1244 v.L0Sublevels, err = NewL0Sublevels(&v.Levels[0], cmp, formatKey, flushSplitBytes) 1245 if err == nil && v.L0Sublevels != nil { 1246 v.L0SublevelFiles = v.L0Sublevels.Levels 1247 } 1248 return err 1249 } 1250 1251 // Contains returns a boolean indicating whether the provided file exists in 1252 // the version at the given level. If level is non-zero then Contains binary 1253 // searches among the files. If level is zero, Contains scans the entire 1254 // level. 1255 func (v *Version) Contains(level int, cmp Compare, m *FileMetadata) bool { 1256 iter := v.Levels[level].Iter() 1257 if level > 0 { 1258 overlaps := v.Overlaps(level, cmp, m.Smallest.UserKey, m.Largest.UserKey, 1259 m.Largest.IsExclusiveSentinel()) 1260 iter = overlaps.Iter() 1261 } 1262 for f := iter.First(); f != nil; f = iter.Next() { 1263 if f == m { 1264 return true 1265 } 1266 } 1267 return false 1268 } 1269 1270 // Overlaps returns all elements of v.files[level] whose user key range 1271 // intersects the given range. If level is non-zero then the user key ranges of 1272 // v.files[level] are assumed to not overlap (although they may touch). If level 1273 // is zero then that assumption cannot be made, and the [start, end] range is 1274 // expanded to the union of those matching ranges so far and the computation is 1275 // repeated until [start, end] stabilizes. 1276 // The returned files are a subsequence of the input files, i.e., the ordering 1277 // is not changed. 
func (v *Version) Overlaps(
	level int, cmp Compare, start, end []byte, exclusiveEnd bool,
) LevelSlice {
	if level == 0 {
		// Indices that have been selected as overlapping.
		l0 := v.Levels[level]
		l0Iter := l0.Iter()
		selectedIndices := make([]bool, l0.Len())
		numSelected := 0
		var slice LevelSlice
		// Fixed-point loop: each pass selects newly overlapping files and
		// widens [start, end]; it repeats until a pass widens nothing.
		for {
			restart := false
			for i, meta := 0, l0Iter.First(); meta != nil; i, meta = i+1, l0Iter.Next() {
				selected := selectedIndices[i]
				if selected {
					continue
				}
				if !meta.Overlaps(cmp, start, end, exclusiveEnd) {
					// meta is completely outside the specified range; skip it.
					continue
				}
				// Overlaps.
				selectedIndices[i] = true
				numSelected++

				smallest := meta.Smallest.UserKey
				largest := meta.Largest.UserKey
				// Since level == 0, check if the newly added fileMetadata has
				// expanded the range. We expand the range immediately for files
				// we have remaining to check in this loop. All already checked
				// and unselected files will need to be rechecked via the
				// restart below.
				if cmp(smallest, start) < 0 {
					start = smallest
					restart = true
				}
				if v := cmp(largest, end); v > 0 {
					end = largest
					exclusiveEnd = meta.Largest.IsExclusiveSentinel()
					restart = true
				} else if v == 0 && exclusiveEnd && !meta.Largest.IsExclusiveSentinel() {
					// Only update the exclusivity of our existing `end`
					// bound.
					exclusiveEnd = false
					restart = true
				}
			}

			if !restart {
				// Construct a B-Tree containing only the matching items.
				var tr btree
				tr.cmp = v.Levels[level].tree.cmp
				for i, meta := 0, l0Iter.First(); meta != nil; i, meta = i+1, l0Iter.Next() {
					if selectedIndices[i] {
						err := tr.Insert(meta)
						if err != nil {
							panic(err)
						}
					}
				}
				slice = newLevelSlice(tr.Iter())
				// TODO(jackson): Avoid the oddity of constructing and
				// immediately releasing a B-Tree. Make LevelSlice an
				// interface?
				tr.Release()
				break
			}
			// Continue looping to retry the files that were not selected.
		}
		return slice
	}

	// Non-zero levels delegate to the iterator-based overlaps helper.
	return overlaps(v.Levels[level].Iter(), cmp, start, end, exclusiveEnd)
}

// CheckOrdering checks the file ordering invariants of the version by
// delegating to the package-level CheckOrdering function: once per L0
// sublevel (always with ProhibitSplitUserKeys) and once per level, including
// level 0, with the supplied order. Any failure is returned as a corruption
// error that includes the version's DebugString.
func (v *Version) CheckOrdering(
	cmp Compare, format base.FormatKey, order OrderingInvariants,
) error {
	for sublevel := len(v.L0SublevelFiles) - 1; sublevel >= 0; sublevel-- {
		sublevelIter := v.L0SublevelFiles[sublevel].Iter()
		// Sublevels have NEVER allowed split user keys, so we can pass
		// ProhibitSplitUserKeys.
		if err := CheckOrdering(cmp, format, L0Sublevel(sublevel), sublevelIter, ProhibitSplitUserKeys); err != nil {
			return base.CorruptionErrorf("%s\n%s", err, v.DebugString(format))
		}
	}

	for level, lm := range v.Levels {
		if err := CheckOrdering(cmp, format, Level(level), lm.Iter(), order); err != nil {
			return base.CorruptionErrorf("%s\n%s", err, v.DebugString(format))
		}
	}
	return nil
}

// VersionList holds a list of versions. The versions are ordered from oldest
// to newest.
type VersionList struct {
	// mu is the mutex supplied to Init. Version.Unref locks it around removal
	// from the list.
	mu *sync.Mutex
	// root is the sentinel node of the circular doubly-linked list.
	root Version
}

// Init initializes the version list.
func (l *VersionList) Init(mu *sync.Mutex) {
	l.mu = mu
	// An empty list is a root sentinel linked to itself in both directions.
	l.root.next = &l.root
	l.root.prev = &l.root
}

// Empty returns true if the list is empty, and false otherwise.
func (l *VersionList) Empty() bool {
	return l.root.next == &l.root
}

// Front returns the oldest version in the list. Note that this version is only
// valid if Empty() returns false; on an empty list the root sentinel is
// returned.
func (l *VersionList) Front() *Version {
	return l.root.next
}

// Back returns the newest version in the list. Note that this version is only
// valid if Empty() returns false; on an empty list the root sentinel is
// returned.
func (l *VersionList) Back() *Version {
	return l.root.prev
}

// PushBack adds a new version to the back of the list. This new version
// becomes the "newest" version in the list. It panics if v is already linked
// into a list.
func (l *VersionList) PushBack(v *Version) {
	if v.list != nil || v.prev != nil || v.next != nil {
		panic("pebble: version list is inconsistent")
	}
	v.prev = l.root.prev
	v.prev.next = v
	v.next = &l.root
	v.next.prev = v
	v.list = l
	// Let L0Sublevels on the second newest version get GC'd, as it is no longer
	// necessary. See the comment in Version.
	v.prev.L0Sublevels = nil
}

// Remove removes the specified version from the list. It panics if v is the
// list's root sentinel or is not linked into l.
func (l *VersionList) Remove(v *Version) {
	if v == &l.root {
		panic("pebble: cannot remove version list root node")
	}
	if v.list != l {
		panic("pebble: version list is inconsistent")
	}
	v.prev.next = v.next
	v.next.prev = v.prev
	v.next = nil // avoid memory leaks
	v.prev = nil // avoid memory leaks
	v.list = nil // avoid memory leaks
}

// OrderingInvariants dictates the file ordering invariants active.
type OrderingInvariants int8

const (
	// ProhibitSplitUserKeys indicates that adjacent files within a level cannot
	// contain the same user key.
	ProhibitSplitUserKeys OrderingInvariants = iota
	// AllowSplitUserKeys indicates that adjacent files within a level may
	// contain the same user key. This is only allowed by historical format
	// major versions.
	//
	// TODO(jackson): Remove.
	AllowSplitUserKeys
)

// CheckOrdering checks that the files are consistent with respect to
// seqnums (for level 0 files -- see detailed comment below) and increasing and non-
// overlapping internal key ranges (for non-level 0 files).
//
// The ordering field may be passed AllowSplitUserKeys to allow adjacent files that are both
// inclusive of the same user key. Pebble no longer creates version edits
// installing such files, and Pebble databases with sufficiently high format
// major version should no longer have any such files within their LSM.
// TODO(jackson): Remove AllowSplitUserKeys when we remove support for the
// earlier format major versions.
//
// It panics if ordering is neither ProhibitSplitUserKeys nor
// AllowSplitUserKeys and a non-L0 level holds at least two files.
func CheckOrdering(
	cmp Compare, format base.FormatKey, level Level, files LevelIterator, ordering OrderingInvariants,
) error {
	// The invariants to check for L0 sublevels are the same as the ones to
	// check for all other levels. However, if L0 is not organized into
	// sublevels, or if all L0 files are being passed in, we do the legacy L0
	// checks, defined in the detailed comment below.
	if level == Level(0) {
		// We have 2 kinds of files:
		// - Files with exactly one sequence number: these could be either ingested files
		//   or flushed files. We cannot tell the difference between them based on FileMetadata,
		//   so our consistency checking here uses the weaker checks assuming it is a narrow
		//   flushed file. We cannot error on ingested files having sequence numbers coincident
		//   with flushed files as the seemingly ingested file could just be a flushed file
		//   with just one key in it which is a truncated range tombstone sharing sequence numbers
		//   with other files in the same flush.
		// - Files with multiple sequence numbers: these are necessarily flushed files.
		//
		// Three cases of overlapping sequence numbers:
		// Case 1:
		// An ingested file contained in the sequence numbers of the flushed file -- it must be
		// fully contained (not coincident with either end of the flushed file) since the memtable
		// must have been at [a, b-1] (where b > a) when the ingested file was assigned sequence
		// num b, and the memtable got a subsequent update that was given sequence num b+1, before
		// being flushed.
		//
		// So a sequence [1000, 1000] [1002, 1002] [1000, 2000] is invalid since the first and
		// third file are inconsistent with each other. So comparing adjacent files is insufficient
		// for consistency checking.
		//
		// Visually we have something like
		// x------y x-----------yx-------------y (flushed files where x, y are the endpoints)
		//     y       y  y        y             (y's represent ingested files)
		// And these are ordered in increasing order of y. Note that y's must be unique.
		//
		// Case 2:
		// A flushed file that did not overlap in keys with any file in any level, but does overlap
		// in the file key intervals. This file is placed in L0 since it overlaps in the file
		// key intervals but since it has no overlapping data, it is assigned a sequence number
		// of 0 in RocksDB. We handle this case for compatibility with RocksDB.
		//
		// Case 3:
		// A sequence of flushed files that overlap in sequence numbers with one another,
		// but do not overlap in keys inside the sstables. These files correspond to
		// partitioned flushes or the results of intra-L0 compactions of partitioned
		// flushes.
		//
		// Since these types of SSTables violate most other sequence number
		// overlap invariants, and handling this case is important for compatibility
		// with future versions of pebble, this method relaxes most L0 invariant
		// checks.

		var prev *FileMetadata
		for f := files.First(); f != nil; f, prev = files.Next(), f {
			if prev == nil {
				continue
			}
			// Validate that the sorting is sane.
			if prev.LargestSeqNum == 0 && f.LargestSeqNum == prev.LargestSeqNum {
				// Multiple files satisfying case 2 mentioned above.
			} else if !prev.lessSeqNum(f) {
				return base.CorruptionErrorf("L0 files %s and %s are not properly ordered: <#%d-#%d> vs <#%d-#%d>",
					errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
					errors.Safe(prev.SmallestSeqNum), errors.Safe(prev.LargestSeqNum),
					errors.Safe(f.SmallestSeqNum), errors.Safe(f.LargestSeqNum))
			}
		}
	} else {
		var prev *FileMetadata
		for f := files.First(); f != nil; f, prev = files.Next(), f {
			// Every file is validated individually before being compared
			// with its predecessor.
			if err := f.Validate(cmp, format); err != nil {
				return errors.Wrapf(err, "%s ", level)
			}
			if prev != nil {
				// Files must be strictly increasing by smallest key.
				if prev.cmpSmallestKey(f, cmp) >= 0 {
					return base.CorruptionErrorf("%s files %s and %s are not properly ordered: [%s-%s] vs [%s-%s]",
						errors.Safe(level), errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
						prev.Smallest.Pretty(format), prev.Largest.Pretty(format),
						f.Smallest.Pretty(format), f.Largest.Pretty(format))
				}

				// What's considered "overlapping" is dependent on the format
				// major version. If ordering=ProhibitSplitUserKeys, then both
				// files cannot contain keys with the same user keys. If the
				// bounds have the same user key, the previous file's boundary
				// must have a Trailer indicating that it's exclusive.
				switch ordering {
				case AllowSplitUserKeys:
					if base.InternalCompare(cmp, prev.Largest, f.Smallest) >= 0 {
						return base.CorruptionErrorf("%s files %s and %s have overlapping ranges: [%s-%s] vs [%s-%s]",
							errors.Safe(level), errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
							prev.Smallest.Pretty(format), prev.Largest.Pretty(format),
							f.Smallest.Pretty(format), f.Largest.Pretty(format))
					}
				case ProhibitSplitUserKeys:
					if v := cmp(prev.Largest.UserKey, f.Smallest.UserKey); v > 0 || (v == 0 && !prev.Largest.IsExclusiveSentinel()) {
						return base.CorruptionErrorf("%s files %s and %s have overlapping ranges: [%s-%s] vs [%s-%s]",
							errors.Safe(level), errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
							prev.Smallest.Pretty(format), prev.Largest.Pretty(format),
							f.Smallest.Pretty(format), f.Largest.Pretty(format))
					}
				default:
					panic("unreachable")
				}
			}
		}
	}
	return nil
}