github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/internal/manifest/version.go

// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package manifest

import (
	"bytes"
	stdcmp "cmp"
	"fmt"
	"sort"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"unicode"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/invariants"
)

// Compare exports the base.Compare type.
type Compare = base.Compare

// InternalKey exports the base.InternalKey type.
type InternalKey = base.InternalKey

// TableInfo contains the common information for table related events.
type TableInfo struct {
	// FileNum is the internal DB identifier for the table.
	FileNum base.FileNum
	// Size is the size of the file in bytes.
	Size uint64
	// Smallest is the smallest internal key in the table.
	Smallest InternalKey
	// Largest is the largest internal key in the table.
	Largest InternalKey
	// SmallestSeqNum is the smallest sequence number in the table.
	SmallestSeqNum uint64
	// LargestSeqNum is the largest sequence number in the table.
	LargestSeqNum uint64
}

// TableStats contains statistics on a table used for compaction heuristics,
// and exported via Metrics.
type TableStats struct {
	// The total number of entries in the table.
	NumEntries uint64
	// The number of point and range deletion entries in the table.
	NumDeletions uint64
	// NumRangeKeySets is the total number of range key sets in the table.
	//
	// NB: If there's a chance that the sstable contains any range key sets,
	// then NumRangeKeySets must be > 0.
	NumRangeKeySets uint64
	// Estimate of the total disk space that may be dropped by this table's
	// point deletions by compacting them.
	PointDeletionsBytesEstimate uint64
	// Estimate of the total disk space that may be dropped by this table's
	// range deletions by compacting them. This estimate is at data-block
	// granularity and is not updated if compactions beneath the table reduce
	// the amount of reclaimable disk space. It also does not account for
	// overlapping data in L0 and ignores L0 sublevels, but the error that
	// introduces is expected to be small.
	//
	// Tables in the bottommost level of the LSM may have a nonzero estimate if
	// snapshots or move compactions prevented the elision of their range
	// tombstones. A table in the bottommost level that was ingested into L6
	// will have a zero estimate, because the file's sequence numbers indicate
	// that the tombstone cannot drop any data contained within the file itself.
	RangeDeletionsBytesEstimate uint64
	// Total size of value blocks and value index block.
	ValueBlocksSize uint64
}

// boundType represents the type of key (point or range) present as the smallest
// and largest keys.
type boundType uint8

const (
	boundTypePointKey boundType = iota + 1
	boundTypeRangeKey
)

// CompactionState is the compaction state of a file.
//
// The following shows the valid state transitions:
//
//	NotCompacting --> Compacting --> Compacted
//	      ^               |
//	      |               |
//	      +-------<-------+
//
// Input files to a compaction transition to Compacting when a compaction is
// picked. A file that has finished compacting typically transitions into the
// Compacted state, at which point it is effectively obsolete ("zombied") and
// will eventually be removed from the LSM. A file that has been move-compacted
// will transition from Compacting back into the NotCompacting state, signaling
// that the file may be selected for a subsequent compaction. A failed
// compaction will result in all input tables transitioning from Compacting to
// NotCompacting.
//
// This state is in-memory only. It is not persisted to the manifest.
type CompactionState uint8

// CompactionStates.
const (
	CompactionStateNotCompacting CompactionState = iota
	CompactionStateCompacting
	CompactionStateCompacted
)

// String implements fmt.Stringer.
func (s CompactionState) String() string {
	switch s {
	case CompactionStateNotCompacting:
		return "NotCompacting"
	case CompactionStateCompacting:
		return "Compacting"
	case CompactionStateCompacted:
		return "Compacted"
	default:
		panic(fmt.Sprintf("pebble: unknown compaction state %d", s))
	}
}
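
// The transitions above can be exercised via SetCompactionState (defined
// further below). The following is a minimal, hypothetical walkthrough of the
// legal transitions; it is not part of the package's real call graph. With
// invariants enabled, any other transition panics.
func exampleCompactionStateTransitions() {
	m := &FileMetadata{}
	m.SetCompactionState(CompactionStateCompacting)    // picked as a compaction input
	m.SetCompactionState(CompactionStateNotCompacting) // the compaction failed, or was a move
	m.SetCompactionState(CompactionStateCompacting)    // picked again
	m.SetCompactionState(CompactionStateCompacted)     // terminal: the file is now obsolete
}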

// FileMetadata is maintained for leveled-ssts, i.e., they belong to a level of
// some version. FileMetadata does not contain the actual level of the sst,
// since such leveled-ssts can move across levels in different versions, while
// sharing the same FileMetadata. There are two kinds of leveled-ssts, physical
// and virtual. Underlying both leveled-ssts is a backing-sst, for which the
// only state is FileBacking. A backing-sst is level-less. It is possible for a
// backing-sst to be referred to by a physical sst in one version and by one or
// more virtual ssts in one or more versions. A backing-sst becomes obsolete
// and can be deleted once it is no longer required by any physical or virtual
// sst in any version.
//
// We maintain some invariants:
//
//  1. Each physical and virtual sst will have a unique FileMetadata.FileNum,
//     and there will be exactly one FileMetadata associated with the FileNum.
//
//  2. Within a version, a backing-sst is either only referred to by one
//     physical sst or one or more virtual ssts.
//
//  3. Once a backing-sst is referred to by a virtual sst in the latest version,
//     it cannot go back to being referred to by a physical sst in any future
//     version.
//
// Once a physical sst is no longer needed by any version, we will no longer
// maintain the file metadata associated with it. We will still maintain the
// FileBacking associated with the physical sst if the backing sst is required
// by any virtual ssts in any version.
type FileMetadata struct {
	// AllowedSeeks is used to determine if a file should be picked for
	// a read triggered compaction. It is decremented by read sampling
	// in pebble.Iterator after every positioning operation that returns
	// a user key (eg. Next, Prev, SeekGE, SeekLT, etc).
	AllowedSeeks atomic.Int64

	// statsValid indicates if stats have been loaded for the table. The
	// TableStats structure is populated only if valid is true.
	statsValid atomic.Bool

	// FileBacking is the state which backs either a physical or virtual
	// sstable.
	FileBacking *FileBacking

	// InitAllowedSeeks is the initial value of allowed seeks. This is used
	// to re-set allowed seeks on a file once it hits 0.
	InitAllowedSeeks int64
	// FileNum is the file number.
	//
	// INVARIANT: when !FileMetadata.Virtual, FileNum == FileBacking.DiskFileNum.
	FileNum base.FileNum
	// Size is the size of the file, in bytes. Size is an approximate value for
	// virtual sstables.
	//
	// INVARIANTS:
	// - When !FileMetadata.Virtual, Size == FileBacking.Size.
	// - Size should be non-zero. Size 0 virtual sstables must not be created.
	Size uint64
	// File creation time in seconds since the epoch (1970-01-01 00:00:00
	// UTC). For ingested sstables, this corresponds to the time the file was
	// ingested. For virtual sstables, this corresponds to the wall clock time
	// when the FileMetadata for the virtual sstable was first created.
	CreationTime int64
	// Lower and upper bounds for the smallest and largest sequence numbers in
	// the table, across both point and range keys. For physical sstables, these
	// values are tight bounds. For virtual sstables, there is no guarantee that
	// there will be keys with SmallestSeqNum or LargestSeqNum within virtual
	// sstable bounds.
	SmallestSeqNum uint64
	LargestSeqNum  uint64
	// SmallestPointKey and LargestPointKey are the inclusive bounds for the
	// internal point keys stored in the table. This includes RANGEDELs, which
	// alter point keys.
	// NB: these fields should be set using ExtendPointKeyBounds. They are left
	// exported for reads as an optimization.
	SmallestPointKey InternalKey
	LargestPointKey  InternalKey
	// SmallestRangeKey and LargestRangeKey are the inclusive bounds for the
	// internal range keys stored in the table.
	// NB: these fields should be set using ExtendRangeKeyBounds. They are left
	// exported for reads as an optimization.
	SmallestRangeKey InternalKey
	LargestRangeKey  InternalKey
	// Smallest and Largest are the inclusive bounds for the internal keys stored
	// in the table, across both point and range keys.
	// NB: these fields are derived from their point and range key equivalents,
	// and are updated via the Extend{Point,Range}KeyBounds methods.
	Smallest InternalKey
	Largest  InternalKey
	// Stats describe table statistics. Protected by DB.mu.
	//
	// For virtual sstables, set stats upon virtual sstable creation as
	// asynchronous computation of stats is not currently supported.
	//
	// TODO(bananabrick): To support manifest replay for virtual sstables, we
	// probably need to compute virtual sstable stats asynchronously. Otherwise,
	// we'd have to write virtual sstable stats to the version edit.
	Stats TableStats

	// For L0 files only. Protected by DB.mu. Used to generate L0 sublevels and
	// pick L0 compactions. Only accurate for the most recent Version.
	SubLevel         int
	L0Index          int
	minIntervalIndex int
	maxIntervalIndex int

	// NB: the alignment of this struct is 8 bytes. We pack all the bools to
	// ensure an optimal packing.

	// IsIntraL0Compacting is set to True if this file is part of an intra-L0
	// compaction. When it's true, IsCompacting must also return true. If
	// Compacting is true and IsIntraL0Compacting is false for an L0 file, the
	// file must be part of a compaction to Lbase.
	IsIntraL0Compacting bool
	CompactionState     CompactionState
	// True if compaction of this file has been explicitly requested.
	// Previously, RocksDB and earlier versions of Pebble allowed this
	// flag to be set by a user table property collector. Some earlier
	// versions of Pebble respected this flag, while other more recent
	// versions ignored this flag.
	//
	// More recently this flag has been repurposed to facilitate the
	// compaction of 'atomic compaction units'. Files marked for
	// compaction are compacted in a rewrite compaction at the lowest
	// possible compaction priority.
	//
	// NB: A count of files marked for compaction is maintained on
	// Version, and compaction picking reads cached annotations
	// determined by this field.
	//
	// Protected by DB.mu.
	MarkedForCompaction bool
	// HasPointKeys tracks whether the table contains point keys (including
	// RANGEDELs). If a table contains only range deletions, HasPointKeys is
	// still true.
	HasPointKeys bool
	// HasRangeKeys tracks whether the table contains any range keys.
	HasRangeKeys bool
	// boundsSet tracks whether the overall bounds have been set.
	boundsSet bool
	// boundTypeSmallest and boundTypeLargest provide an indication as to which
	// key type (point or range) corresponds to the smallest and largest overall
	// table bounds.
	boundTypeSmallest, boundTypeLargest boundType
	// Virtual is true if the FileMetadata belongs to a virtual sstable.
	Virtual bool
}

// PhysicalFileMeta is used by functions which want a guarantee that their input
// belongs to a physical sst and not a virtual sst.
//
// NB: This type should only be constructed by calling
// FileMetadata.PhysicalMeta.
type PhysicalFileMeta struct {
	*FileMetadata
}

// VirtualFileMeta is used by functions which want a guarantee that their input
// belongs to a virtual sst and not a physical sst.
//
// A VirtualFileMeta inherits all the same fields as a FileMetadata. These
// fields have additional invariants imposed on them, and/or slightly varying
// meanings:
//   - Smallest and Largest (and their counterparts
//     {Smallest, Largest}{Point,Range}Key) remain tight bounds that represent a
//     key at that exact bound. We make the effort to determine the next smallest
//     or largest key in an sstable after virtualizing it, to maintain this
//     tightness. If the largest is a sentinel key (IsExclusiveSentinel()), it
//     could mean that a rangedel or range key ends at that user key, or has been
//     truncated to that user key.
//   - One invariant is that if a rangedel or range key is truncated on its
//     upper bound, the virtual sstable *must* have a rangedel or range key
//     sentinel key as its upper bound. This is because truncation yields
//     an exclusive upper bound for the rangedel/rangekey, and if there are
//     any points at that exclusive upper bound within the same virtual
//     sstable, those could get uncovered by this truncation. We enforce this
//     invariant in calls to keyspan.Truncate.
//   - Size is an estimate of the size of the virtualized portion of this sstable.
//     The underlying file's size is stored in FileBacking.Size, though it could
//     also be estimated or could correspond to just the referenced portion of
//     a file (eg. if the file originated on another node).
//   - Size must be > 0.
//   - SmallestSeqNum and LargestSeqNum are loose bounds for virtual sstables.
//     This means that all keys in the virtual sstable must have seqnums within
//     [SmallestSeqNum, LargestSeqNum], however there's no guarantee that there's
//     a key with a seqnum at either of the bounds. Calculating tight seqnum
//     bounds would be too expensive and deliver little value.
//
// NB: This type should only be constructed by calling FileMetadata.VirtualMeta.
type VirtualFileMeta struct {
	*FileMetadata
}

// PhysicalMeta should be the only source of creating the PhysicalFileMeta
// wrapper type.
func (m *FileMetadata) PhysicalMeta() PhysicalFileMeta {
	if m.Virtual {
		panic("pebble: file metadata does not belong to a physical sstable")
	}
	return PhysicalFileMeta{
		m,
	}
}

// VirtualMeta should be the only source of creating the VirtualFileMeta wrapper
// type.
func (m *FileMetadata) VirtualMeta() VirtualFileMeta {
	if !m.Virtual {
		panic("pebble: file metadata does not belong to a virtual sstable")
	}
	return VirtualFileMeta{
		m,
	}
}
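
// A hypothetical sketch, not part of the real call graph, of how the wrapper
// types above are meant to be used: code that must only ever see one kind of
// sstable accepts the wrapper type, and the panics in PhysicalMeta and
// VirtualMeta enforce the guarantee at construction time.
func exampleWrapperUsage(m *FileMetadata) {
	if m.Virtual {
		v := m.VirtualMeta() // would panic if m were physical
		_ = v
	} else {
		p := m.PhysicalMeta() // would panic if m were virtual
		_ = p
	}
}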

// FileBacking either backs a single physical sstable, or one or more virtual
// sstables.
//
// See the comment above the FileMetadata type for sstable terminology.
type FileBacking struct {
	// Reference count for the backing file on disk: incremented when a
	// physical or virtual sstable which is backed by the FileBacking is
	// added to a version and decremented when the version is unreferenced.
	// We ref count in order to determine when it is safe to delete a
	// backing sst file from disk. The backing file is obsolete when the
	// reference count falls to zero.
	refs atomic.Int32
	// latestVersionRefs are the references to the FileBacking in the
	// latest version. This reference can be through a single physical
	// sstable in the latest version, or one or more virtual sstables in the
	// latest version.
	//
	// INVARIANT: latestVersionRefs <= refs.
	latestVersionRefs atomic.Int32
	// VirtualizedSize is set iff the backing sst is only referred to by
	// virtual ssts in the latest version. VirtualizedSize is the sum of the
	// virtual sstable sizes of all of the virtual sstables in the latest
	// version which are backed by the physical sstable. When a virtual
	// sstable is removed from the latest version, we will decrement the
	// VirtualizedSize. During compaction picking, we'll compensate a
	// virtual sstable file size by
	// (FileBacking.Size - FileBacking.VirtualizedSize) / latestVersionRefs.
	// The intuition is that if FileBacking.Size - FileBacking.VirtualizedSize
	// is high, then the space amplification due to virtual sstables is
	// high, and we should pick the virtual sstable with a higher priority.
	//
	// TODO(bananabrick): Compensate the virtual sstable file size using
	// the VirtualizedSize during compaction picking and test.
	VirtualizedSize atomic.Uint64
	DiskFileNum     base.DiskFileNum
	Size            uint64
}

// InitPhysicalBacking allocates and sets the FileBacking which is required by a
// physical sstable FileMetadata.
//
// Ensure that the state required by FileBacking, such as the FileNum, is
// already set on the FileMetadata before InitPhysicalBacking is called. In
// tests which don't rely on FileBacking, it is not necessary to set this state
// before calling InitPhysicalBacking.
func (m *FileMetadata) InitPhysicalBacking() {
	if m.Virtual {
		panic("pebble: virtual sstables should use a pre-existing FileBacking")
	}
	if m.FileBacking == nil {
		m.FileBacking = &FileBacking{Size: m.Size, DiskFileNum: m.FileNum.DiskFileNum()}
	}
}

// InitProviderBacking creates a new FileBacking for a file backed by
// an objstorage.Provider.
func (m *FileMetadata) InitProviderBacking(fileNum base.DiskFileNum) {
	if !m.Virtual {
		panic("pebble: provider-backed sstables must be virtual")
	}
	if m.FileBacking == nil {
		m.FileBacking = &FileBacking{DiskFileNum: fileNum}
	}
}

// ValidateVirtual should be called once the FileMetadata for a virtual sstable
// is created to verify that the fields of the virtual sstable are sound.
func (m *FileMetadata) ValidateVirtual(createdFrom *FileMetadata) {
	if !m.Virtual {
		panic("pebble: invalid virtual sstable")
	}

	if createdFrom.SmallestSeqNum != m.SmallestSeqNum {
		panic("pebble: invalid smallest sequence number for virtual sstable")
	}

	if createdFrom.LargestSeqNum != m.LargestSeqNum {
		panic("pebble: invalid largest sequence number for virtual sstable")
	}

	if createdFrom.FileBacking != nil && createdFrom.FileBacking != m.FileBacking {
		panic("pebble: invalid physical sstable state for virtual sstable")
	}

	if m.Size == 0 {
		panic("pebble: virtual sstable size must be set upon creation")
	}
}

// Refs returns the refcount of the backing sstable.
func (m *FileMetadata) Refs() int32 {
	return m.FileBacking.refs.Load()
}

// Ref increments the ref count associated with the backing sstable.
func (m *FileMetadata) Ref() {
	m.FileBacking.refs.Add(1)
}

// Unref decrements the ref count associated with the backing sstable.
func (m *FileMetadata) Unref() int32 {
	v := m.FileBacking.refs.Add(-1)
	if invariants.Enabled && v < 0 {
		panic("pebble: invalid FileMetadata refcounting")
	}
	return v
}

// LatestRef increments the latest ref count associated with the backing
// sstable.
func (m *FileMetadata) LatestRef() {
	m.FileBacking.latestVersionRefs.Add(1)

	if m.Virtual {
		m.FileBacking.VirtualizedSize.Add(m.Size)
	}
}

// LatestUnref decrements the latest ref count associated with the backing
// sstable.
func (m *FileMetadata) LatestUnref() int32 {
	if m.Virtual {
		m.FileBacking.VirtualizedSize.Add(-m.Size)
	}

	v := m.FileBacking.latestVersionRefs.Add(-1)
	if invariants.Enabled && v < 0 {
		panic("pebble: invalid FileMetadata latest refcounting")
	}
	return v
}

// LatestRefs returns the latest ref count associated with the backing sstable.
func (m *FileMetadata) LatestRefs() int32 {
	return m.FileBacking.latestVersionRefs.Load()
}
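
// A hypothetical sketch of how the two refcounts interact over a file's
// lifetime; it assumes m.FileBacking has been initialized (e.g., via
// InitPhysicalBacking). refs counts references from all live versions, while
// latestVersionRefs counts only references from the latest version; the
// backing file may be deleted from disk only once refs reaches zero.
func exampleBackingRefcounts(m *FileMetadata) {
	m.Ref()         // the file was added to a new version...
	m.LatestRef()   // ...which is now the latest version
	m.LatestUnref() // a newer version without this file was installed
	if m.Unref() == 0 {
		// The last version referencing the backing file is gone; the file
		// on disk is now obsolete and may be deleted.
	}
}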

// SetCompactionState transitions this file's compaction state to the given
// state. Protected by DB.mu.
func (m *FileMetadata) SetCompactionState(to CompactionState) {
	if invariants.Enabled {
		transitionErr := func() error {
			return errors.Newf("pebble: invalid compaction state transition: %s -> %s", m.CompactionState, to)
		}
		switch m.CompactionState {
		case CompactionStateNotCompacting:
			if to != CompactionStateCompacting {
				panic(transitionErr())
			}
		case CompactionStateCompacting:
			if to != CompactionStateCompacted && to != CompactionStateNotCompacting {
				panic(transitionErr())
			}
		case CompactionStateCompacted:
			panic(transitionErr())
		default:
			panic(fmt.Sprintf("pebble: unknown compaction state: %d", m.CompactionState))
		}
	}
	m.CompactionState = to
}

// IsCompacting returns true if this file's compaction state is
// CompactionStateCompacting. Protected by DB.mu.
func (m *FileMetadata) IsCompacting() bool {
	return m.CompactionState == CompactionStateCompacting
}

// StatsValid returns true if the table stats have been populated. If StatsValid
// returns true, the Stats field may be read (with or without holding the
// database mutex).
func (m *FileMetadata) StatsValid() bool {
	return m.statsValid.Load()
}

// StatsMarkValid marks the TableStats as valid. The caller must hold DB.mu
// while populating TableStats and calling StatsMarkValid. Once stats are
// populated, they must not be mutated.
func (m *FileMetadata) StatsMarkValid() {
	m.statsValid.Store(true)
}
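
// A hypothetical sketch of the stats handshake described above: stats are
// populated and marked valid while holding DB.mu; once StatsValid returns
// true they are immutable and may be read without the mutex.
func exampleStatsHandshake(m *FileMetadata) {
	if !m.StatsValid() {
		// DB.mu must be held for the population step.
		m.Stats.NumEntries = 42 // illustrative value only
		m.StatsMarkValid()
	}
	_ = m.Stats.NumEntries // safe to read lock-free once valid
}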

// ExtendPointKeyBounds attempts to extend the lower and upper point key bounds
// and overall table bounds with the given smallest and largest keys. The
// smallest and largest bounds may not be extended if the table already has a
// bound that is smaller or larger, respectively. The receiver is returned.
// NB: calling this method should be preferred to manually setting the bounds by
// manipulating the fields directly, to maintain certain invariants.
func (m *FileMetadata) ExtendPointKeyBounds(
	cmp Compare, smallest, largest InternalKey,
) *FileMetadata {
	// Update the point key bounds.
	if !m.HasPointKeys {
		m.SmallestPointKey, m.LargestPointKey = smallest, largest
		m.HasPointKeys = true
	} else {
		if base.InternalCompare(cmp, smallest, m.SmallestPointKey) < 0 {
			m.SmallestPointKey = smallest
		}
		if base.InternalCompare(cmp, largest, m.LargestPointKey) > 0 {
			m.LargestPointKey = largest
		}
	}
	// Update the overall bounds.
	m.extendOverallBounds(cmp, m.SmallestPointKey, m.LargestPointKey, boundTypePointKey)
	return m
}

// ExtendRangeKeyBounds attempts to extend the lower and upper range key bounds
// and overall table bounds with the given smallest and largest keys. The
// smallest and largest bounds may not be extended if the table already has a
// bound that is smaller or larger, respectively. The receiver is returned.
// NB: calling this method should be preferred to manually setting the bounds by
// manipulating the fields directly, to maintain certain invariants.
func (m *FileMetadata) ExtendRangeKeyBounds(
	cmp Compare, smallest, largest InternalKey,
) *FileMetadata {
	// Update the range key bounds.
	if !m.HasRangeKeys {
		m.SmallestRangeKey, m.LargestRangeKey = smallest, largest
		m.HasRangeKeys = true
	} else {
		if base.InternalCompare(cmp, smallest, m.SmallestRangeKey) < 0 {
			m.SmallestRangeKey = smallest
		}
		if base.InternalCompare(cmp, largest, m.LargestRangeKey) > 0 {
			m.LargestRangeKey = largest
		}
	}
	// Update the overall bounds.
	m.extendOverallBounds(cmp, m.SmallestRangeKey, m.LargestRangeKey, boundTypeRangeKey)
	return m
}

// extendOverallBounds attempts to extend the overall table lower and upper
// bounds. The given bounds may not be used if a lower or upper bound already
// exists that is smaller or larger than the given keys, respectively. The given
// boundType will be used if the bounds are updated.
func (m *FileMetadata) extendOverallBounds(
	cmp Compare, smallest, largest InternalKey, bTyp boundType,
) {
	if !m.boundsSet {
		m.Smallest, m.Largest = smallest, largest
		m.boundsSet = true
		m.boundTypeSmallest, m.boundTypeLargest = bTyp, bTyp
	} else {
		if base.InternalCompare(cmp, smallest, m.Smallest) < 0 {
			m.Smallest = smallest
			m.boundTypeSmallest = bTyp
		}
		if base.InternalCompare(cmp, largest, m.Largest) > 0 {
			m.Largest = largest
			m.boundTypeLargest = bTyp
		}
	}
}
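
// A hypothetical sketch of how a writer accumulates bounds: each call extends
// the per-type bounds, and the overall Smallest/Largest (and their bound
// types) are derived automatically by extendOverallBounds. The base helpers
// used here (MakeInternalKey, MakeExclusiveSentinelKey) are assumed from the
// internal/base package.
func exampleExtendBounds(cmp Compare) *FileMetadata {
	m := &FileMetadata{}
	m.ExtendPointKeyBounds(cmp,
		base.MakeInternalKey([]byte("b"), 1, base.InternalKeyKindSet),
		base.MakeInternalKey([]byte("d"), 1, base.InternalKeyKindSet))
	m.ExtendRangeKeyBounds(cmp,
		base.MakeInternalKey([]byte("a"), 2, base.InternalKeyKindRangeKeySet),
		base.MakeExclusiveSentinelKey(base.InternalKeyKindRangeKeySet, []byte("c")))
	// m.Smallest is now the range key at "a"; m.Largest is the point key at "d".
	return m
}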

// Overlaps returns true if the file key range overlaps with the given range.
func (m *FileMetadata) Overlaps(cmp Compare, start []byte, end []byte, exclusiveEnd bool) bool {
	if c := cmp(m.Largest.UserKey, start); c < 0 || (c == 0 && m.Largest.IsExclusiveSentinel()) {
		// f is completely before the specified range; no overlap.
		return false
	}
	if c := cmp(m.Smallest.UserKey, end); c > 0 || (c == 0 && exclusiveEnd) {
		// f is completely after the specified range; no overlap.
		return false
	}
	return true
}

// ContainedWithinSpan returns true if the file key range is fully contained
// within the given range ("end" is assumed to be exclusive).
func (m *FileMetadata) ContainedWithinSpan(cmp Compare, start, end []byte) bool {
	lowerCmp, upperCmp := cmp(m.Smallest.UserKey, start), cmp(m.Largest.UserKey, end)
	return lowerCmp >= 0 && (upperCmp < 0 || (upperCmp == 0 && m.Largest.IsExclusiveSentinel()))
}
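
// A hypothetical illustration of the two predicates above on a file whose
// overall bounds span the user keys [c, f].
func exampleOverlapPredicates() {
	cmp := base.DefaultComparer.Compare
	m := (&FileMetadata{}).ExtendPointKeyBounds(cmp,
		base.MakeInternalKey([]byte("c"), 1, base.InternalKeyKindSet),
		base.MakeInternalKey([]byte("f"), 1, base.InternalKeyKindSet))
	_ = m.Overlaps(cmp, []byte("a"), []byte("d"), true)       // true: [c,f] intersects [a,d)
	_ = m.ContainedWithinSpan(cmp, []byte("a"), []byte("d"))  // false: "f" lies beyond "d"
	_ = m.ContainedWithinSpan(cmp, []byte("a"), []byte("g"))  // true: [c,f] fits inside [a,g)
}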

// ContainsKeyType returns whether or not the file contains keys of the provided
// type.
func (m *FileMetadata) ContainsKeyType(kt KeyType) bool {
	switch kt {
	case KeyTypePointAndRange:
		return true
	case KeyTypePoint:
		return m.HasPointKeys
	case KeyTypeRange:
		return m.HasRangeKeys
	default:
		panic("unrecognized key type")
	}
}

// SmallestBound returns the file's smallest bound of the key type. It returns a
// false second return value if the file does not contain any keys of the key
// type.
func (m *FileMetadata) SmallestBound(kt KeyType) (*InternalKey, bool) {
	switch kt {
	case KeyTypePointAndRange:
		return &m.Smallest, true
	case KeyTypePoint:
		return &m.SmallestPointKey, m.HasPointKeys
	case KeyTypeRange:
		return &m.SmallestRangeKey, m.HasRangeKeys
	default:
		panic("unrecognized key type")
	}
}

// LargestBound returns the file's largest bound of the key type. It returns a
// false second return value if the file does not contain any keys of the key
// type.
func (m *FileMetadata) LargestBound(kt KeyType) (*InternalKey, bool) {
	switch kt {
	case KeyTypePointAndRange:
		return &m.Largest, true
	case KeyTypePoint:
		return &m.LargestPointKey, m.HasPointKeys
	case KeyTypeRange:
		return &m.LargestRangeKey, m.HasRangeKeys
	default:
		panic("unrecognized key type")
	}
}

const (
	maskContainsPointKeys = 1 << 0
	maskSmallest          = 1 << 1
	maskLargest           = 1 << 2
)

// boundsMarker returns a marker byte whose bits encode the following
// information (in order from least significant bit):
//   - if the table contains point keys
//   - if the table's smallest key is a point key
//   - if the table's largest key is a point key
func (m *FileMetadata) boundsMarker() (sentinel uint8, err error) {
	if m.HasPointKeys {
		sentinel |= maskContainsPointKeys
	}
	switch m.boundTypeSmallest {
	case boundTypePointKey:
		sentinel |= maskSmallest
	case boundTypeRangeKey:
		// No op - leave bit unset.
	default:
		return 0, base.CorruptionErrorf("file %s has neither point nor range key as smallest key", m.FileNum)
	}
	switch m.boundTypeLargest {
	case boundTypePointKey:
		sentinel |= maskLargest
	case boundTypeRangeKey:
		// No op - leave bit unset.
	default:
		return 0, base.CorruptionErrorf("file %s has neither point nor range key as largest key", m.FileNum)
	}
	return
}

// String implements fmt.Stringer, printing the file number and the overall
// table bounds.
func (m *FileMetadata) String() string {
	return fmt.Sprintf("%s:[%s-%s]", m.FileNum, m.Smallest, m.Largest)
}

// DebugString returns a verbose representation of FileMetadata, typically for
// use in tests and debugging, returning the file number and the point, range
// and overall bounds for the table.
func (m *FileMetadata) DebugString(format base.FormatKey, verbose bool) string {
	var b bytes.Buffer
	fmt.Fprintf(&b, "%s:[%s-%s]",
		m.FileNum, m.Smallest.Pretty(format), m.Largest.Pretty(format))
	if !verbose {
		return b.String()
	}
	fmt.Fprintf(&b, " seqnums:[%d-%d]", m.SmallestSeqNum, m.LargestSeqNum)
	if m.HasPointKeys {
		fmt.Fprintf(&b, " points:[%s-%s]",
			m.SmallestPointKey.Pretty(format), m.LargestPointKey.Pretty(format))
	}
	if m.HasRangeKeys {
		fmt.Fprintf(&b, " ranges:[%s-%s]",
			m.SmallestRangeKey.Pretty(format), m.LargestRangeKey.Pretty(format))
	}
	return b.String()
}

// ParseFileMetadataDebug parses a FileMetadata from its DebugString
// representation.
func ParseFileMetadataDebug(s string) (*FileMetadata, error) {
	// Split lines of the form:
	//  000000:[a#0,SET-z#0,SET] seqnums:[5-5] points:[...] ranges:[...]
	fields := strings.FieldsFunc(s, func(c rune) bool {
		switch c {
		case ':', '[', '-', ']':
			return true
		default:
			return unicode.IsSpace(c) // NB: also trim whitespace padding.
		}
	})
	if len(fields)%3 != 0 {
		return nil, errors.Newf("malformed input: %s", s)
	}
	m := &FileMetadata{}
	for len(fields) > 0 {
		prefix := fields[0]
		if prefix == "seqnums" {
			smallestSeqNum, err := strconv.ParseUint(fields[1], 10, 64)
			if err != nil {
				return m, errors.Newf("malformed input: %s: %s", s, err)
			}
			largestSeqNum, err := strconv.ParseUint(fields[2], 10, 64)
			if err != nil {
				return m, errors.Newf("malformed input: %s: %s", s, err)
			}
			m.SmallestSeqNum, m.LargestSeqNum = smallestSeqNum, largestSeqNum
			fields = fields[3:]
			continue
		}
		smallest := base.ParsePrettyInternalKey(fields[1])
		largest := base.ParsePrettyInternalKey(fields[2])
		switch prefix {
		case "points":
			m.SmallestPointKey, m.LargestPointKey = smallest, largest
			m.HasPointKeys = true
		case "ranges":
			m.SmallestRangeKey, m.LargestRangeKey = smallest, largest
			m.HasRangeKeys = true
		default:
			fileNum, err := strconv.ParseUint(prefix, 10, 64)
			if err != nil {
				return m, errors.Newf("malformed input: %s: %s", s, err)
			}
			m.FileNum = base.FileNum(fileNum)
			m.Smallest, m.Largest = smallest, largest
			m.boundsSet = true
		}
		fields = fields[3:]
	}
	// By default, when the parser sees just the overall bounds, we set the point
	// keys. This preserves backwards compatibility with existing test cases that
	// specify only the overall bounds.
	if !m.HasPointKeys && !m.HasRangeKeys {
		m.SmallestPointKey, m.LargestPointKey = m.Smallest, m.Largest
		m.HasPointKeys = true
	}
	m.InitPhysicalBacking()
	return m, nil
}
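
// A hypothetical round-trip through the debug format: DebugString emits the
// representation that ParseFileMetadataDebug consumes, e.g.
// "000123:[a#1,SET-d#1,SET] seqnums:[1-1] points:[a#1,SET-d#1,SET]".
func exampleDebugRoundTrip(m *FileMetadata) (*FileMetadata, error) {
	s := m.DebugString(base.DefaultFormatter, true /* verbose */)
	return ParseFileMetadataDebug(s)
}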

// Validate validates the metadata for consistency with itself, returning an
// error if inconsistent.
func (m *FileMetadata) Validate(cmp Compare, formatKey base.FormatKey) error {
	// Combined range and point key validation.

	if !m.HasPointKeys && !m.HasRangeKeys {
		return base.CorruptionErrorf("file %s has neither point nor range keys",
			errors.Safe(m.FileNum))
	}
	if base.InternalCompare(cmp, m.Smallest, m.Largest) > 0 {
		return base.CorruptionErrorf("file %s has inconsistent bounds: %s vs %s",
			errors.Safe(m.FileNum), m.Smallest.Pretty(formatKey),
			m.Largest.Pretty(formatKey))
	}
	if m.SmallestSeqNum > m.LargestSeqNum {
		return base.CorruptionErrorf("file %s has inconsistent seqnum bounds: %d vs %d",
			errors.Safe(m.FileNum), m.SmallestSeqNum, m.LargestSeqNum)
	}

	// Point key validation.

	if m.HasPointKeys {
		if base.InternalCompare(cmp, m.SmallestPointKey, m.LargestPointKey) > 0 {
			return base.CorruptionErrorf("file %s has inconsistent point key bounds: %s vs %s",
				errors.Safe(m.FileNum), m.SmallestPointKey.Pretty(formatKey),
				m.LargestPointKey.Pretty(formatKey))
		}
		if base.InternalCompare(cmp, m.SmallestPointKey, m.Smallest) < 0 ||
			base.InternalCompare(cmp, m.LargestPointKey, m.Largest) > 0 {
			return base.CorruptionErrorf(
				"file %s has inconsistent point key bounds relative to overall bounds: "+
					"overall = [%s-%s], point keys = [%s-%s]",
				errors.Safe(m.FileNum),
				m.Smallest.Pretty(formatKey), m.Largest.Pretty(formatKey),
				m.SmallestPointKey.Pretty(formatKey), m.LargestPointKey.Pretty(formatKey),
			)
		}
	}

	// Range key validation.

	if m.HasRangeKeys {
		if base.InternalCompare(cmp, m.SmallestRangeKey, m.LargestRangeKey) > 0 {
			return base.CorruptionErrorf("file %s has inconsistent range key bounds: %s vs %s",
				errors.Safe(m.FileNum), m.SmallestRangeKey.Pretty(formatKey),
				m.LargestRangeKey.Pretty(formatKey))
		}
		if base.InternalCompare(cmp, m.SmallestRangeKey, m.Smallest) < 0 ||
			base.InternalCompare(cmp, m.LargestRangeKey, m.Largest) > 0 {
			return base.CorruptionErrorf(
				"file %s has inconsistent range key bounds relative to overall bounds: "+
					"overall = [%s-%s], range keys = [%s-%s]",
				errors.Safe(m.FileNum),
				m.Smallest.Pretty(formatKey), m.Largest.Pretty(formatKey),
				m.SmallestRangeKey.Pretty(formatKey), m.LargestRangeKey.Pretty(formatKey),
			)
		}
	}

	// Ensure that FileMetadata.Init was called.
	if m.FileBacking == nil {
		return base.CorruptionErrorf("file metadata FileBacking not set")
	}

	return nil
}

// TableInfo returns a subset of the FileMetadata state formatted as a
// TableInfo.
func (m *FileMetadata) TableInfo() TableInfo {
	return TableInfo{
		FileNum:        m.FileNum,
		Size:           m.Size,
		Smallest:       m.Smallest,
		Largest:        m.Largest,
		SmallestSeqNum: m.SmallestSeqNum,
		LargestSeqNum:  m.LargestSeqNum,
	}
}

func (m *FileMetadata) cmpSeqNum(b *FileMetadata) int {
	// NB: This is the same ordering that RocksDB uses for L0 files.

	// Sort first by largest sequence number.
	if v := stdcmp.Compare(m.LargestSeqNum, b.LargestSeqNum); v != 0 {
		return v
	}
	// Then by smallest sequence number.
	if v := stdcmp.Compare(m.SmallestSeqNum, b.SmallestSeqNum); v != 0 {
		return v
	}
	// Break ties by file number.
	return stdcmp.Compare(m.FileNum, b.FileNum)
}

func (m *FileMetadata) lessSeqNum(b *FileMetadata) bool {
	return m.cmpSeqNum(b) < 0
}

func (m *FileMetadata) cmpSmallestKey(b *FileMetadata, cmp Compare) int {
	return base.InternalCompare(cmp, m.Smallest, b.Smallest)
}

// KeyRange returns the minimum smallest and maximum largest InternalKey for
// all the FileMetadata in iters.
func KeyRange(ucmp Compare, iters ...LevelIterator) (smallest, largest InternalKey) {
	first := true
	for _, iter := range iters {
		for meta := iter.First(); meta != nil; meta = iter.Next() {
			if first {
				first = false
				smallest, largest = meta.Smallest, meta.Largest
				continue
			}
			if base.InternalCompare(ucmp, smallest, meta.Smallest) >= 0 {
				smallest = meta.Smallest
			}
			if base.InternalCompare(ucmp, largest, meta.Largest) <= 0 {
				largest = meta.Largest
			}
		}
	}
	return smallest, largest
}

type bySeqNum []*FileMetadata

func (b bySeqNum) Len() int { return len(b) }
func (b bySeqNum) Less(i, j int) bool {
	return b[i].lessSeqNum(b[j])
}
func (b bySeqNum) Swap(i, j int) { b[i], b[j] = b[j], b[i] }

// SortBySeqNum sorts the specified files by increasing sequence number.
func SortBySeqNum(files []*FileMetadata) {
	sort.Sort(bySeqNum(files))
}

type bySmallest struct {
	files []*FileMetadata
	cmp   Compare
}

func (b bySmallest) Len() int { return len(b.files) }
func (b bySmallest) Less(i, j int) bool {
	return b.files[i].cmpSmallestKey(b.files[j], b.cmp) < 0
}
func (b bySmallest) Swap(i, j int) { b.files[i], b.files[j] = b.files[j], b.files[i] }

// SortBySmallest sorts the specified files by smallest key using the supplied
// comparison function to order user keys.
func SortBySmallest(files []*FileMetadata, cmp Compare) {
	sort.Sort(bySmallest{files, cmp})
}
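
// A hypothetical illustration of the L0 ordering implemented by cmpSeqNum:
// files sort by (LargestSeqNum, SmallestSeqNum, FileNum), so an ingested file
// with seqnums [50,50] sorts before a flushed file with [1,100].
func exampleL0Ordering() {
	flushed := &FileMetadata{FileNum: 1, SmallestSeqNum: 1, LargestSeqNum: 100}
	ingested := &FileMetadata{FileNum: 2, SmallestSeqNum: 50, LargestSeqNum: 50}
	files := []*FileMetadata{flushed, ingested}
	SortBySeqNum(files)
	// files is now [ingested, flushed]: 50 < 100.
	_ = files
}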

func overlaps(iter LevelIterator, cmp Compare, start, end []byte, exclusiveEnd bool) LevelSlice {
	startIter := iter.Clone()
	{
		startIterFile := startIter.SeekGE(cmp, start)
		// SeekGE compares user keys. The user key `start` may be equal to the
		// f.Largest because f.Largest is a range deletion sentinel, indicating
		// that the user key `start` is NOT contained within the file f. If
		// that's the case, we can narrow the overlapping bounds to exclude the
		// file with the sentinel.
		if startIterFile != nil && startIterFile.Largest.IsExclusiveSentinel() &&
			cmp(startIterFile.Largest.UserKey, start) == 0 {
			startIterFile = startIter.Next()
		}
		_ = startIterFile // Ignore unused assignment.
	}

	endIter := iter.Clone()
	{
		endIterFile := endIter.SeekGE(cmp, end)

		if !exclusiveEnd {
			// endIter is now pointing at the *first* file with a largest key >= end.
			// If there are multiple files including the user key `end`, we want all
			// of them, so move forward.
			for endIterFile != nil && cmp(endIterFile.Largest.UserKey, end) == 0 {
				endIterFile = endIter.Next()
			}
		}

		// LevelSlice uses inclusive bounds, so if we seeked to the end sentinel
		// or nexted too far because Largest.UserKey equaled `end`, go back.
		//
		// Consider !exclusiveEnd and end = 'f', with the following file bounds:
		//
		//	[b,d] [e, f] [f, f] [g, h]
		//
		// the above for loop will Next until it arrives at [g, h]. We need to
		// observe that g > f, and Prev to the file with bounds [f, f].
		if endIterFile == nil {
			endIterFile = endIter.Prev()
		} else if c := cmp(endIterFile.Smallest.UserKey, end); c > 0 || c == 0 && exclusiveEnd {
			endIterFile = endIter.Prev()
		}
		_ = endIterFile // Ignore unused assignment.
	}
	return newBoundedLevelSlice(startIter.Clone().iter, &startIter.iter, &endIter.iter)
}

// NumLevels is the number of levels a Version contains.
const NumLevels = 7

// NewVersion constructs a new Version with the provided files. It requires
// the provided files are already well-ordered. It's intended for testing.
func NewVersion(
	cmp Compare, formatKey base.FormatKey, flushSplitBytes int64, files [NumLevels][]*FileMetadata,
) *Version {
	var v Version
	for l := range files {
		// NB: We specifically insert `files` into the B-Tree in the order
		// they appear within `files`. Some tests depend on this behavior in
		// order to test consistency checking, etc. Once we've constructed the
		// initial B-Tree, we swap out the btreeCmp for the correct one.
		// TODO(jackson): Adjust or remove the tests and remove this.
		v.Levels[l].tree, _ = makeBTree(btreeCmpSpecificOrder(files[l]), files[l])
		v.Levels[l].level = l
		if l == 0 {
			v.Levels[l].tree.cmp = btreeCmpSeqNum
		} else {
			v.Levels[l].tree.cmp = btreeCmpSmallestKey(cmp)
		}
		for _, f := range files[l] {
			v.Levels[l].totalSize += f.Size
		}
	}
	if err := v.InitL0Sublevels(cmp, formatKey, flushSplitBytes); err != nil {
		panic(err)
	}
	return &v
}

// Version is a collection of file metadata for on-disk tables at various
// levels. In-memory DBs are written to level-0 tables, and compactions
// migrate data from level N to level N+1. The tables map internal keys (which
// are a user key, a delete or set bit, and a sequence number) to user values.
//
// The tables at level 0 are sorted by largest sequence number. Due to file
// ingestion, there may be overlap in the ranges of sequence numbers contained
// in level 0 sstables. In particular, it is valid for one level 0 sstable to
// have the seqnum range [1,100] while an adjacent sstable has the seqnum range
// [50,50]. This occurs when the [50,50] table was ingested and given a global
// seqnum. The ingestion code will have ensured that the [50,50] sstable will
// not have any keys that overlap with the [1,100] sstable in the seqnum range
// [1,49]. The range of internal keys [fileMetadata.smallest,
// fileMetadata.largest] in each level 0 table may overlap.
//
// The tables at any non-0 level are sorted by their internal key range and any
// two tables at the same non-0 level do not overlap.
//
// The internal key ranges of two tables at different levels X and Y may
// overlap, for any X != Y.
//
// Finally, for every internal key in a table at level X, there is no internal
// key in a higher level table that has both the same user key and a higher
// sequence number.
type Version struct {
	refs atomic.Int32

	// The level 0 sstables are organized in a series of sublevels. Similar to
	// the seqnum invariant in normal levels, there is no internal key in a
	// higher level table that has both the same user key and a higher sequence
	// number. Within a sublevel, tables are sorted by their internal key range
	// and any two tables at the same sublevel do not overlap. Unlike the normal
	// levels, sublevel n contains older tables (lower sequence numbers) than
	// sublevel n+1.
	//
	// The L0Sublevels struct is mostly used for compaction picking. As most
	// internal data structures in it are only necessary for compaction picking
	// and not for iterator creation, the reference to L0Sublevels is nil'd
	// after this version becomes the non-newest version, to reduce memory
	// usage.
	//
	// L0Sublevels.Levels contains L0 files ordered by sublevels. All the files
	// in Levels[0] are in L0Sublevels.Levels. L0SublevelFiles is also set to
	// a reference to that slice, as that slice is necessary for iterator
	// creation and needs to outlast L0Sublevels.
	L0Sublevels     *L0Sublevels
	L0SublevelFiles []LevelSlice

	Levels [NumLevels]LevelMetadata

	// RangeKeyLevels holds a subset of the same files as Levels that contain range
	// keys (i.e. fileMeta.HasRangeKeys == true). The memory amplification of this
	// duplication should be minimal, as range keys are expected to be rare.
	RangeKeyLevels [NumLevels]LevelMetadata

	// The callback to invoke when the last reference to a version is
	// removed. Will be called with list.mu held.
	Deleted func(obsolete []*FileBacking)

	// Stats holds aggregated stats about the version maintained from
	// version to version.
	Stats struct {
		// MarkedForCompaction records the count of files marked for
		// compaction within the version.
		MarkedForCompaction int
	}

	// The list the version is linked into.
	list *VersionList

	// The next/prev link for the versionList doubly-linked list of versions.
	prev, next *Version
}

// String implements fmt.Stringer, printing the FileMetadata for each level in
// the Version.
func (v *Version) String() string {
	return v.string(base.DefaultFormatter, false)
}

// DebugString returns an alternative format to String() which includes sequence
// number and kind information for the sstable boundaries.
func (v *Version) DebugString(format base.FormatKey) string {
	return v.string(format, true)
}

func describeSublevels(format base.FormatKey, verbose bool, sublevels []LevelSlice) string {
	var buf bytes.Buffer
	for sublevel := len(sublevels) - 1; sublevel >= 0; sublevel-- {
		fmt.Fprintf(&buf, "0.%d:\n", sublevel)
		sublevels[sublevel].Each(func(f *FileMetadata) {
			fmt.Fprintf(&buf, "  %s\n", f.DebugString(format, verbose))
		})
	}
	return buf.String()
}

func (v *Version) string(format base.FormatKey, verbose bool) string {
	var buf bytes.Buffer
	if len(v.L0SublevelFiles) > 0 {
		fmt.Fprintf(&buf, "%s", describeSublevels(format, verbose, v.L0SublevelFiles))
	}
	for level := 1; level < NumLevels; level++ {
		if v.Levels[level].Empty() {
			continue
		}
		fmt.Fprintf(&buf, "%d:\n", level)
		iter := v.Levels[level].Iter()
		for f := iter.First(); f != nil; f = iter.Next() {
			fmt.Fprintf(&buf, "  %s\n", f.DebugString(format, verbose))
		}
	}
	return buf.String()
}

// ParseVersionDebug parses a Version from its DebugString output.
func ParseVersionDebug(
	cmp Compare, formatKey base.FormatKey, flushSplitBytes int64, s string,
) (*Version, error) {
	var level int
	var files [NumLevels][]*FileMetadata
	for _, l := range strings.Split(s, "\n") {
		l = strings.TrimSpace(l)

		switch l[:2] {
		case "0.", "0:", "1:", "2:", "3:", "4:", "5:", "6:":
			var err error
			level, err = strconv.Atoi(l[:1])
			if err != nil {
				return nil, err
			}
		default:
			m, err := ParseFileMetadataDebug(l)
			if err != nil {
				return nil, err
			}
			// If we only parsed overall bounds, default to setting the point bounds.
			if !m.HasPointKeys && !m.HasRangeKeys {
				m.SmallestPointKey, m.LargestPointKey = m.Smallest, m.Largest
				m.HasPointKeys = true
			}
			files[level] = append(files[level], m)
		}
	}
	// Reverse the order of L0 files. This ensures we construct the same
	// sublevels. (They're printed from higher sublevel to lower, which means in
	// a partial order that represents newest to oldest).
	for i := 0; i < len(files[0])/2; i++ {
		files[0][i], files[0][len(files[0])-i-1] = files[0][len(files[0])-i-1], files[0][i]
	}
	return NewVersion(cmp, formatKey, flushSplitBytes, files), nil
}
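
// A hypothetical input accepted by ParseVersionDebug: levels are introduced
// by "N:" (or "0.N:" for sublevels) lines, with one file per line in the
// DebugString format. The file number and keys below are illustrative only.
func exampleParseVersion() (*Version, error) {
	return ParseVersionDebug(base.DefaultComparer.Compare, base.DefaultFormatter,
		0 /* flushSplitBytes */, "6:\n  000123:[a#0,SET-z#0,SET]")
}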

// Refs returns the number of references to the version.
func (v *Version) Refs() int32 {
	return v.refs.Load()
}

// Ref increments the version refcount.
func (v *Version) Ref() {
	v.refs.Add(1)
}

// Unref decrements the version refcount. If the last reference to the version
// was removed, the version is removed from the list of versions and the
// Deleted callback is invoked. Requires that the VersionList mutex is NOT
// locked.
func (v *Version) Unref() {
	if v.refs.Add(-1) == 0 {
		l := v.list
		l.mu.Lock()
		l.Remove(v)
		v.Deleted(v.unrefFiles())
		l.mu.Unlock()
	}
}

// UnrefLocked decrements the version refcount. If the last reference to the
// version was removed, the version is removed from the list of versions and
// the Deleted callback is invoked. Requires that the VersionList mutex is
// already locked.
func (v *Version) UnrefLocked() {
	if v.refs.Add(-1) == 0 {
		v.list.Remove(v)
		v.Deleted(v.unrefFiles())
	}
}

func (v *Version) unrefFiles() []*FileBacking {
	var obsolete []*FileBacking
	for _, lm := range v.Levels {
		obsolete = append(obsolete, lm.release()...)
	}
	for _, lm := range v.RangeKeyLevels {
		obsolete = append(obsolete, lm.release()...)
	}
	return obsolete
}

// Next returns the next version in the list of versions.
func (v *Version) Next() *Version {
	return v.next
}

// InitL0Sublevels initializes the L0Sublevels.
func (v *Version) InitL0Sublevels(
	cmp Compare, formatKey base.FormatKey, flushSplitBytes int64,
) error {
	var err error
	v.L0Sublevels, err = NewL0Sublevels(&v.Levels[0], cmp, formatKey, flushSplitBytes)
	if err == nil && v.L0Sublevels != nil {
		v.L0SublevelFiles = v.L0Sublevels.Levels
	}
	return err
}

// Contains returns a boolean indicating whether the provided file exists in
// the version at the given level. If level is non-zero then Contains binary
// searches among the files. If level is zero, Contains scans the entire
// level.
func (v *Version) Contains(level int, cmp Compare, m *FileMetadata) bool {
	iter := v.Levels[level].Iter()
	if level > 0 {
		overlaps := v.Overlaps(level, cmp, m.Smallest.UserKey, m.Largest.UserKey,
			m.Largest.IsExclusiveSentinel())
		iter = overlaps.Iter()
	}
	for f := iter.First(); f != nil; f = iter.Next() {
		if f == m {
			return true
		}
	}
	return false
}

// Overlaps returns all elements of v.files[level] whose user key range
// intersects the given range. If level is non-zero then the user key ranges of
// v.files[level] are assumed to not overlap (although they may touch). If level
// is zero then that assumption cannot be made, and the [start, end] range is
// expanded to the union of those matching ranges so far and the computation is
// repeated until [start, end] stabilizes.
// The returned files are a subsequence of the input files, i.e., the ordering
// is not changed.
func (v *Version) Overlaps(
	level int, cmp Compare, start, end []byte, exclusiveEnd bool,
) LevelSlice {
	if level == 0 {
		// Indices that have been selected as overlapping.
		l0 := v.Levels[level]
		l0Iter := l0.Iter()
		selectedIndices := make([]bool, l0.Len())
		numSelected := 0
		var slice LevelSlice
		for {
			restart := false
			for i, meta := 0, l0Iter.First(); meta != nil; i, meta = i+1, l0Iter.Next() {
				selected := selectedIndices[i]
				if selected {
					continue
				}
				if !meta.Overlaps(cmp, start, end, exclusiveEnd) {
					// meta is completely outside the specified range; skip it.
					continue
				}
				// Overlaps.
				selectedIndices[i] = true
				numSelected++

				smallest := meta.Smallest.UserKey
				largest := meta.Largest.UserKey
				// Since level == 0, check if the newly added fileMetadata has
				// expanded the range. We expand the range immediately for files
				// we have remaining to check in this loop. All already checked
				// and unselected files will need to be rechecked via the
				// restart below.
				if cmp(smallest, start) < 0 {
					start = smallest
					restart = true
				}
				if v := cmp(largest, end); v > 0 {
					end = largest
					exclusiveEnd = meta.Largest.IsExclusiveSentinel()
					restart = true
				} else if v == 0 && exclusiveEnd && !meta.Largest.IsExclusiveSentinel() {
					// Only update the exclusivity of our existing `end`
					// bound.
					exclusiveEnd = false
					restart = true
				}
			}

			if !restart {
				// Construct a B-Tree containing only the matching items.
				var tr btree
				tr.cmp = v.Levels[level].tree.cmp
				for i, meta := 0, l0Iter.First(); meta != nil; i, meta = i+1, l0Iter.Next() {
					if selectedIndices[i] {
						err := tr.Insert(meta)
						if err != nil {
							panic(err)
						}
					}
				}
				slice = newLevelSlice(tr.Iter())
				// TODO(jackson): Avoid the oddity of constructing and
				// immediately releasing a B-Tree. Make LevelSlice an
				// interface?
				tr.Release()
				break
			}
			// Continue looping to retry the files that were not selected.
		}
		return slice
	}

	return overlaps(v.Levels[level].Iter(), cmp, start, end, exclusiveEnd)
}

// CheckOrdering checks that the files are consistent with respect to
// increasing file numbers (for level 0 files) and increasing and
// non-overlapping internal key ranges (for level non-0 files).
func (v *Version) CheckOrdering(
	cmp Compare, format base.FormatKey, order OrderingInvariants,
) error {
	for sublevel := len(v.L0SublevelFiles) - 1; sublevel >= 0; sublevel-- {
		sublevelIter := v.L0SublevelFiles[sublevel].Iter()
		// Sublevels have NEVER allowed split user keys, so we can pass
		// ProhibitSplitUserKeys.
		if err := CheckOrdering(cmp, format, L0Sublevel(sublevel), sublevelIter, ProhibitSplitUserKeys); err != nil {
			return base.CorruptionErrorf("%s\n%s", err, v.DebugString(format))
		}
	}

	for level, lm := range v.Levels {
		if err := CheckOrdering(cmp, format, Level(level), lm.Iter(), order); err != nil {
			return base.CorruptionErrorf("%s\n%s", err, v.DebugString(format))
		}
	}
	return nil
}

// VersionList holds a list of versions. The versions are ordered from oldest
// to newest.
type VersionList struct {
	mu   *sync.Mutex
	root Version
}

// Init initializes the version list.
func (l *VersionList) Init(mu *sync.Mutex) {
	l.mu = mu
	l.root.next = &l.root
	l.root.prev = &l.root
}
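
// A hypothetical sketch of the version lifecycle around VersionList; it
// assumes v is a fully constructed Version (e.g., from NewVersion) that is
// not yet linked into any list and holds no references. New versions are
// pushed at the back (becoming the newest), and dropping the last reference
// removes a version from the list and reports its obsolete file backings
// through the Deleted callback.
func exampleVersionLifecycle(v *Version) {
	var mu sync.Mutex
	var l VersionList
	l.Init(&mu)
	v.Deleted = func(obsolete []*FileBacking) {} // called with mu held
	l.PushBack(v)
	v.Ref()
	v.Unref() // last reference: v is removed from l and Deleted is invoked
}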

// Empty returns true if the list is empty, and false otherwise.
func (l *VersionList) Empty() bool {
	return l.root.next == &l.root
}

// Front returns the oldest version in the list. Note that this version is only
// valid if Empty() returns false.
func (l *VersionList) Front() *Version {
	return l.root.next
}

// Back returns the newest version in the list. Note that this version is only
// valid if Empty() returns false.
func (l *VersionList) Back() *Version {
	return l.root.prev
}

// PushBack adds a new version to the back of the list. This new version
// becomes the "newest" version in the list.
func (l *VersionList) PushBack(v *Version) {
	if v.list != nil || v.prev != nil || v.next != nil {
		panic("pebble: version list is inconsistent")
	}
	v.prev = l.root.prev
	v.prev.next = v
	v.next = &l.root
	v.next.prev = v
	v.list = l
	// Let L0Sublevels on the second newest version get GC'd, as it is no longer
	// necessary. See the comment in Version.
	v.prev.L0Sublevels = nil
}

// Remove removes the specified version from the list.
func (l *VersionList) Remove(v *Version) {
	if v == &l.root {
		panic("pebble: cannot remove version list root node")
	}
	if v.list != l {
		panic("pebble: version list is inconsistent")
	}
	v.prev.next = v.next
	v.next.prev = v.prev
	v.next = nil // avoid memory leaks
	v.prev = nil // avoid memory leaks
	v.list = nil // avoid memory leaks
}

// OrderingInvariants dictates the file ordering invariants active.
type OrderingInvariants int8

const (
	// ProhibitSplitUserKeys indicates that adjacent files within a level cannot
	// contain the same user key.
	ProhibitSplitUserKeys OrderingInvariants = iota
	// AllowSplitUserKeys indicates that adjacent files within a level may
	// contain the same user key. This is only allowed by historical format
	// major versions.
	//
	// TODO(jackson): Remove.
	AllowSplitUserKeys
)

// CheckOrdering checks that the files are consistent with respect to
// seqnums (for level 0 files -- see detailed comment below) and increasing
// and non-overlapping internal key ranges (for non-level 0 files).
//
// The ordering field may be passed AllowSplitUserKeys to allow adjacent files
// that are both inclusive of the same user key. Pebble no longer creates
// version edits installing such files, and Pebble databases with sufficiently
// high format major version should no longer have any such files within their
// LSM.
// TODO(jackson): Remove AllowSplitUserKeys when we remove support for the
// earlier format major versions.
func CheckOrdering(
	cmp Compare, format base.FormatKey, level Level, files LevelIterator, ordering OrderingInvariants,
) error {
	// The invariants to check for L0 sublevels are the same as the ones to
	// check for all other levels. However, if L0 is not organized into
	// sublevels, or if all L0 files are being passed in, we do the legacy L0
	// checks, defined in the detailed comment below.
	if level == Level(0) {
		// We have 2 kinds of files:
		// - Files with exactly one sequence number: these could be either ingested files
		//   or flushed files. We cannot tell the difference between them based on FileMetadata,
		//   so our consistency checking here uses the weaker checks assuming it is a narrow
		//   flushed file.
		//   We cannot error on ingested files having sequence numbers coincident
		//   with flushed files as the seemingly ingested file could just be a flushed file
		//   with just one key in it which is a truncated range tombstone sharing sequence numbers
		//   with other files in the same flush.
		// - Files with multiple sequence numbers: these are necessarily flushed files.
		//
		// Three cases of overlapping sequence numbers:
		// Case 1:
		// An ingested file contained in the sequence numbers of the flushed file -- it must be
		// fully contained (not coincident with either end of the flushed file) since the memtable
		// must have been at [a, b-1] (where b > a) when the ingested file was assigned sequence
		// num b, and the memtable got a subsequent update that was given sequence num b+1, before
		// being flushed.
		//
		// So a sequence [1000, 1000] [1002, 1002] [1000, 2000] is invalid since the first and
		// third file are inconsistent with each other. So comparing adjacent files is insufficient
		// for consistency checking.
		//
		// Visually we have something like
		// x------y x-----------yx-------------y (flushed files where x, y are the endpoints)
		//     y       y  y  y                   (y's represent ingested files)
		// And these are ordered in increasing order of y. Note that y's must be unique.
		//
		// Case 2:
		// A flushed file that did not overlap in keys with any file in any level, but does overlap
		// in the file key intervals. This file is placed in L0 since it overlaps in the file
		// key intervals but since it has no overlapping data, it is assigned a sequence number
		// of 0 in RocksDB. We handle this case for compatibility with RocksDB.
		//
		// Case 3:
		// A sequence of flushed files that overlap in sequence numbers with one another,
		// but do not overlap in keys inside the sstables. These files correspond to
		// partitioned flushes or the results of intra-L0 compactions of partitioned
		// flushes.
		//
		// Since these types of SSTables violate most other sequence number
		// overlap invariants, and handling this case is important for compatibility
		// with future versions of pebble, this method relaxes most L0 invariant
		// checks.

		var prev *FileMetadata
		for f := files.First(); f != nil; f, prev = files.Next(), f {
			if prev == nil {
				continue
			}
			// Validate that the sorting is sane.
			if prev.LargestSeqNum == 0 && f.LargestSeqNum == prev.LargestSeqNum {
				// Multiple files satisfying case 2 mentioned above.
			} else if !prev.lessSeqNum(f) {
				return base.CorruptionErrorf("L0 files %s and %s are not properly ordered: <#%d-#%d> vs <#%d-#%d>",
					errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
					errors.Safe(prev.SmallestSeqNum), errors.Safe(prev.LargestSeqNum),
					errors.Safe(f.SmallestSeqNum), errors.Safe(f.LargestSeqNum))
			}
		}
	} else {
		var prev *FileMetadata
		for f := files.First(); f != nil; f, prev = files.Next(), f {
			if err := f.Validate(cmp, format); err != nil {
				return errors.Wrapf(err, "%s ", level)
			}
			if prev != nil {
				if prev.cmpSmallestKey(f, cmp) >= 0 {
					return base.CorruptionErrorf("%s files %s and %s are not properly ordered: [%s-%s] vs [%s-%s]",
						errors.Safe(level), errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
						prev.Smallest.Pretty(format), prev.Largest.Pretty(format),
						f.Smallest.Pretty(format), f.Largest.Pretty(format))
				}

				// What's considered "overlapping" is dependent on the format
				// major version. If ordering=ProhibitSplitUserKeys, then both
				// files cannot contain keys with the same user keys. If the
				// bounds have the same user key, the previous file's boundary
				// must have a Trailer indicating that it's exclusive.
				switch ordering {
				case AllowSplitUserKeys:
					if base.InternalCompare(cmp, prev.Largest, f.Smallest) >= 0 {
						return base.CorruptionErrorf("%s files %s and %s have overlapping ranges: [%s-%s] vs [%s-%s]",
							errors.Safe(level), errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
							prev.Smallest.Pretty(format), prev.Largest.Pretty(format),
							f.Smallest.Pretty(format), f.Largest.Pretty(format))
					}
				case ProhibitSplitUserKeys:
					if v := cmp(prev.Largest.UserKey, f.Smallest.UserKey); v > 0 || (v == 0 && !prev.Largest.IsExclusiveSentinel()) {
						return base.CorruptionErrorf("%s files %s and %s have overlapping ranges: [%s-%s] vs [%s-%s]",
							errors.Safe(level), errors.Safe(prev.FileNum), errors.Safe(f.FileNum),
							prev.Smallest.Pretty(format), prev.Largest.Pretty(format),
							f.Smallest.Pretty(format), f.Largest.Pretty(format))
					}
				default:
					panic("unreachable")
				}
			}
		}
	}
	return nil
}
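
// A hypothetical sketch of how the consistency checks above are invoked:
// given a constructed (or replayed) version, validate every level, using the
// strictest ordering invariant for modern format major versions.
func exampleCheckVersion(v *Version, cmp Compare) error {
	return v.CheckOrdering(cmp, base.DefaultFormatter, ProhibitSplitUserKeys)
}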