// Source: github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/internal/manifest/version.go

// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package manifest

import (
	"bytes"
	"fmt"
	"sort"
	"sync"
	"sync/atomic"

	"github.com/petermattis/pebble/internal/base"
)

// Compare is an alias for base.Compare. Within this file it is the function
// used to order user keys.
type Compare = base.Compare

// InternalKey is an alias for base.InternalKey.
type InternalKey = base.InternalKey

// Options is an alias for base.Options.
type Options = base.Options

// TableInfo is an alias for base.TableInfo.
type TableInfo = base.TableInfo

// FileMetadata holds the metadata for an on-disk table.
type FileMetadata struct {
	// reference count for the file: incremented when a file is added to a
	// version and decremented when the version is unreferenced. The file is
	// obsolete when the reference count falls to zero. This is a pointer because
	// FileMetadata is copied by value from version to version, but we want the
	// reference count to be shared.
	refs *int32
	// FileNum is the file number.
	FileNum uint64
	// Size is the size of the file, in bytes.
	Size uint64
	// Smallest and Largest are the inclusive bounds for the internal keys
	// stored in the table.
	Smallest InternalKey
	Largest  InternalKey
	// Smallest and largest sequence numbers in the table.
	SmallestSeqNum uint64
	LargestSeqNum  uint64
	// true if client asked us nicely to compact this file.
	MarkedForCompaction bool
}

// String renders the metadata as "<FileNum>:<Smallest>-<Largest>", formatting
// the two bounds with the %s verb.
func (m *FileMetadata) String() string {
	return fmt.Sprintf("%d:%s-%s", m.FileNum, m.Smallest, m.Largest)
}

// TableInfo returns a subset of the FileMetadata state formatted as a
// TableInfo.
51 func (m *FileMetadata) TableInfo(dirname string) TableInfo { 52 return TableInfo{ 53 Path: base.MakeFilename(dirname, base.FileTypeTable, m.FileNum), 54 FileNum: m.FileNum, 55 Size: m.Size, 56 Smallest: m.Smallest, 57 Largest: m.Largest, 58 SmallestSeqNum: m.SmallestSeqNum, 59 LargestSeqNum: m.LargestSeqNum, 60 } 61 } 62 63 // KeyRange returns the minimum smallest and maximum largest internalKey for 64 // all the fileMetadata in f0 and f1. 65 func KeyRange(ucmp Compare, f0, f1 []FileMetadata) (smallest, largest InternalKey) { 66 first := true 67 for _, f := range [2][]FileMetadata{f0, f1} { 68 for _, meta := range f { 69 if first { 70 first = false 71 smallest, largest = meta.Smallest, meta.Largest 72 continue 73 } 74 if base.InternalCompare(ucmp, meta.Smallest, smallest) < 0 { 75 smallest = meta.Smallest 76 } 77 if base.InternalCompare(ucmp, meta.Largest, largest) > 0 { 78 largest = meta.Largest 79 } 80 } 81 } 82 return smallest, largest 83 } 84 85 type bySeqNum []FileMetadata 86 87 func (b bySeqNum) Len() int { return len(b) } 88 func (b bySeqNum) Less(i, j int) bool { 89 // NB: This is the same ordering that RocksDB uses for L0 files. 90 91 // Sort first by largest sequence number. 92 if b[i].LargestSeqNum != b[j].LargestSeqNum { 93 return b[i].LargestSeqNum < b[j].LargestSeqNum 94 } 95 // Then by smallest sequence number. 96 if b[i].SmallestSeqNum != b[j].SmallestSeqNum { 97 return b[i].SmallestSeqNum < b[j].SmallestSeqNum 98 } 99 // Break ties by file number. 100 return b[i].FileNum < b[j].FileNum 101 } 102 func (b bySeqNum) Swap(i, j int) { b[i], b[j] = b[j], b[i] } 103 104 // SortBySeqNum sorts the specified files by decreasing sequence number. 
105 func SortBySeqNum(files []FileMetadata) { 106 sort.Sort(bySeqNum(files)) 107 } 108 109 type bySmallest struct { 110 dat []FileMetadata 111 cmp Compare 112 } 113 114 func (b bySmallest) Len() int { return len(b.dat) } 115 func (b bySmallest) Less(i, j int) bool { 116 return base.InternalCompare(b.cmp, b.dat[i].Smallest, b.dat[j].Smallest) < 0 117 } 118 func (b bySmallest) Swap(i, j int) { b.dat[i], b.dat[j] = b.dat[j], b.dat[i] } 119 120 // SortBySmallest sorts the specified files by smallest key using the supplied 121 // comparison function to order user keys. 122 func SortBySmallest(files []FileMetadata, cmp Compare) { 123 sort.Sort(bySmallest{files, cmp}) 124 } 125 126 // NumLevels is the number of levels a Version contains. 127 const NumLevels = 7 128 129 // Version is a collection of file metadata for on-disk tables at various 130 // levels. In-memory DBs are written to level-0 tables, and compactions 131 // migrate data from level N to level N+1. The tables map internal keys (which 132 // are a user key, a delete or set bit, and a sequence number) to user values. 133 // 134 // The tables at level 0 are sorted by increasing fileNum. If two level 0 135 // tables have fileNums i and j and i < j, then the sequence numbers of every 136 // internal key in table i are all less than those for table j. The range of 137 // internal keys [fileMetadata.smallest, fileMetadata.largest] in each level 0 138 // table may overlap. 139 // 140 // The tables at any non-0 level are sorted by their internal key range and any 141 // two tables at the same non-0 level do not overlap. 142 // 143 // The internal key ranges of two tables at different levels X and Y may 144 // overlap, for any X != Y. 145 // 146 // Finally, for every internal key in a table at level X, there is no internal 147 // key in a higher level table that has both the same user key and a higher 148 // sequence number. 
149 type Version struct { 150 refs int32 151 152 Files [NumLevels][]FileMetadata 153 154 // The callback to invoke when the last reference to a version is 155 // removed. Will be called with list.mu held. 156 Deleted func(obsolete []uint64) 157 158 // The list the version is linked into. 159 list *VersionList 160 161 // The next/prev link for the versionList doubly-linked list of versions. 162 prev, next *Version 163 } 164 165 func (v *Version) String() string { 166 var buf bytes.Buffer 167 for level := 0; level < NumLevels; level++ { 168 if len(v.Files[level]) == 0 { 169 continue 170 } 171 fmt.Fprintf(&buf, "%d:", level) 172 for j := range v.Files[level] { 173 f := &v.Files[level][j] 174 fmt.Fprintf(&buf, " %s-%s", f.Smallest.UserKey, f.Largest.UserKey) 175 } 176 fmt.Fprintf(&buf, "\n") 177 } 178 return buf.String() 179 } 180 181 // DebugString returns an alternative format to String() which includes 182 // sequence number and kind information for the sstable boundaries. 183 func (v *Version) DebugString() string { 184 var buf bytes.Buffer 185 for level := 0; level < NumLevels; level++ { 186 if len(v.Files[level]) == 0 { 187 continue 188 } 189 fmt.Fprintf(&buf, "%d:", level) 190 for j := range v.Files[level] { 191 f := &v.Files[level][j] 192 fmt.Fprintf(&buf, " %s-%s", f.Smallest, f.Largest) 193 } 194 fmt.Fprintf(&buf, "\n") 195 } 196 return buf.String() 197 } 198 199 // Refs returns the number of references to the version. 200 func (v *Version) Refs() int32 { 201 return atomic.LoadInt32(&v.refs) 202 } 203 204 // Ref increments the version refcount. 205 func (v *Version) Ref() { 206 atomic.AddInt32(&v.refs, 1) 207 } 208 209 // Unref decrements the version refcount. If the last reference to the version 210 // was removed, the version is removed from the list of versions and the 211 // Deleted callback is invoked. Requires that the VersionList mutex is NOT 212 // locked. 
213 func (v *Version) Unref() { 214 if atomic.AddInt32(&v.refs, -1) == 0 { 215 obsolete := v.unrefFiles() 216 l := v.list 217 l.mu.Lock() 218 l.Remove(v) 219 v.Deleted(obsolete) 220 l.mu.Unlock() 221 } 222 } 223 224 // UnrefLocked decrements the version refcount. If the last reference to the 225 // version was removed, the version is removed from the list of versions and 226 // the Deleted callback is invoked. Requires that the VersionList mutex is 227 // already locked. 228 func (v *Version) UnrefLocked() { 229 if atomic.AddInt32(&v.refs, -1) == 0 { 230 v.list.Remove(v) 231 v.Deleted(v.unrefFiles()) 232 } 233 } 234 235 func (v *Version) unrefFiles() []uint64 { 236 var obsolete []uint64 237 for _, files := range v.Files { 238 for i := range files { 239 f := &files[i] 240 if atomic.AddInt32(f.refs, -1) == 0 { 241 obsolete = append(obsolete, f.FileNum) 242 } 243 } 244 } 245 return obsolete 246 } 247 248 // Next returns the next version in the list of versions. 249 func (v *Version) Next() *Version { 250 return v.next 251 } 252 253 // Overlaps returns all elements of v.files[level] whose user key range 254 // intersects the inclusive range [start, end]. If level is non-zero then the 255 // user key ranges of v.files[level] are assumed to not overlap (although they 256 // may touch). If level is zero then that assumption cannot be made, and the 257 // [start, end] range is expanded to the union of those matching ranges so far 258 // and the computation is repeated until [start, end] stabilizes. 259 func (v *Version) Overlaps( 260 level int, cmp Compare, start, end []byte, 261 ) (ret []FileMetadata) { 262 if level == 0 { 263 // The sstables in level 0 can overlap with each other. As soon as we find 264 // one sstable that overlaps with our target range, we need to expand the 265 // range and find all sstables that overlap with the expanded range. 
266 loop: 267 for { 268 for _, meta := range v.Files[level] { 269 smallest := meta.Smallest.UserKey 270 largest := meta.Largest.UserKey 271 if cmp(largest, start) < 0 { 272 // meta is completely before the specified range; skip it. 273 continue 274 } 275 if cmp(smallest, end) > 0 { 276 // meta is completely after the specified range; skip it. 277 continue 278 } 279 ret = append(ret, meta) 280 281 // If level == 0, check if the newly added fileMetadata has 282 // expanded the range. If so, restart the search. 283 restart := false 284 if cmp(smallest, start) < 0 { 285 start = smallest 286 restart = true 287 } 288 if cmp(largest, end) > 0 { 289 end = largest 290 restart = true 291 } 292 if restart { 293 ret = ret[:0] 294 continue loop 295 } 296 } 297 return ret 298 } 299 } 300 301 // Binary search to find the range of files which overlaps with our target 302 // range. 303 files := v.Files[level] 304 lower := sort.Search(len(files), func(i int) bool { 305 return cmp(files[i].Largest.UserKey, start) >= 0 306 }) 307 upper := sort.Search(len(files), func(i int) bool { 308 return cmp(files[i].Smallest.UserKey, end) > 0 309 }) 310 if lower >= upper { 311 return nil 312 } 313 return files[lower:upper] 314 } 315 316 // CheckOrdering checks that the files are consistent with respect to 317 // increasing file numbers (for level 0 files) and increasing and non- 318 // overlapping internal key ranges (for level non-0 files). 
319 func (v *Version) CheckOrdering(cmp Compare) error { 320 for level, ff := range v.Files { 321 if level == 0 { 322 for i := 1; i < len(ff); i++ { 323 prev := &ff[i-1] 324 f := &ff[i] 325 if prev.LargestSeqNum >= f.LargestSeqNum { 326 return fmt.Errorf("level 0 files are not in increasing largest seqNum order: %d, %d", 327 prev.LargestSeqNum, f.LargestSeqNum) 328 } 329 if prev.SmallestSeqNum >= f.SmallestSeqNum { 330 return fmt.Errorf("level 0 files are not in increasing smallest seqNum order: %d, %d", 331 prev.SmallestSeqNum, f.SmallestSeqNum) 332 } 333 } 334 } else { 335 for i := 1; i < len(ff); i++ { 336 prev := &ff[i-1] 337 f := &ff[i] 338 if base.InternalCompare(cmp, prev.Largest, f.Smallest) >= 0 { 339 return fmt.Errorf("level non-0 files are not in increasing ikey order: %s, %s\n%s", 340 prev.Largest, f.Smallest, v.DebugString()) 341 } 342 if base.InternalCompare(cmp, f.Smallest, f.Largest) > 0 { 343 return fmt.Errorf("level non-0 file has inconsistent bounds: %s, %s", 344 f.Smallest, f.Largest) 345 } 346 } 347 } 348 } 349 return nil 350 } 351 352 // VersionList holds a list of versions. The versions are ordered from oldest 353 // to newest. 354 type VersionList struct { 355 mu *sync.Mutex 356 root Version 357 } 358 359 // Init initializes the version list. 360 func (l *VersionList) Init(mu *sync.Mutex) { 361 l.mu = mu 362 l.root.next = &l.root 363 l.root.prev = &l.root 364 } 365 366 // Empty returns true if the list is empty, and false otherwise. 367 func (l *VersionList) Empty() bool { 368 return l.root.next == &l.root 369 } 370 371 // Front returns the oldest version in the list. Note that this version is only 372 // valid if Empty() returns true. 373 func (l *VersionList) Front() *Version { 374 return l.root.next 375 } 376 377 // Back returns the newest version in the list. Note that this version is only 378 // valid if Empty() returns true. 
379 func (l *VersionList) Back() *Version { 380 return l.root.prev 381 } 382 383 // PushBack adds a new version to the back of the list. This new version 384 // becomes the "newest" version in the list. 385 func (l *VersionList) PushBack(v *Version) { 386 if v.list != nil || v.prev != nil || v.next != nil { 387 panic("pebble: version list is inconsistent") 388 } 389 v.prev = l.root.prev 390 v.prev.next = v 391 v.next = &l.root 392 v.next.prev = v 393 v.list = l 394 } 395 396 // Remove removes the specified version from the list. 397 func (l *VersionList) Remove(v *Version) { 398 if v == &l.root { 399 panic("pebble: cannot remove version list root node") 400 } 401 if v.list != l { 402 panic("pebble: version list is inconsistent") 403 } 404 v.prev.next = v.next 405 v.next.prev = v.prev 406 v.next = nil // avoid memory leaks 407 v.prev = nil // avoid memory leaks 408 v.list = nil // avoid memory leaks 409 }