github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/internal/manifest/version.go (about)

     1  // Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package manifest
     6  
     7  import (
     8  	"bytes"
     9  	"fmt"
    10  	"sort"
    11  	"sync"
    12  	"sync/atomic"
    13  
    14  	"github.com/petermattis/pebble/internal/base"
    15  )
    16  
    17  type Compare = base.Compare
    18  type InternalKey = base.InternalKey
    19  type Options = base.Options
    20  type TableInfo = base.TableInfo
    21  
// FileMetadata holds the metadata for an on-disk table.
type FileMetadata struct {
	// refs is the reference count for the file: incremented when a file is
	// added to a version and decremented when the version is unreferenced
	// (see Version.unrefFiles). The file is obsolete when the reference count
	// falls to zero. This is a pointer because FileMetadata is copied by value
	// from version to version, but we want the reference count to be shared.
	refs *int32
	// FileNum is the file number.
	FileNum uint64
	// Size is the size of the file, in bytes.
	Size uint64
	// Smallest and Largest are the inclusive bounds for the internal keys
	// stored in the table.
	Smallest InternalKey
	Largest  InternalKey
	// SmallestSeqNum and LargestSeqNum are the smallest and largest sequence
	// numbers in the table.
	SmallestSeqNum uint64
	LargestSeqNum  uint64
	// MarkedForCompaction is true if the client explicitly asked for this
	// file to be compacted.
	MarkedForCompaction bool
}
    44  
    45  func (m *FileMetadata) String() string {
    46  	return fmt.Sprintf("%d:%s-%s", m.FileNum, m.Smallest, m.Largest)
    47  }
    48  
    49  // TableInfo returns a subset of the FileMetadata state formatted as a
    50  // TableInfo.
    51  func (m *FileMetadata) TableInfo(dirname string) TableInfo {
    52  	return TableInfo{
    53  		Path:           base.MakeFilename(dirname, base.FileTypeTable, m.FileNum),
    54  		FileNum:        m.FileNum,
    55  		Size:           m.Size,
    56  		Smallest:       m.Smallest,
    57  		Largest:        m.Largest,
    58  		SmallestSeqNum: m.SmallestSeqNum,
    59  		LargestSeqNum:  m.LargestSeqNum,
    60  	}
    61  }
    62  
    63  // KeyRange returns the minimum smallest and maximum largest internalKey for
    64  // all the fileMetadata in f0 and f1.
    65  func KeyRange(ucmp Compare, f0, f1 []FileMetadata) (smallest, largest InternalKey) {
    66  	first := true
    67  	for _, f := range [2][]FileMetadata{f0, f1} {
    68  		for _, meta := range f {
    69  			if first {
    70  				first = false
    71  				smallest, largest = meta.Smallest, meta.Largest
    72  				continue
    73  			}
    74  			if base.InternalCompare(ucmp, meta.Smallest, smallest) < 0 {
    75  				smallest = meta.Smallest
    76  			}
    77  			if base.InternalCompare(ucmp, meta.Largest, largest) > 0 {
    78  				largest = meta.Largest
    79  			}
    80  		}
    81  	}
    82  	return smallest, largest
    83  }
    84  
    85  type bySeqNum []FileMetadata
    86  
    87  func (b bySeqNum) Len() int { return len(b) }
    88  func (b bySeqNum) Less(i, j int) bool {
    89  	// NB: This is the same ordering that RocksDB uses for L0 files.
    90  
    91  	// Sort first by largest sequence number.
    92  	if b[i].LargestSeqNum != b[j].LargestSeqNum {
    93  		return b[i].LargestSeqNum < b[j].LargestSeqNum
    94  	}
    95  	// Then by smallest sequence number.
    96  	if b[i].SmallestSeqNum != b[j].SmallestSeqNum {
    97  		return b[i].SmallestSeqNum < b[j].SmallestSeqNum
    98  	}
    99  	// Break ties by file number.
   100  	return b[i].FileNum < b[j].FileNum
   101  }
   102  func (b bySeqNum) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
   103  
   104  // SortBySeqNum sorts the specified files by decreasing sequence number.
   105  func SortBySeqNum(files []FileMetadata) {
   106  	sort.Sort(bySeqNum(files))
   107  }
   108  
   109  type bySmallest struct {
   110  	dat []FileMetadata
   111  	cmp Compare
   112  }
   113  
   114  func (b bySmallest) Len() int { return len(b.dat) }
   115  func (b bySmallest) Less(i, j int) bool {
   116  	return base.InternalCompare(b.cmp, b.dat[i].Smallest, b.dat[j].Smallest) < 0
   117  }
   118  func (b bySmallest) Swap(i, j int) { b.dat[i], b.dat[j] = b.dat[j], b.dat[i] }
   119  
   120  // SortBySmallest sorts the specified files by smallest key using the supplied
   121  // comparison function to order user keys.
   122  func SortBySmallest(files []FileMetadata, cmp Compare) {
   123  	sort.Sort(bySmallest{files, cmp})
   124  }
   125  
// NumLevels is the number of levels a Version contains. It is the length of
// the Version.Files array.
const NumLevels = 7
   128  
// Version is a collection of file metadata for on-disk tables at various
// levels. In-memory DBs are written to level-0 tables, and compactions
// migrate data from level N to level N+1. The tables map internal keys (which
// are a user key, a delete or set bit, and a sequence number) to user values.
//
// The tables at level 0 are ordered by increasing sequence number:
// CheckOrdering requires both the largest and the smallest sequence number to
// strictly increase from one level 0 table to the next. The range of internal
// keys [FileMetadata.Smallest, FileMetadata.Largest] in each level 0 table
// may overlap.
//
// The tables at any non-0 level are sorted by their internal key range and any
// two tables at the same non-0 level do not overlap.
//
// The internal key ranges of two tables at different levels X and Y may
// overlap, for any X != Y.
//
// Finally, for every internal key in a table at level X, there is no internal
// key in a higher level table that has both the same user key and a higher
// sequence number.
type Version struct {
	// refs is the reference count of the version. It is read and updated
	// atomically by Refs, Ref, Unref and UnrefLocked.
	refs int32

	// Files holds the file metadata of every level, indexed by level.
	Files [NumLevels][]FileMetadata

	// The callback to invoke when the last reference to a version is
	// removed. Will be called with list.mu held. The argument holds the file
	// numbers whose reference count dropped to zero (see unrefFiles).
	Deleted func(obsolete []uint64)

	// The list the version is linked into. Set by VersionList.PushBack and
	// cleared by VersionList.Remove.
	list *VersionList

	// The next/prev link for the versionList doubly-linked list of versions.
	prev, next *Version
}
   164  
   165  func (v *Version) String() string {
   166  	var buf bytes.Buffer
   167  	for level := 0; level < NumLevels; level++ {
   168  		if len(v.Files[level]) == 0 {
   169  			continue
   170  		}
   171  		fmt.Fprintf(&buf, "%d:", level)
   172  		for j := range v.Files[level] {
   173  			f := &v.Files[level][j]
   174  			fmt.Fprintf(&buf, " %s-%s", f.Smallest.UserKey, f.Largest.UserKey)
   175  		}
   176  		fmt.Fprintf(&buf, "\n")
   177  	}
   178  	return buf.String()
   179  }
   180  
   181  // DebugString returns an alternative format to String() which includes
   182  // sequence number and kind information for the sstable boundaries.
   183  func (v *Version) DebugString() string {
   184  	var buf bytes.Buffer
   185  	for level := 0; level < NumLevels; level++ {
   186  		if len(v.Files[level]) == 0 {
   187  			continue
   188  		}
   189  		fmt.Fprintf(&buf, "%d:", level)
   190  		for j := range v.Files[level] {
   191  			f := &v.Files[level][j]
   192  			fmt.Fprintf(&buf, " %s-%s", f.Smallest, f.Largest)
   193  		}
   194  		fmt.Fprintf(&buf, "\n")
   195  	}
   196  	return buf.String()
   197  }
   198  
   199  // Refs returns the number of references to the version.
   200  func (v *Version) Refs() int32 {
   201  	return atomic.LoadInt32(&v.refs)
   202  }
   203  
   204  // Ref increments the version refcount.
   205  func (v *Version) Ref() {
   206  	atomic.AddInt32(&v.refs, 1)
   207  }
   208  
   209  // Unref decrements the version refcount. If the last reference to the version
   210  // was removed, the version is removed from the list of versions and the
   211  // Deleted callback is invoked. Requires that the VersionList mutex is NOT
   212  // locked.
   213  func (v *Version) Unref() {
   214  	if atomic.AddInt32(&v.refs, -1) == 0 {
   215  		obsolete := v.unrefFiles()
   216  		l := v.list
   217  		l.mu.Lock()
   218  		l.Remove(v)
   219  		v.Deleted(obsolete)
   220  		l.mu.Unlock()
   221  	}
   222  }
   223  
   224  // UnrefLocked decrements the version refcount. If the last reference to the
   225  // version was removed, the version is removed from the list of versions and
   226  // the Deleted callback is invoked. Requires that the VersionList mutex is
   227  // already locked.
   228  func (v *Version) UnrefLocked() {
   229  	if atomic.AddInt32(&v.refs, -1) == 0 {
   230  		v.list.Remove(v)
   231  		v.Deleted(v.unrefFiles())
   232  	}
   233  }
   234  
   235  func (v *Version) unrefFiles() []uint64 {
   236  	var obsolete []uint64
   237  	for _, files := range v.Files {
   238  		for i := range files {
   239  			f := &files[i]
   240  			if atomic.AddInt32(f.refs, -1) == 0 {
   241  				obsolete = append(obsolete, f.FileNum)
   242  			}
   243  		}
   244  	}
   245  	return obsolete
   246  }
   247  
// Next returns the next version in the list of versions. For the newest
// version this is the list's internal root sentinel rather than a real
// version (PushBack links the tail to the root), and it is nil once the
// version has been removed from its list.
func (v *Version) Next() *Version {
	return v.next
}
   252  
   253  // Overlaps returns all elements of v.files[level] whose user key range
   254  // intersects the inclusive range [start, end]. If level is non-zero then the
   255  // user key ranges of v.files[level] are assumed to not overlap (although they
   256  // may touch). If level is zero then that assumption cannot be made, and the
   257  // [start, end] range is expanded to the union of those matching ranges so far
   258  // and the computation is repeated until [start, end] stabilizes.
   259  func (v *Version) Overlaps(
   260  	level int, cmp Compare, start, end []byte,
   261  ) (ret []FileMetadata) {
   262  	if level == 0 {
   263  		// The sstables in level 0 can overlap with each other. As soon as we find
   264  		// one sstable that overlaps with our target range, we need to expand the
   265  		// range and find all sstables that overlap with the expanded range.
   266  	loop:
   267  		for {
   268  			for _, meta := range v.Files[level] {
   269  				smallest := meta.Smallest.UserKey
   270  				largest := meta.Largest.UserKey
   271  				if cmp(largest, start) < 0 {
   272  					// meta is completely before the specified range; skip it.
   273  					continue
   274  				}
   275  				if cmp(smallest, end) > 0 {
   276  					// meta is completely after the specified range; skip it.
   277  					continue
   278  				}
   279  				ret = append(ret, meta)
   280  
   281  				// If level == 0, check if the newly added fileMetadata has
   282  				// expanded the range. If so, restart the search.
   283  				restart := false
   284  				if cmp(smallest, start) < 0 {
   285  					start = smallest
   286  					restart = true
   287  				}
   288  				if cmp(largest, end) > 0 {
   289  					end = largest
   290  					restart = true
   291  				}
   292  				if restart {
   293  					ret = ret[:0]
   294  					continue loop
   295  				}
   296  			}
   297  			return ret
   298  		}
   299  	}
   300  
   301  	// Binary search to find the range of files which overlaps with our target
   302  	// range.
   303  	files := v.Files[level]
   304  	lower := sort.Search(len(files), func(i int) bool {
   305  		return cmp(files[i].Largest.UserKey, start) >= 0
   306  	})
   307  	upper := sort.Search(len(files), func(i int) bool {
   308  		return cmp(files[i].Smallest.UserKey, end) > 0
   309  	})
   310  	if lower >= upper {
   311  		return nil
   312  	}
   313  	return files[lower:upper]
   314  }
   315  
   316  // CheckOrdering checks that the files are consistent with respect to
   317  // increasing file numbers (for level 0 files) and increasing and non-
   318  // overlapping internal key ranges (for level non-0 files).
   319  func (v *Version) CheckOrdering(cmp Compare) error {
   320  	for level, ff := range v.Files {
   321  		if level == 0 {
   322  			for i := 1; i < len(ff); i++ {
   323  				prev := &ff[i-1]
   324  				f := &ff[i]
   325  				if prev.LargestSeqNum >= f.LargestSeqNum {
   326  					return fmt.Errorf("level 0 files are not in increasing largest seqNum order: %d, %d",
   327  						prev.LargestSeqNum, f.LargestSeqNum)
   328  				}
   329  				if prev.SmallestSeqNum >= f.SmallestSeqNum {
   330  					return fmt.Errorf("level 0 files are not in increasing smallest seqNum order: %d, %d",
   331  						prev.SmallestSeqNum, f.SmallestSeqNum)
   332  				}
   333  			}
   334  		} else {
   335  			for i := 1; i < len(ff); i++ {
   336  				prev := &ff[i-1]
   337  				f := &ff[i]
   338  				if base.InternalCompare(cmp, prev.Largest, f.Smallest) >= 0 {
   339  					return fmt.Errorf("level non-0 files are not in increasing ikey order: %s, %s\n%s",
   340  						prev.Largest, f.Smallest, v.DebugString())
   341  				}
   342  				if base.InternalCompare(cmp, f.Smallest, f.Largest) > 0 {
   343  					return fmt.Errorf("level non-0 file has inconsistent bounds: %s, %s",
   344  						f.Smallest, f.Largest)
   345  				}
   346  			}
   347  		}
   348  	}
   349  	return nil
   350  }
   351  
// VersionList holds a list of versions. The versions are ordered from oldest
// to newest.
type VersionList struct {
	// mu is the mutex supplied to Init. Version.Unref holds it while
	// unlinking a version and invoking its Deleted callback.
	mu *sync.Mutex
	// root is the sentinel node of a circular doubly-linked list: root.next
	// is the oldest version and root.prev is the newest. In an empty list
	// both links point back at root itself.
	root Version
}
   358  
   359  // Init initializes the version list.
   360  func (l *VersionList) Init(mu *sync.Mutex) {
   361  	l.mu = mu
   362  	l.root.next = &l.root
   363  	l.root.prev = &l.root
   364  }
   365  
   366  // Empty returns true if the list is empty, and false otherwise.
   367  func (l *VersionList) Empty() bool {
   368  	return l.root.next == &l.root
   369  }
   370  
   371  // Front returns the oldest version in the list. Note that this version is only
   372  // valid if Empty() returns true.
   373  func (l *VersionList) Front() *Version {
   374  	return l.root.next
   375  }
   376  
   377  // Back returns the newest version in the list. Note that this version is only
   378  // valid if Empty() returns true.
   379  func (l *VersionList) Back() *Version {
   380  	return l.root.prev
   381  }
   382  
   383  // PushBack adds a new version to the back of the list. This new version
   384  // becomes the "newest" version in the list.
   385  func (l *VersionList) PushBack(v *Version) {
   386  	if v.list != nil || v.prev != nil || v.next != nil {
   387  		panic("pebble: version list is inconsistent")
   388  	}
   389  	v.prev = l.root.prev
   390  	v.prev.next = v
   391  	v.next = &l.root
   392  	v.next.prev = v
   393  	v.list = l
   394  }
   395  
   396  // Remove removes the specified version from the list.
   397  func (l *VersionList) Remove(v *Version) {
   398  	if v == &l.root {
   399  		panic("pebble: cannot remove version list root node")
   400  	}
   401  	if v.list != l {
   402  		panic("pebble: version list is inconsistent")
   403  	}
   404  	v.prev.next = v.next
   405  	v.next.prev = v.prev
   406  	v.next = nil // avoid memory leaks
   407  	v.prev = nil // avoid memory leaks
   408  	v.list = nil // avoid memory leaks
   409  }