github.com/cockroachdb/pebble@v1.1.2/internal/manifest/version_edit.go (about)

     1  // Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package manifest
     6  
     7  import (
     8  	"bufio"
     9  	"bytes"
    10  	"encoding/binary"
    11  	"fmt"
    12  	"io"
    13  	"sort"
    14  	"time"
    15  
    16  	"github.com/cockroachdb/errors"
    17  	"github.com/cockroachdb/pebble/internal/base"
    18  	"github.com/cockroachdb/pebble/internal/invariants"
    19  )
    20  
// TODO(peter): describe the MANIFEST file format, independently of the C++
// project.

// errCorruptManifest is returned (or wrapped) whenever decoding encounters
// structurally invalid manifest data.
var errCorruptManifest = base.CorruptionErrorf("pebble: corrupt manifest")

// byteReader is the reader contract required by the version edit decoder:
// byte-at-a-time reads (for varint decoding) plus bulk reads (for keys).
type byteReader interface {
	io.ByteReader
	io.Reader
}
    30  
// Tags for the versionEdit disk format.
// Tag 8 is no longer used.
//
// NB: these values are part of the on-disk MANIFEST format (shared with
// LevelDB/RocksDB where noted) and must never be renumbered.
const (
	// LevelDB tags.
	tagComparator     = 1
	tagLogNumber      = 2
	tagNextFileNumber = 3
	tagLastSequence   = 4
	tagCompactPointer = 5
	tagDeletedFile    = 6
	tagNewFile        = 7
	tagPrevLogNumber  = 9

	// RocksDB tags.
	tagNewFile2         = 100
	tagNewFile3         = 102
	tagNewFile4         = 103
	tagColumnFamily     = 200
	tagColumnFamilyAdd  = 201
	tagColumnFamilyDrop = 202
	tagMaxColumnFamily  = 203

	// Pebble tags.
	tagNewFile5            = 104 // Range keys.
	tagCreatedBackingTable = 105
	tagRemovedBackingTable = 106

	// The custom tags sub-format used by tagNewFile4 and above.
	customTagTerminate       = 1
	customTagNeedsCompaction = 2
	customTagCreationTime    = 6
	customTagPathID          = 65
	// Custom tags with this bit set must be understood by the reader; the
	// decoder rejects unknown tags in this range (see Decode).
	customTagNonSafeIgnoreMask = 1 << 6
	customTagVirtual           = 66
)
    66  
// DeletedFileEntry holds the state for a file deletion from a level. The file
// itself might still be referenced by another level.
type DeletedFileEntry struct {
	// Level is the LSM level the file is being removed from.
	Level int
	// FileNum identifies the file being removed.
	FileNum base.FileNum
}
    73  
// NewFileEntry holds the state for a new file or one moved from a different
// level.
type NewFileEntry struct {
	// Level is the LSM level the file is being added to.
	Level int
	// Meta is the metadata of the added file.
	Meta *FileMetadata
	// BackingFileNum is only set during manifest replay, and only for virtual
	// sstables.
	BackingFileNum base.DiskFileNum
}
    83  
// VersionEdit holds the state for an edit to a Version along with other
// on-disk state (log numbers, next file number, and the last sequence number).
type VersionEdit struct {
	// ComparerName is the value of Options.Comparer.Name. This is only set in
	// the first VersionEdit in a manifest (either when the DB is created, or
	// when a new manifest is created) and is used to verify that the comparer
	// specified at Open matches the comparer that was previously used.
	ComparerName string

	// MinUnflushedLogNum is the smallest WAL log file number corresponding to
	// mutations that have not been flushed to an sstable.
	//
	// This is an optional field, and 0 represents it is not set.
	MinUnflushedLogNum base.FileNum

	// ObsoletePrevLogNum is a historic artifact from LevelDB that is not used by
	// Pebble, RocksDB, or even LevelDB. Its use in LevelDB was deprecated in
	// 6/2011. We keep it around purely for informational purposes when
	// displaying MANIFEST contents.
	ObsoletePrevLogNum uint64

	// The next file number. A single counter is used to assign file numbers
	// for the WAL, MANIFEST, sstable, and OPTIONS files.
	NextFileNum base.FileNum

	// LastSeqNum is an upper bound on the sequence numbers that have been
	// assigned in flushed WALs. Unflushed WALs (that will be replayed during
	// recovery) may contain sequence numbers greater than this value.
	LastSeqNum uint64

	// A file num may be present in both deleted files and new files when it
	// is moved from a lower level to a higher level (when the compaction
	// found that there was no overlapping file at the higher level).
	DeletedFiles map[DeletedFileEntry]*FileMetadata
	// NewFiles lists the files (and their target levels) added by this edit.
	NewFiles []NewFileEntry
	// CreatedBackingTables can be used to preserve the FileBacking associated
	// with a physical sstable. This is useful when virtual sstables in the
	// latest version are reconstructed during manifest replay, and we also need
	// to reconstruct the FileBacking which is required by these virtual
	// sstables.
	//
	// INVARIANT: The FileBacking associated with a physical sstable must only
	// be added as a backing file in the same version edit where the physical
	// sstable is first virtualized. This means that the physical sstable must
	// be present in DeletedFiles and that there must be at least one virtual
	// sstable with the same FileBacking as the physical sstable in NewFiles. A
	// file must be present in CreatedBackingTables in exactly one version edit.
	// The physical sstable associated with the FileBacking must also not be
	// present in NewFiles.
	CreatedBackingTables []*FileBacking
	// RemovedBackingTables is used to remove the FileBacking associated with a
	// virtual sstable. Note that a backing sstable can be removed as soon as
	// there are no virtual sstables in the latest version which are using the
	// backing sstable, but the backing sstable doesn't necessarily have to be
	// removed atomically with the version edit which removes the last virtual
	// sstable associated with the backing sstable. The removal can happen in a
	// future version edit.
	//
	// INVARIANT: A file must only be added to RemovedBackingTables if it was
	// added to CreateBackingTables in a prior version edit. The same version
	// edit also cannot have the same file present in both CreateBackingTables
	// and RemovedBackingTables. A file must be present in RemovedBackingTables
	// in exactly one version edit.
	RemovedBackingTables []base.DiskFileNum
}
   149  
// Decode decodes an edit from the specified reader.
//
// Note that the Decode step will not set the FileBacking for virtual sstables
// and the responsibility is left to the caller. However, the Decode step will
// populate the NewFileEntry.BackingFileNum in VersionEdit.NewFiles.
func (v *VersionEdit) Decode(r io.Reader) error {
	// Avoid an extra buffering layer if the caller's reader already supports
	// byte-at-a-time reads (needed for varint decoding).
	br, ok := r.(byteReader)
	if !ok {
		br = bufio.NewReader(r)
	}
	d := versionEditDecoder{br}
	// An edit is a sequence of (tag, payload) records terminated by EOF.
	for {
		tag, err := binary.ReadUvarint(br)
		if err == io.EOF {
			break
		}
		if err != nil {
			return err
		}
		switch tag {
		case tagComparator:
			s, err := d.readBytes()
			if err != nil {
				return err
			}
			v.ComparerName = string(s)

		case tagLogNumber:
			n, err := d.readFileNum()
			if err != nil {
				return err
			}
			v.MinUnflushedLogNum = n

		case tagNextFileNumber:
			n, err := d.readFileNum()
			if err != nil {
				return err
			}
			v.NextFileNum = n

		case tagLastSequence:
			n, err := d.readUvarint()
			if err != nil {
				return err
			}
			v.LastSeqNum = n

		case tagCompactPointer:
			// Consume and discard the (level, key) payload.
			if _, err := d.readLevel(); err != nil {
				return err
			}
			if _, err := d.readBytes(); err != nil {
				return err
			}
			// NB: RocksDB does not use compaction pointers anymore.

		case tagRemovedBackingTable:
			n, err := d.readUvarint()
			if err != nil {
				return err
			}
			v.RemovedBackingTables = append(
				v.RemovedBackingTables, base.FileNum(n).DiskFileNum(),
			)
		case tagCreatedBackingTable:
			dfn, err := d.readUvarint()
			if err != nil {
				return err
			}
			size, err := d.readUvarint()
			if err != nil {
				return err
			}
			fileBacking := &FileBacking{
				DiskFileNum: base.FileNum(dfn).DiskFileNum(),
				Size:        size,
			}
			v.CreatedBackingTables = append(v.CreatedBackingTables, fileBacking)
		case tagDeletedFile:
			level, err := d.readLevel()
			if err != nil {
				return err
			}
			fileNum, err := d.readFileNum()
			if err != nil {
				return err
			}
			if v.DeletedFiles == nil {
				v.DeletedFiles = make(map[DeletedFileEntry]*FileMetadata)
			}
			// NB: the metadata value is nil during replay; the on-disk record
			// encodes only the file number.
			v.DeletedFiles[DeletedFileEntry{level, fileNum}] = nil

		case tagNewFile, tagNewFile2, tagNewFile3, tagNewFile4, tagNewFile5:
			level, err := d.readLevel()
			if err != nil {
				return err
			}
			fileNum, err := d.readFileNum()
			if err != nil {
				return err
			}
			if tag == tagNewFile3 {
				// The pathID field appears unused in RocksDB.
				_ /* pathID */, err := d.readUvarint()
				if err != nil {
					return err
				}
			}
			size, err := d.readUvarint()
			if err != nil {
				return err
			}
			// We read the smallest / largest key bounds differently depending on
			// whether we have point, range or both types of keys present in the
			// table.
			var (
				smallestPointKey, largestPointKey []byte
				smallestRangeKey, largestRangeKey []byte
				parsedPointBounds                 bool
				boundsMarker                      byte
			)
			if tag != tagNewFile5 {
				// Range keys not present in the table. Parse the point key bounds.
				smallestPointKey, err = d.readBytes()
				if err != nil {
					return err
				}
				largestPointKey, err = d.readBytes()
				if err != nil {
					return err
				}
			} else {
				// Range keys are present in the table. Determine whether we have point
				// keys to parse, in addition to the bounds.
				boundsMarker, err = d.ReadByte()
				if err != nil {
					return err
				}
				// Parse point key bounds, if present.
				if boundsMarker&maskContainsPointKeys > 0 {
					smallestPointKey, err = d.readBytes()
					if err != nil {
						return err
					}
					largestPointKey, err = d.readBytes()
					if err != nil {
						return err
					}
					parsedPointBounds = true
				} else {
					// The table does not have point keys.
					// Sanity check: the bounds must be range keys.
					if boundsMarker&maskSmallest != 0 || boundsMarker&maskLargest != 0 {
						return base.CorruptionErrorf(
							"new-file-4-range-keys: table without point keys has point key bounds: marker=%x",
							boundsMarker,
						)
					}
				}
				// Parse range key bounds.
				smallestRangeKey, err = d.readBytes()
				if err != nil {
					return err
				}
				largestRangeKey, err = d.readBytes()
				if err != nil {
					return err
				}
			}
			// Sequence number bounds were introduced with tagNewFile2.
			var smallestSeqNum uint64
			var largestSeqNum uint64
			if tag != tagNewFile {
				smallestSeqNum, err = d.readUvarint()
				if err != nil {
					return err
				}
				largestSeqNum, err = d.readUvarint()
				if err != nil {
					return err
				}
			}
			var markedForCompaction bool
			var creationTime uint64
			// Tracks whether this entry describes a virtual sstable and, if so,
			// the file number of its backing table (resolved by the caller).
			virtualState := struct {
				virtual        bool
				backingFileNum uint64
			}{}
			if tag == tagNewFile4 || tag == tagNewFile5 {
				// tagNewFile4 and above carry a custom-tag sub-format,
				// terminated by customTagTerminate.
				for {
					customTag, err := d.readUvarint()
					if err != nil {
						return err
					}
					if customTag == customTagTerminate {
						break
					} else if customTag == customTagVirtual {
						virtualState.virtual = true
						n, err := d.readUvarint()
						if err != nil {
							return err
						}
						virtualState.backingFileNum = n
						continue
					}

					field, err := d.readBytes()
					if err != nil {
						return err
					}
					switch customTag {
					case customTagNeedsCompaction:
						if len(field) != 1 {
							return base.CorruptionErrorf("new-file4: need-compaction field wrong size")
						}
						markedForCompaction = (field[0] == 1)

					case customTagCreationTime:
						var n int
						creationTime, n = binary.Uvarint(field)
						if n != len(field) {
							return base.CorruptionErrorf("new-file4: invalid file creation time")
						}

					case customTagPathID:
						return base.CorruptionErrorf("new-file4: path-id field not supported")

					default:
						// Unknown tags are ignored unless they are in the
						// non-safe-ignore range.
						if (customTag & customTagNonSafeIgnoreMask) != 0 {
							return base.CorruptionErrorf("new-file4: custom field not supported: %d", customTag)
						}
					}
				}
			}
			m := &FileMetadata{
				FileNum:             fileNum,
				Size:                size,
				CreationTime:        int64(creationTime),
				SmallestSeqNum:      smallestSeqNum,
				LargestSeqNum:       largestSeqNum,
				MarkedForCompaction: markedForCompaction,
				Virtual:             virtualState.virtual,
			}
			if tag != tagNewFile5 { // no range keys present
				m.SmallestPointKey = base.DecodeInternalKey(smallestPointKey)
				m.LargestPointKey = base.DecodeInternalKey(largestPointKey)
				m.HasPointKeys = true
				m.Smallest, m.Largest = m.SmallestPointKey, m.LargestPointKey
				m.boundTypeSmallest, m.boundTypeLargest = boundTypePointKey, boundTypePointKey
			} else { // range keys present
				// Set point key bounds, if parsed.
				if parsedPointBounds {
					m.SmallestPointKey = base.DecodeInternalKey(smallestPointKey)
					m.LargestPointKey = base.DecodeInternalKey(largestPointKey)
					m.HasPointKeys = true
				}
				// Set range key bounds.
				m.SmallestRangeKey = base.DecodeInternalKey(smallestRangeKey)
				m.LargestRangeKey = base.DecodeInternalKey(largestRangeKey)
				m.HasRangeKeys = true
				// Set overall bounds (by default assume range keys).
				m.Smallest, m.Largest = m.SmallestRangeKey, m.LargestRangeKey
				m.boundTypeSmallest, m.boundTypeLargest = boundTypeRangeKey, boundTypeRangeKey
				// The marker overrides the default when a point key forms one
				// of the overall bounds.
				if boundsMarker&maskSmallest == maskSmallest {
					m.Smallest = m.SmallestPointKey
					m.boundTypeSmallest = boundTypePointKey
				}
				if boundsMarker&maskLargest == maskLargest {
					m.Largest = m.LargestPointKey
					m.boundTypeLargest = boundTypePointKey
				}
			}
			m.boundsSet = true
			if !virtualState.virtual {
				m.InitPhysicalBacking()
			}

			nfe := NewFileEntry{
				Level: level,
				Meta:  m,
			}
			if virtualState.virtual {
				// The FileBacking itself is resolved by the caller (see the
				// Decode doc comment).
				nfe.BackingFileNum = base.FileNum(virtualState.backingFileNum).DiskFileNum()
			}
			v.NewFiles = append(v.NewFiles, nfe)

		case tagPrevLogNumber:
			n, err := d.readUvarint()
			if err != nil {
				return err
			}
			v.ObsoletePrevLogNum = n

		case tagColumnFamily, tagColumnFamilyAdd, tagColumnFamilyDrop, tagMaxColumnFamily:
			return base.CorruptionErrorf("column families are not supported")

		default:
			return errCorruptManifest
		}
	}
	return nil
}
   452  
   453  func (v *VersionEdit) string(verbose bool, fmtKey base.FormatKey) string {
   454  	var buf bytes.Buffer
   455  	if v.ComparerName != "" {
   456  		fmt.Fprintf(&buf, "  comparer:     %s", v.ComparerName)
   457  	}
   458  	if v.MinUnflushedLogNum != 0 {
   459  		fmt.Fprintf(&buf, "  log-num:       %d\n", v.MinUnflushedLogNum)
   460  	}
   461  	if v.ObsoletePrevLogNum != 0 {
   462  		fmt.Fprintf(&buf, "  prev-log-num:  %d\n", v.ObsoletePrevLogNum)
   463  	}
   464  	if v.NextFileNum != 0 {
   465  		fmt.Fprintf(&buf, "  next-file-num: %d\n", v.NextFileNum)
   466  	}
   467  	if v.LastSeqNum != 0 {
   468  		fmt.Fprintf(&buf, "  last-seq-num:  %d\n", v.LastSeqNum)
   469  	}
   470  	entries := make([]DeletedFileEntry, 0, len(v.DeletedFiles))
   471  	for df := range v.DeletedFiles {
   472  		entries = append(entries, df)
   473  	}
   474  	sort.Slice(entries, func(i, j int) bool {
   475  		if entries[i].Level != entries[j].Level {
   476  			return entries[i].Level < entries[j].Level
   477  		}
   478  		return entries[i].FileNum < entries[j].FileNum
   479  	})
   480  	for _, df := range entries {
   481  		fmt.Fprintf(&buf, "  deleted:       L%d %s\n", df.Level, df.FileNum)
   482  	}
   483  	for _, nf := range v.NewFiles {
   484  		fmt.Fprintf(&buf, "  added:         L%d", nf.Level)
   485  		if verbose {
   486  			fmt.Fprintf(&buf, " %s", nf.Meta.DebugString(fmtKey, true /* verbose */))
   487  		} else {
   488  			fmt.Fprintf(&buf, " %s", nf.Meta.String())
   489  		}
   490  		if nf.Meta.CreationTime != 0 {
   491  			fmt.Fprintf(&buf, " (%s)",
   492  				time.Unix(nf.Meta.CreationTime, 0).UTC().Format(time.RFC3339))
   493  		}
   494  		fmt.Fprintln(&buf)
   495  	}
   496  	return buf.String()
   497  }
   498  
// DebugString is a more verbose version of String(). Use this in tests.
func (v *VersionEdit) DebugString(fmtKey base.FormatKey) string {
	return v.string(true /* verbose */, fmtKey)
}
   503  
// String implements fmt.Stringer for a VersionEdit, formatting keys with the
// default formatter and without verbose file metadata.
func (v *VersionEdit) String() string {
	return v.string(false /* verbose */, base.DefaultFormatter)
}
   508  
   509  // Encode encodes an edit to the specified writer.
   510  func (v *VersionEdit) Encode(w io.Writer) error {
   511  	e := versionEditEncoder{new(bytes.Buffer)}
   512  
   513  	if v.ComparerName != "" {
   514  		e.writeUvarint(tagComparator)
   515  		e.writeString(v.ComparerName)
   516  	}
   517  	if v.MinUnflushedLogNum != 0 {
   518  		e.writeUvarint(tagLogNumber)
   519  		e.writeUvarint(uint64(v.MinUnflushedLogNum))
   520  	}
   521  	if v.ObsoletePrevLogNum != 0 {
   522  		e.writeUvarint(tagPrevLogNumber)
   523  		e.writeUvarint(v.ObsoletePrevLogNum)
   524  	}
   525  	if v.NextFileNum != 0 {
   526  		e.writeUvarint(tagNextFileNumber)
   527  		e.writeUvarint(uint64(v.NextFileNum))
   528  	}
   529  	for _, dfn := range v.RemovedBackingTables {
   530  		e.writeUvarint(tagRemovedBackingTable)
   531  		e.writeUvarint(uint64(dfn.FileNum()))
   532  	}
   533  	for _, fileBacking := range v.CreatedBackingTables {
   534  		e.writeUvarint(tagCreatedBackingTable)
   535  		e.writeUvarint(uint64(fileBacking.DiskFileNum.FileNum()))
   536  		e.writeUvarint(fileBacking.Size)
   537  	}
   538  	// RocksDB requires LastSeqNum to be encoded for the first MANIFEST entry,
   539  	// even though its value is zero. We detect this by encoding LastSeqNum when
   540  	// ComparerName is set.
   541  	if v.LastSeqNum != 0 || v.ComparerName != "" {
   542  		e.writeUvarint(tagLastSequence)
   543  		e.writeUvarint(v.LastSeqNum)
   544  	}
   545  	for x := range v.DeletedFiles {
   546  		e.writeUvarint(tagDeletedFile)
   547  		e.writeUvarint(uint64(x.Level))
   548  		e.writeUvarint(uint64(x.FileNum))
   549  	}
   550  	for _, x := range v.NewFiles {
   551  		customFields := x.Meta.MarkedForCompaction || x.Meta.CreationTime != 0 || x.Meta.Virtual
   552  		var tag uint64
   553  		switch {
   554  		case x.Meta.HasRangeKeys:
   555  			tag = tagNewFile5
   556  		case customFields:
   557  			tag = tagNewFile4
   558  		default:
   559  			tag = tagNewFile2
   560  		}
   561  		e.writeUvarint(tag)
   562  		e.writeUvarint(uint64(x.Level))
   563  		e.writeUvarint(uint64(x.Meta.FileNum))
   564  		e.writeUvarint(x.Meta.Size)
   565  		if !x.Meta.HasRangeKeys {
   566  			// If we have no range keys, preserve the original format and write the
   567  			// smallest and largest point keys.
   568  			e.writeKey(x.Meta.SmallestPointKey)
   569  			e.writeKey(x.Meta.LargestPointKey)
   570  		} else {
   571  			// When range keys are present, we first write a marker byte that
   572  			// indicates if the table also contains point keys, in addition to how the
   573  			// overall bounds for the table should be reconstructed. This byte is
   574  			// followed by the keys themselves.
   575  			b, err := x.Meta.boundsMarker()
   576  			if err != nil {
   577  				return err
   578  			}
   579  			if err = e.WriteByte(b); err != nil {
   580  				return err
   581  			}
   582  			// Write point key bounds (if present).
   583  			if x.Meta.HasPointKeys {
   584  				e.writeKey(x.Meta.SmallestPointKey)
   585  				e.writeKey(x.Meta.LargestPointKey)
   586  			}
   587  			// Write range key bounds.
   588  			e.writeKey(x.Meta.SmallestRangeKey)
   589  			e.writeKey(x.Meta.LargestRangeKey)
   590  		}
   591  		e.writeUvarint(x.Meta.SmallestSeqNum)
   592  		e.writeUvarint(x.Meta.LargestSeqNum)
   593  		if customFields {
   594  			if x.Meta.CreationTime != 0 {
   595  				e.writeUvarint(customTagCreationTime)
   596  				var buf [binary.MaxVarintLen64]byte
   597  				n := binary.PutUvarint(buf[:], uint64(x.Meta.CreationTime))
   598  				e.writeBytes(buf[:n])
   599  			}
   600  			if x.Meta.MarkedForCompaction {
   601  				e.writeUvarint(customTagNeedsCompaction)
   602  				e.writeBytes([]byte{1})
   603  			}
   604  			if x.Meta.Virtual {
   605  				e.writeUvarint(customTagVirtual)
   606  				e.writeUvarint(uint64(x.Meta.FileBacking.DiskFileNum.FileNum()))
   607  			}
   608  			e.writeUvarint(customTagTerminate)
   609  		}
   610  	}
   611  	_, err := w.Write(e.Bytes())
   612  	return err
   613  }
   614  
// versionEditDecoder should be used to decode version edits. It wraps a
// byteReader with helpers for the varint-based wire format.
type versionEditDecoder struct {
	byteReader
}
   619  
   620  func (d versionEditDecoder) readBytes() ([]byte, error) {
   621  	n, err := d.readUvarint()
   622  	if err != nil {
   623  		return nil, err
   624  	}
   625  	s := make([]byte, n)
   626  	_, err = io.ReadFull(d, s)
   627  	if err != nil {
   628  		if err == io.ErrUnexpectedEOF {
   629  			return nil, errCorruptManifest
   630  		}
   631  		return nil, err
   632  	}
   633  	return s, nil
   634  }
   635  
   636  func (d versionEditDecoder) readLevel() (int, error) {
   637  	u, err := d.readUvarint()
   638  	if err != nil {
   639  		return 0, err
   640  	}
   641  	if u >= NumLevels {
   642  		return 0, errCorruptManifest
   643  	}
   644  	return int(u), nil
   645  }
   646  
   647  func (d versionEditDecoder) readFileNum() (base.FileNum, error) {
   648  	u, err := d.readUvarint()
   649  	if err != nil {
   650  		return 0, err
   651  	}
   652  	return base.FileNum(u), nil
   653  }
   654  
   655  func (d versionEditDecoder) readUvarint() (uint64, error) {
   656  	u, err := binary.ReadUvarint(d)
   657  	if err != nil {
   658  		if err == io.EOF {
   659  			return 0, errCorruptManifest
   660  		}
   661  		return 0, err
   662  	}
   663  	return u, nil
   664  }
   665  
// versionEditEncoder accumulates an encoded version edit in an in-memory
// buffer, providing helpers for the varint-based wire format.
type versionEditEncoder struct {
	*bytes.Buffer
}
   669  
   670  func (e versionEditEncoder) writeBytes(p []byte) {
   671  	e.writeUvarint(uint64(len(p)))
   672  	e.Write(p)
   673  }
   674  
   675  func (e versionEditEncoder) writeKey(k InternalKey) {
   676  	e.writeUvarint(uint64(k.Size()))
   677  	e.Write(k.UserKey)
   678  	buf := k.EncodeTrailer()
   679  	e.Write(buf[:])
   680  }
   681  
   682  func (e versionEditEncoder) writeString(s string) {
   683  	e.writeUvarint(uint64(len(s)))
   684  	e.WriteString(s)
   685  }
   686  
   687  func (e versionEditEncoder) writeUvarint(u uint64) {
   688  	var buf [binary.MaxVarintLen64]byte
   689  	n := binary.PutUvarint(buf[:], u)
   690  	e.Write(buf[:n])
   691  }
   692  
// BulkVersionEdit summarizes the files added and deleted from a set of version
// edits.
//
// INVARIANTS:
// No file can be added to a level more than once. This is true globally, and
// also true for all of the calls to Accumulate for a single bulk version edit.
//
// No file can be removed from a level more than once. This is true globally,
// and also true for all of the calls to Accumulate for a single bulk version
// edit.
//
// A file must not be added and removed from a given level in the same version
// edit.
//
// A file that is being removed from a level must have been added to that level
// before (in a prior version edit). Note that a given file can be deleted from
// a level and added to another level in a single version edit
type BulkVersionEdit struct {
	// Added and Deleted hold the net per-level file additions and removals,
	// keyed by file number.
	Added   [NumLevels]map[base.FileNum]*FileMetadata
	Deleted [NumLevels]map[base.FileNum]*FileMetadata

	// AddedFileBacking is a map to support lookup so that we can populate the
	// FileBacking of virtual sstables during manifest replay.
	AddedFileBacking   map[base.DiskFileNum]*FileBacking
	// RemovedFileBacking accumulates the backing tables removed across the
	// accumulated edits.
	RemovedFileBacking []base.DiskFileNum

	// AddedByFileNum maps file number to file metadata for all added files
	// from accumulated version edits. AddedByFileNum is only populated if set
	// to non-nil by a caller. It must be set to non-nil when replaying
	// version edits read from a MANIFEST (as opposed to VersionEdits
	// constructed in-memory).  While replaying a MANIFEST file,
	// VersionEdit.DeletedFiles map entries have nil values, because the
	// on-disk deletion record encodes only the file number. Accumulate
	// uses AddedByFileNum to correctly populate the BulkVersionEdit's Deleted
	// field with non-nil *FileMetadata.
	AddedByFileNum map[base.FileNum]*FileMetadata

	// MarkedForCompactionCountDiff holds the aggregated count of files
	// marked for compaction added or removed.
	MarkedForCompactionCountDiff int
}
   734  
// Accumulate adds the file addition and deletions in the specified version
// edit to the bulk edit's internal state.
//
// INVARIANTS:
// If a file is added to a given level in a call to Accumulate and then removed
// from that level in a subsequent call, the file will not be present in the
// resulting BulkVersionEdit.Deleted for that level.
//
// After accumulation of version edits, the bulk version edit may have
// information about a file which has been deleted from a level, but it may
// not have information about the same file added to the same level. The add
// could've occurred as part of a previous bulk version edit. In this case,
// the deleted file must be present in BulkVersionEdit.Deleted, at the end
// of the accumulation, because we need to decrease the refcount of the
// deleted file in Apply.
func (b *BulkVersionEdit) Accumulate(ve *VersionEdit) error {
	for df, m := range ve.DeletedFiles {
		dmap := b.Deleted[df.Level]
		if dmap == nil {
			dmap = make(map[base.FileNum]*FileMetadata)
			b.Deleted[df.Level] = dmap
		}

		if m == nil {
			// m is nil only when replaying a MANIFEST: the on-disk deletion
			// record encodes only the file number, so resolve the metadata
			// from files added by earlier accumulated edits.
			if b.AddedByFileNum == nil {
				return errors.Errorf("deleted file L%d.%s's metadata is absent and bve.AddedByFileNum is nil", df.Level, df.FileNum)
			}
			m = b.AddedByFileNum[df.FileNum]
			if m == nil {
				return base.CorruptionErrorf("pebble: file deleted L%d.%s before it was inserted", df.Level, df.FileNum)
			}
		}
		if m.MarkedForCompaction {
			b.MarkedForCompactionCountDiff--
		}
		if _, ok := b.Added[df.Level][df.FileNum]; !ok {
			dmap[df.FileNum] = m
		} else {
			// Present in b.Added for the same level: the add and delete cancel
			// out within this bulk edit, so drop the add instead of recording
			// the delete.
			delete(b.Added[df.Level], df.FileNum)
		}
	}

	// Generate state for Added backing files. Note that these must be generated
	// before we loop through the NewFiles, because we need to populate the
	// FileBackings which might be used by the NewFiles loop.
	if b.AddedFileBacking == nil {
		b.AddedFileBacking = make(map[base.DiskFileNum]*FileBacking)
	}
	for _, fb := range ve.CreatedBackingTables {
		if _, ok := b.AddedFileBacking[fb.DiskFileNum]; ok {
			// There is already a FileBacking associated with fb.DiskFileNum.
			// This should never happen. There must always be only one FileBacking
			// associated with a backing sstable.
			// NOTE(review): this panics rather than returning an error; treated
			// as a programmer-invariant violation, not input corruption.
			panic(fmt.Sprintf("pebble: duplicate file backing %s", fb.DiskFileNum.String()))
		}
		b.AddedFileBacking[fb.DiskFileNum] = fb
	}

	for _, nf := range ve.NewFiles {
		// A new file should not have been deleted in this or a preceding
		// VersionEdit at the same level (though files can move across levels).
		if dmap := b.Deleted[nf.Level]; dmap != nil {
			if _, ok := dmap[nf.Meta.FileNum]; ok {
				return base.CorruptionErrorf("pebble: file deleted L%d.%s before it was inserted", nf.Level, nf.Meta.FileNum)
			}
		}
		if nf.Meta.Virtual && nf.Meta.FileBacking == nil {
			// FileBacking for a virtual sstable must only be nil if we're performing
			// manifest replay.
			nf.Meta.FileBacking = b.AddedFileBacking[nf.BackingFileNum]
			if nf.Meta.FileBacking == nil {
				return errors.Errorf("FileBacking for virtual sstable must not be nil")
			}
		} else if nf.Meta.FileBacking == nil {
			return errors.Errorf("Added file L%d.%s's has no FileBacking", nf.Level, nf.Meta.FileNum)
		}

		if b.Added[nf.Level] == nil {
			b.Added[nf.Level] = make(map[base.FileNum]*FileMetadata)
		}
		b.Added[nf.Level][nf.Meta.FileNum] = nf.Meta
		if b.AddedByFileNum != nil {
			b.AddedByFileNum[nf.Meta.FileNum] = nf.Meta
		}
		if nf.Meta.MarkedForCompaction {
			b.MarkedForCompactionCountDiff++
		}
	}

	// Since a file can be removed from backing files in exactly one version
	// edit it is safe to just append without any de-duplication.
	b.RemovedFileBacking = append(b.RemovedFileBacking, ve.RemovedBackingTables...)

	return nil
}
   832  
   833  // AccumulateIncompleteAndApplySingleVE should be called if a single version edit
   834  // is to be applied to the provided curr Version and if the caller needs to
   835  // update the versionSet.zombieTables map. This function exists separately from
   836  // BulkVersionEdit.Apply because it is easier to reason about properties
   837  // regarding BulkVersionedit.Accumulate/Apply and zombie table generation, if we
   838  // know that exactly one version edit is being accumulated.
   839  //
   840  // Note that the version edit passed into this function may be incomplete
   841  // because compactions don't have the ref counting information necessary to
   842  // populate VersionEdit.RemovedBackingTables. This function will complete such a
   843  // version edit by populating RemovedBackingTables.
   844  //
   845  // Invariant: Any file being deleted through ve must belong to the curr Version.
   846  // We can't have a delete for some arbitrary file which does not exist in curr.
   847  func AccumulateIncompleteAndApplySingleVE(
   848  	ve *VersionEdit,
   849  	curr *Version,
   850  	cmp Compare,
   851  	formatKey base.FormatKey,
   852  	flushSplitBytes int64,
   853  	readCompactionRate int64,
   854  	backingStateMap map[base.DiskFileNum]*FileBacking,
   855  	addBackingFunc func(*FileBacking),
   856  	removeBackingFunc func(base.DiskFileNum),
   857  	orderingInvariants OrderingInvariants,
   858  ) (_ *Version, zombies map[base.DiskFileNum]uint64, _ error) {
   859  	if len(ve.RemovedBackingTables) != 0 {
   860  		panic("pebble: invalid incomplete version edit")
   861  	}
   862  	var b BulkVersionEdit
   863  	err := b.Accumulate(ve)
   864  	if err != nil {
   865  		return nil, nil, err
   866  	}
   867  	zombies = make(map[base.DiskFileNum]uint64)
   868  	v, err := b.Apply(
   869  		curr, cmp, formatKey, flushSplitBytes, readCompactionRate, zombies, orderingInvariants,
   870  	)
   871  	if err != nil {
   872  		return nil, nil, err
   873  	}
   874  
   875  	for _, s := range b.AddedFileBacking {
   876  		addBackingFunc(s)
   877  	}
   878  
   879  	for fileNum := range zombies {
   880  		if _, ok := backingStateMap[fileNum]; ok {
   881  			// This table was backing some virtual sstable in the latest version,
   882  			// but is now a zombie. We add RemovedBackingTables entries for
   883  			// these, before the version edit is written to disk.
   884  			ve.RemovedBackingTables = append(
   885  				ve.RemovedBackingTables, fileNum,
   886  			)
   887  			removeBackingFunc(fileNum)
   888  		}
   889  	}
   890  	return v, zombies, nil
   891  }
   892  
   893  // Apply applies the delta b to the current version to produce a new
   894  // version. The new version is consistent with respect to the comparer cmp.
   895  //
   896  // curr may be nil, which is equivalent to a pointer to a zero version.
   897  //
   898  // On success, if a non-nil zombies map is provided to Apply, the map is updated
   899  // with file numbers and files sizes of deleted files. These files are
   900  // considered zombies because they are no longer referenced by the returned
   901  // Version, but cannot be deleted from disk as they are still in use by the
   902  // incoming Version.
   903  func (b *BulkVersionEdit) Apply(
   904  	curr *Version,
   905  	cmp Compare,
   906  	formatKey base.FormatKey,
   907  	flushSplitBytes int64,
   908  	readCompactionRate int64,
   909  	zombies map[base.DiskFileNum]uint64,
   910  	orderingInvariants OrderingInvariants,
   911  ) (*Version, error) {
   912  	addZombie := func(state *FileBacking) {
   913  		if zombies != nil {
   914  			zombies[state.DiskFileNum] = state.Size
   915  		}
   916  	}
   917  	removeZombie := func(state *FileBacking) {
   918  		if zombies != nil {
   919  			delete(zombies, state.DiskFileNum)
   920  		}
   921  	}
   922  
   923  	v := new(Version)
   924  
   925  	// Adjust the count of files marked for compaction.
   926  	if curr != nil {
   927  		v.Stats.MarkedForCompaction = curr.Stats.MarkedForCompaction
   928  	}
   929  	v.Stats.MarkedForCompaction += b.MarkedForCompactionCountDiff
   930  	if v.Stats.MarkedForCompaction < 0 {
   931  		return nil, base.CorruptionErrorf("pebble: version marked for compaction count negative")
   932  	}
   933  
   934  	for level := range v.Levels {
   935  		if curr == nil || curr.Levels[level].tree.root == nil {
   936  			v.Levels[level] = makeLevelMetadata(cmp, level, nil /* files */)
   937  		} else {
   938  			v.Levels[level] = curr.Levels[level].clone()
   939  		}
   940  		if curr == nil || curr.RangeKeyLevels[level].tree.root == nil {
   941  			v.RangeKeyLevels[level] = makeLevelMetadata(cmp, level, nil /* files */)
   942  		} else {
   943  			v.RangeKeyLevels[level] = curr.RangeKeyLevels[level].clone()
   944  		}
   945  
   946  		if len(b.Added[level]) == 0 && len(b.Deleted[level]) == 0 {
   947  			// There are no edits on this level.
   948  			if level == 0 {
   949  				// Initialize L0Sublevels.
   950  				if curr == nil || curr.L0Sublevels == nil {
   951  					if err := v.InitL0Sublevels(cmp, formatKey, flushSplitBytes); err != nil {
   952  						return nil, errors.Wrap(err, "pebble: internal error")
   953  					}
   954  				} else {
   955  					v.L0Sublevels = curr.L0Sublevels
   956  					v.L0SublevelFiles = v.L0Sublevels.Levels
   957  				}
   958  			}
   959  			continue
   960  		}
   961  
   962  		// Some edits on this level.
   963  		lm := &v.Levels[level]
   964  		lmRange := &v.RangeKeyLevels[level]
   965  
   966  		addedFilesMap := b.Added[level]
   967  		deletedFilesMap := b.Deleted[level]
   968  		if n := v.Levels[level].Len() + len(addedFilesMap); n == 0 {
   969  			return nil, base.CorruptionErrorf(
   970  				"pebble: internal error: No current or added files but have deleted files: %d",
   971  				errors.Safe(len(deletedFilesMap)))
   972  		}
   973  
   974  		// NB: addedFilesMap may be empty. If a file is present in addedFilesMap
   975  		// for a level, it won't be present in deletedFilesMap for the same
   976  		// level.
   977  
   978  		for _, f := range deletedFilesMap {
   979  			if obsolete := v.Levels[level].remove(f); obsolete {
   980  				// Deleting a file from the B-Tree may decrement its
   981  				// reference count. However, because we cloned the
   982  				// previous level's B-Tree, this should never result in a
   983  				// file's reference count dropping to zero.
   984  				err := errors.Errorf("pebble: internal error: file L%d.%s obsolete during B-Tree removal", level, f.FileNum)
   985  				return nil, err
   986  			}
   987  			if f.HasRangeKeys {
   988  				if obsolete := v.RangeKeyLevels[level].remove(f); obsolete {
   989  					// Deleting a file from the B-Tree may decrement its
   990  					// reference count. However, because we cloned the
   991  					// previous level's B-Tree, this should never result in a
   992  					// file's reference count dropping to zero.
   993  					err := errors.Errorf("pebble: internal error: file L%d.%s obsolete during range-key B-Tree removal", level, f.FileNum)
   994  					return nil, err
   995  				}
   996  			}
   997  
   998  			// Note that a backing sst will only become a zombie if the
   999  			// references to it in the latest version is 0. We will remove the
  1000  			// backing sst from the zombie list in the next loop if one of the
  1001  			// addedFiles in any of the levels is referencing the backing sst.
  1002  			// This is possible if a physical sstable is virtualized, or if it
  1003  			// is moved.
  1004  			latestRefCount := f.LatestRefs()
  1005  			if latestRefCount <= 0 {
  1006  				// If a file is present in deletedFilesMap for a level, then it
  1007  				// must have already been added to the level previously, which
  1008  				// means that its latest ref count cannot be 0.
  1009  				err := errors.Errorf("pebble: internal error: incorrect latestRefs reference counting for file", f.FileNum)
  1010  				return nil, err
  1011  			} else if f.LatestUnref() == 0 {
  1012  				addZombie(f.FileBacking)
  1013  			}
  1014  		}
  1015  
  1016  		addedFiles := make([]*FileMetadata, 0, len(addedFilesMap))
  1017  		for _, f := range addedFilesMap {
  1018  			addedFiles = append(addedFiles, f)
  1019  		}
  1020  		// Sort addedFiles by file number. This isn't necessary, but tests which
  1021  		// replay invalid manifests check the error output, and the error output
  1022  		// depends on the order in which files are added to the btree.
  1023  		sort.Slice(addedFiles, func(i, j int) bool {
  1024  			return addedFiles[i].FileNum < addedFiles[j].FileNum
  1025  		})
  1026  
  1027  		var sm, la *FileMetadata
  1028  		for _, f := range addedFiles {
  1029  			// NB: allowedSeeks is used for read triggered compactions. It is set using
  1030  			// Options.Experimental.ReadCompactionRate which defaults to 32KB.
  1031  			var allowedSeeks int64
  1032  			if readCompactionRate != 0 {
  1033  				allowedSeeks = int64(f.Size) / readCompactionRate
  1034  			}
  1035  			if allowedSeeks < 100 {
  1036  				allowedSeeks = 100
  1037  			}
  1038  			f.AllowedSeeks.Store(allowedSeeks)
  1039  			f.InitAllowedSeeks = allowedSeeks
  1040  
  1041  			err := lm.insert(f)
  1042  			// We're adding this file to the new version, so increment the
  1043  			// latest refs count.
  1044  			f.LatestRef()
  1045  			if err != nil {
  1046  				return nil, errors.Wrap(err, "pebble")
  1047  			}
  1048  			if f.HasRangeKeys {
  1049  				err = lmRange.insert(f)
  1050  				if err != nil {
  1051  					return nil, errors.Wrap(err, "pebble")
  1052  				}
  1053  			}
  1054  			removeZombie(f.FileBacking)
  1055  			// Track the keys with the smallest and largest keys, so that we can
  1056  			// check consistency of the modified span.
  1057  			if sm == nil || base.InternalCompare(cmp, sm.Smallest, f.Smallest) > 0 {
  1058  				sm = f
  1059  			}
  1060  			if la == nil || base.InternalCompare(cmp, la.Largest, f.Largest) < 0 {
  1061  				la = f
  1062  			}
  1063  		}
  1064  
  1065  		if level == 0 {
  1066  			if curr != nil && curr.L0Sublevels != nil && len(deletedFilesMap) == 0 {
  1067  				// Flushes and ingestions that do not delete any L0 files do not require
  1068  				// a regeneration of L0Sublevels from scratch. We can instead generate
  1069  				// it incrementally.
  1070  				var err error
  1071  				// AddL0Files requires addedFiles to be sorted in seqnum order.
  1072  				SortBySeqNum(addedFiles)
  1073  				v.L0Sublevels, err = curr.L0Sublevels.AddL0Files(addedFiles, flushSplitBytes, &v.Levels[0])
  1074  				if errors.Is(err, errInvalidL0SublevelsOpt) {
  1075  					err = v.InitL0Sublevels(cmp, formatKey, flushSplitBytes)
  1076  				} else if invariants.Enabled && err == nil {
  1077  					copyOfSublevels, err := NewL0Sublevels(&v.Levels[0], cmp, formatKey, flushSplitBytes)
  1078  					if err != nil {
  1079  						panic(fmt.Sprintf("error when regenerating sublevels: %s", err))
  1080  					}
  1081  					s1 := describeSublevels(base.DefaultFormatter, false /* verbose */, copyOfSublevels.Levels)
  1082  					s2 := describeSublevels(base.DefaultFormatter, false /* verbose */, v.L0Sublevels.Levels)
  1083  					if s1 != s2 {
  1084  						panic(fmt.Sprintf("incremental L0 sublevel generation produced different output than regeneration: %s != %s", s1, s2))
  1085  					}
  1086  				}
  1087  				if err != nil {
  1088  					return nil, errors.Wrap(err, "pebble: internal error")
  1089  				}
  1090  				v.L0SublevelFiles = v.L0Sublevels.Levels
  1091  			} else if err := v.InitL0Sublevels(cmp, formatKey, flushSplitBytes); err != nil {
  1092  				return nil, errors.Wrap(err, "pebble: internal error")
  1093  			}
  1094  			if err := CheckOrdering(cmp, formatKey, Level(0), v.Levels[level].Iter(), orderingInvariants); err != nil {
  1095  				return nil, errors.Wrap(err, "pebble: internal error")
  1096  			}
  1097  			continue
  1098  		}
  1099  
  1100  		// Check consistency of the level in the vicinity of our edits.
  1101  		if sm != nil && la != nil {
  1102  			overlap := overlaps(v.Levels[level].Iter(), cmp, sm.Smallest.UserKey,
  1103  				la.Largest.UserKey, la.Largest.IsExclusiveSentinel())
  1104  			// overlap contains all of the added files. We want to ensure that
  1105  			// the added files are consistent with neighboring existing files
  1106  			// too, so reslice the overlap to pull in a neighbor on each side.
  1107  			check := overlap.Reslice(func(start, end *LevelIterator) {
  1108  				if m := start.Prev(); m == nil {
  1109  					start.Next()
  1110  				}
  1111  				if m := end.Next(); m == nil {
  1112  					end.Prev()
  1113  				}
  1114  			})
  1115  			if err := CheckOrdering(cmp, formatKey, Level(level), check.Iter(), orderingInvariants); err != nil {
  1116  				return nil, errors.Wrap(err, "pebble: internal error")
  1117  			}
  1118  		}
  1119  	}
  1120  	return v, nil
  1121  }