github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/internal/manifest/version_edit.go

// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package manifest

import (
	"bufio"
	"bytes"
	stdcmp "cmp"
	"encoding/binary"
	"fmt"
	"io"
	"slices"
	"time"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/invariants"
)

// TODO(peter): describe the MANIFEST file format, independently of the C++
// project.
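//
// In the meantime, a hedged summary derived from the Encode and Decode code
// below: a MANIFEST is a log of VersionEdit records. Each record is a
// sequence of (tag, value) pairs, where tags are uvarints and values are,
// with tag-specific exceptions, either uvarints or length-prefixed byte
// strings; decoding stops when the record is exhausted.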

var errCorruptManifest = base.CorruptionErrorf("pebble: corrupt manifest")

type byteReader interface {
	io.ByteReader
	io.Reader
}

// Tags for the versionEdit disk format.
// Tag 8 is no longer used.
const (
	// LevelDB tags.
	tagComparator     = 1
	tagLogNumber      = 2
	tagNextFileNumber = 3
	tagLastSequence   = 4
	tagCompactPointer = 5
	tagDeletedFile    = 6
	tagNewFile        = 7
	tagPrevLogNumber  = 9

	// RocksDB tags.
	tagNewFile2         = 100
	tagNewFile3         = 102
	tagNewFile4         = 103
	tagColumnFamily     = 200
	tagColumnFamilyAdd  = 201
	tagColumnFamilyDrop = 202
	tagMaxColumnFamily  = 203

	// Pebble tags.
	tagNewFile5            = 104 // Range keys.
	tagCreatedBackingTable = 105
	tagRemovedBackingTable = 106

	// The custom tags sub-format used by tagNewFile4 and above. A custom tag
	// whose value has the customTagNonSafeIgnoreMask bit (1<<6) set cannot be
	// safely ignored by a decoder that does not understand it; an unknown tag
	// without this bit set is read as a length-prefixed field and ignored.
	customTagTerminate         = 1
	customTagNeedsCompaction   = 2
	customTagCreationTime      = 6
	customTagPathID            = 65
	customTagNonSafeIgnoreMask = 1 << 6
	customTagVirtual           = 66
)
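
// For illustration, a hedged sketch (derived from Encode below, not part of
// the original source) of the custom field block for a virtual sstable that
// is also marked for compaction:
//
//	customTagNeedsCompaction, <len=1>, 0x01,
//	customTagVirtual, <backing disk file number uvarint>,
//	customTagTerminate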

// DeletedFileEntry holds the state for a file deletion from a level. The file
// itself might still be referenced by another level.
type DeletedFileEntry struct {
	Level   int
	FileNum base.FileNum
}

// NewFileEntry holds the state for a new file or one moved from a different
// level.
type NewFileEntry struct {
	Level int
	Meta  *FileMetadata
	// BackingFileNum is only set during manifest replay, and only for virtual
	// sstables.
	BackingFileNum base.DiskFileNum
}

// VersionEdit holds the state for an edit to a Version along with other
// on-disk state (log numbers, next file number, and the last sequence number).
type VersionEdit struct {
	// ComparerName is the value of Options.Comparer.Name. This is only set in
	// the first VersionEdit in a manifest (either when the DB is created, or
	// when a new manifest is created) and is used to verify that the comparer
	// specified at Open matches the comparer that was previously used.
	ComparerName string

	// MinUnflushedLogNum is the smallest WAL log file number corresponding to
	// mutations that have not been flushed to an sstable.
	//
	// This is an optional field; 0 means it is not set.
	MinUnflushedLogNum base.DiskFileNum

	// ObsoletePrevLogNum is a historic artifact from LevelDB that is not used by
	// Pebble, RocksDB, or even LevelDB. Its use in LevelDB was deprecated in
	// 6/2011. We keep it around purely for informational purposes when
	// displaying MANIFEST contents.
	ObsoletePrevLogNum uint64

	// The next file number. A single counter is used to assign file numbers
	// for the WAL, MANIFEST, sstable, and OPTIONS files.
	NextFileNum uint64

	// LastSeqNum is an upper bound on the sequence numbers that have been
	// assigned in flushed WALs. Unflushed WALs (that will be replayed during
	// recovery) may contain sequence numbers greater than this value.
	LastSeqNum uint64

	// A file num may be present in both deleted files and new files when it
	// is moved from a lower level to a higher level (when the compaction
	// found that there was no overlapping file at the higher level).
	DeletedFiles map[DeletedFileEntry]*FileMetadata
	NewFiles     []NewFileEntry
	// CreatedBackingTables can be used to preserve the FileBacking associated
	// with a physical sstable. This is useful when virtual sstables in the
	// latest version are reconstructed during manifest replay, and we also need
	// to reconstruct the FileBacking which is required by these virtual
	// sstables.
	//
	// INVARIANT: The FileBacking associated with a physical sstable must only
	// be added as a backing file in the same version edit where the physical
	// sstable is first virtualized. This means that the physical sstable must
	// be present in DeletedFiles and that there must be at least one virtual
	// sstable with the same FileBacking as the physical sstable in NewFiles. A
	// file must be present in CreatedBackingTables in exactly one version edit.
	// The physical sstable associated with the FileBacking must also not be
	// present in NewFiles.
	CreatedBackingTables []*FileBacking
	// RemovedBackingTables is used to remove the FileBacking associated with a
	// virtual sstable. Note that a backing sstable can be removed as soon as
	// there are no virtual sstables in the latest version which are using the
	// backing sstable, but the backing sstable doesn't necessarily have to be
	// removed atomically with the version edit which removes the last virtual
	// sstable associated with the backing sstable. The removal can happen in a
	// future version edit.
	//
	// INVARIANT: A file must only be added to RemovedBackingTables if it was
	// added to CreatedBackingTables in a prior version edit. The same version
	// edit also cannot have the same file present in both CreatedBackingTables
	// and RemovedBackingTables. A file must be present in RemovedBackingTables
	// in exactly one version edit.
	RemovedBackingTables []base.DiskFileNum
}
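
// A hedged construction sketch (not part of the original source): a flush
// that adds a single sstable to L0 would populate a VersionEdit roughly as
//
//	ve := &VersionEdit{
//		MinUnflushedLogNum: minUnflushedLogNum, // hypothetical values
//		LastSeqNum:         lastSeqNum,         // supplied by the caller
//		NewFiles:           []NewFileEntry{{Level: 0, Meta: meta}},
//	}
//
// with DeletedFiles left nil, since a flush does not delete any files.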

// Decode decodes an edit from the specified reader.
//
// Note that Decode will not set the FileBacking for virtual sstables; that
// responsibility is left to the caller. However, Decode will populate the
// NewFileEntry.BackingFileNum field in VersionEdit.NewFiles.
func (v *VersionEdit) Decode(r io.Reader) error {
	br, ok := r.(byteReader)
	if !ok {
		br = bufio.NewReader(r)
	}
	d := versionEditDecoder{br}
	for {
		tag, err := binary.ReadUvarint(br)
		if err == io.EOF {
			break
		}
		if err != nil {
			return err
		}
		switch tag {
		case tagComparator:
			s, err := d.readBytes()
			if err != nil {
				return err
			}
			v.ComparerName = string(s)

		case tagLogNumber:
			n, err := d.readUvarint()
			if err != nil {
				return err
			}
			v.MinUnflushedLogNum = base.DiskFileNum(n)

		case tagNextFileNumber:
			n, err := d.readUvarint()
			if err != nil {
				return err
			}
			v.NextFileNum = n

		case tagLastSequence:
			n, err := d.readUvarint()
			if err != nil {
				return err
			}
			v.LastSeqNum = n

		case tagCompactPointer:
			if _, err := d.readLevel(); err != nil {
				return err
			}
			if _, err := d.readBytes(); err != nil {
				return err
			}
			// NB: RocksDB does not use compaction pointers anymore.

		case tagRemovedBackingTable:
			n, err := d.readUvarint()
			if err != nil {
				return err
			}
			v.RemovedBackingTables = append(
				v.RemovedBackingTables, base.FileNum(n).DiskFileNum(),
			)
		case tagCreatedBackingTable:
			dfn, err := d.readUvarint()
			if err != nil {
				return err
			}
			size, err := d.readUvarint()
			if err != nil {
				return err
			}
			fileBacking := &FileBacking{
				DiskFileNum: base.FileNum(dfn).DiskFileNum(),
				Size:        size,
			}
			v.CreatedBackingTables = append(v.CreatedBackingTables, fileBacking)
		case tagDeletedFile:
			level, err := d.readLevel()
			if err != nil {
				return err
			}
			fileNum, err := d.readFileNum()
			if err != nil {
				return err
			}
			if v.DeletedFiles == nil {
				v.DeletedFiles = make(map[DeletedFileEntry]*FileMetadata)
			}
			v.DeletedFiles[DeletedFileEntry{level, fileNum}] = nil

		case tagNewFile, tagNewFile2, tagNewFile3, tagNewFile4, tagNewFile5:
			level, err := d.readLevel()
			if err != nil {
				return err
			}
			fileNum, err := d.readFileNum()
			if err != nil {
				return err
			}
			if tag == tagNewFile3 {
				// The pathID field appears unused in RocksDB.
				_ /* pathID */, err := d.readUvarint()
				if err != nil {
					return err
				}
			}
			size, err := d.readUvarint()
			if err != nil {
				return err
			}
			// We read the smallest / largest key bounds differently depending
			// on whether the table contains point keys, range keys, or both.
			// For tagNewFile5, a marker byte encodes whether point keys are
			// present (maskContainsPointKeys) and whether the overall smallest
			// and largest bounds are point keys (maskSmallest, maskLargest).
			var (
				smallestPointKey, largestPointKey []byte
				smallestRangeKey, largestRangeKey []byte
				parsedPointBounds                 bool
				boundsMarker                      byte
			)
			if tag != tagNewFile5 {
				// Range keys not present in the table. Parse the point key bounds.
				smallestPointKey, err = d.readBytes()
				if err != nil {
					return err
				}
				largestPointKey, err = d.readBytes()
				if err != nil {
					return err
				}
			} else {
				// Range keys are present in the table. Determine whether we have point
				// keys to parse, in addition to the bounds.
				boundsMarker, err = d.ReadByte()
				if err != nil {
					return err
				}
				// Parse point key bounds, if present.
				if boundsMarker&maskContainsPointKeys > 0 {
					smallestPointKey, err = d.readBytes()
					if err != nil {
						return err
					}
					largestPointKey, err = d.readBytes()
					if err != nil {
						return err
					}
					parsedPointBounds = true
				} else {
					// The table does not have point keys.
					// Sanity check: the bounds must be range keys.
					if boundsMarker&maskSmallest != 0 || boundsMarker&maskLargest != 0 {
						return base.CorruptionErrorf(
							"new-file-4-range-keys: table without point keys has point key bounds: marker=%x",
							boundsMarker,
						)
					}
				}
				// Parse range key bounds.
				smallestRangeKey, err = d.readBytes()
				if err != nil {
					return err
				}
				largestRangeKey, err = d.readBytes()
				if err != nil {
					return err
				}
			}
			var smallestSeqNum uint64
			var largestSeqNum uint64
			if tag != tagNewFile {
				smallestSeqNum, err = d.readUvarint()
				if err != nil {
					return err
				}
				largestSeqNum, err = d.readUvarint()
				if err != nil {
					return err
				}
			}
			var markedForCompaction bool
			var creationTime uint64
			virtualState := struct {
				virtual        bool
				backingFileNum uint64
			}{}
			if tag == tagNewFile4 || tag == tagNewFile5 {
				for {
					customTag, err := d.readUvarint()
					if err != nil {
						return err
					}
					if customTag == customTagTerminate {
						break
					} else if customTag == customTagVirtual {
						virtualState.virtual = true
						n, err := d.readUvarint()
						if err != nil {
							return err
						}
						virtualState.backingFileNum = n
						continue
					}

					field, err := d.readBytes()
					if err != nil {
						return err
					}
					switch customTag {
					case customTagNeedsCompaction:
						if len(field) != 1 {
							return base.CorruptionErrorf("new-file4: need-compaction field wrong size")
						}
						markedForCompaction = (field[0] == 1)

					case customTagCreationTime:
						var n int
						creationTime, n = binary.Uvarint(field)
						if n != len(field) {
							return base.CorruptionErrorf("new-file4: invalid file creation time")
						}

					case customTagPathID:
						return base.CorruptionErrorf("new-file4: path-id field not supported")

					default:
						if (customTag & customTagNonSafeIgnoreMask) != 0 {
							return base.CorruptionErrorf("new-file4: custom field not supported: %d", customTag)
						}
					}
				}
			}
			m := &FileMetadata{
				FileNum:             fileNum,
				Size:                size,
				CreationTime:        int64(creationTime),
				SmallestSeqNum:      smallestSeqNum,
				LargestSeqNum:       largestSeqNum,
				MarkedForCompaction: markedForCompaction,
				Virtual:             virtualState.virtual,
			}
			if tag != tagNewFile5 { // no range keys present
				m.SmallestPointKey = base.DecodeInternalKey(smallestPointKey)
				m.LargestPointKey = base.DecodeInternalKey(largestPointKey)
				m.HasPointKeys = true
				m.Smallest, m.Largest = m.SmallestPointKey, m.LargestPointKey
				m.boundTypeSmallest, m.boundTypeLargest = boundTypePointKey, boundTypePointKey
			} else { // range keys present
				// Set point key bounds, if parsed.
				if parsedPointBounds {
					m.SmallestPointKey = base.DecodeInternalKey(smallestPointKey)
					m.LargestPointKey = base.DecodeInternalKey(largestPointKey)
					m.HasPointKeys = true
				}
				// Set range key bounds.
				m.SmallestRangeKey = base.DecodeInternalKey(smallestRangeKey)
				m.LargestRangeKey = base.DecodeInternalKey(largestRangeKey)
				m.HasRangeKeys = true
				// Set overall bounds (by default assume range keys).
				m.Smallest, m.Largest = m.SmallestRangeKey, m.LargestRangeKey
				m.boundTypeSmallest, m.boundTypeLargest = boundTypeRangeKey, boundTypeRangeKey
				if boundsMarker&maskSmallest == maskSmallest {
					m.Smallest = m.SmallestPointKey
					m.boundTypeSmallest = boundTypePointKey
				}
				if boundsMarker&maskLargest == maskLargest {
					m.Largest = m.LargestPointKey
					m.boundTypeLargest = boundTypePointKey
				}
			}
			m.boundsSet = true
			if !virtualState.virtual {
				m.InitPhysicalBacking()
			}

			nfe := NewFileEntry{
				Level: level,
				Meta:  m,
			}
			if virtualState.virtual {
				nfe.BackingFileNum = base.FileNum(virtualState.backingFileNum).DiskFileNum()
			}
			v.NewFiles = append(v.NewFiles, nfe)

		case tagPrevLogNumber:
			n, err := d.readUvarint()
			if err != nil {
				return err
			}
			v.ObsoletePrevLogNum = n

		case tagColumnFamily, tagColumnFamilyAdd, tagColumnFamilyDrop, tagMaxColumnFamily:
			return base.CorruptionErrorf("column families are not supported")

		default:
			return errCorruptManifest
		}
	}
	return nil
}
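
// A hedged usage sketch (not part of the original source): MANIFEST replay
// typically iterates over the records of pebble's internal record.Reader
// (assumed here as rr) and decodes each record into a VersionEdit:
//
//	for {
//		r, err := rr.Next()
//		if err == io.EOF {
//			break
//		}
//		if err != nil {
//			return err
//		}
//		var ve VersionEdit
//		if err := ve.Decode(r); err != nil {
//			return err
//		}
//		// Accumulate ve; see BulkVersionEdit.Accumulate below.
//	}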

func (v *VersionEdit) string(verbose bool, fmtKey base.FormatKey) string {
	var buf bytes.Buffer
	if v.ComparerName != "" {
		fmt.Fprintf(&buf, "  comparer:      %s\n", v.ComparerName)
	}
	if v.MinUnflushedLogNum != 0 {
		fmt.Fprintf(&buf, "  log-num:       %d\n", v.MinUnflushedLogNum)
	}
	if v.ObsoletePrevLogNum != 0 {
		fmt.Fprintf(&buf, "  prev-log-num:  %d\n", v.ObsoletePrevLogNum)
	}
	if v.NextFileNum != 0 {
		fmt.Fprintf(&buf, "  next-file-num: %d\n", v.NextFileNum)
	}
	if v.LastSeqNum != 0 {
		fmt.Fprintf(&buf, "  last-seq-num:  %d\n", v.LastSeqNum)
	}
	entries := make([]DeletedFileEntry, 0, len(v.DeletedFiles))
	for df := range v.DeletedFiles {
		entries = append(entries, df)
	}
	slices.SortFunc(entries, func(a, b DeletedFileEntry) int {
		if v := stdcmp.Compare(a.Level, b.Level); v != 0 {
			return v
		}
		return stdcmp.Compare(a.FileNum, b.FileNum)
	})
	for _, df := range entries {
		fmt.Fprintf(&buf, "  deleted:       L%d %s\n", df.Level, df.FileNum)
	}
	for _, nf := range v.NewFiles {
		fmt.Fprintf(&buf, "  added:         L%d", nf.Level)
		if verbose {
			fmt.Fprintf(&buf, " %s", nf.Meta.DebugString(fmtKey, true /* verbose */))
		} else {
			fmt.Fprintf(&buf, " %s", nf.Meta.String())
		}
		if nf.Meta.CreationTime != 0 {
			fmt.Fprintf(&buf, " (%s)",
				time.Unix(nf.Meta.CreationTime, 0).UTC().Format(time.RFC3339))
		}
		fmt.Fprintln(&buf)
	}
	return buf.String()
}

// DebugString is a more verbose version of String(). Use this in tests.
func (v *VersionEdit) DebugString(fmtKey base.FormatKey) string {
	return v.string(true /* verbose */, fmtKey)
}

// String implements fmt.Stringer for a VersionEdit.
func (v *VersionEdit) String() string {
	return v.string(false /* verbose */, base.DefaultFormatter)
}

// Encode encodes an edit to the specified writer.
func (v *VersionEdit) Encode(w io.Writer) error {
	e := versionEditEncoder{new(bytes.Buffer)}

	if v.ComparerName != "" {
		e.writeUvarint(tagComparator)
		e.writeString(v.ComparerName)
	}
	if v.MinUnflushedLogNum != 0 {
		e.writeUvarint(tagLogNumber)
		e.writeUvarint(uint64(v.MinUnflushedLogNum))
	}
	if v.ObsoletePrevLogNum != 0 {
		e.writeUvarint(tagPrevLogNumber)
		e.writeUvarint(v.ObsoletePrevLogNum)
	}
	if v.NextFileNum != 0 {
		e.writeUvarint(tagNextFileNumber)
		e.writeUvarint(uint64(v.NextFileNum))
	}
	for _, dfn := range v.RemovedBackingTables {
		e.writeUvarint(tagRemovedBackingTable)
		e.writeUvarint(uint64(dfn.FileNum()))
	}
	for _, fileBacking := range v.CreatedBackingTables {
		e.writeUvarint(tagCreatedBackingTable)
		e.writeUvarint(uint64(fileBacking.DiskFileNum.FileNum()))
		e.writeUvarint(fileBacking.Size)
	}
	// RocksDB requires LastSeqNum to be encoded for the first MANIFEST entry,
	// even when its value is zero. We approximate "first entry" by encoding
	// LastSeqNum whenever ComparerName is set, since ComparerName is only set
	// in the first VersionEdit of a MANIFEST.
	if v.LastSeqNum != 0 || v.ComparerName != "" {
		e.writeUvarint(tagLastSequence)
		e.writeUvarint(v.LastSeqNum)
	}
	for x := range v.DeletedFiles {
		e.writeUvarint(tagDeletedFile)
		e.writeUvarint(uint64(x.Level))
		e.writeUvarint(uint64(x.FileNum))
	}
	for _, x := range v.NewFiles {
		customFields := x.Meta.MarkedForCompaction || x.Meta.CreationTime != 0 || x.Meta.Virtual
		var tag uint64
		switch {
		case x.Meta.HasRangeKeys:
			tag = tagNewFile5
		case customFields:
			tag = tagNewFile4
		default:
			tag = tagNewFile2
		}
		e.writeUvarint(tag)
		e.writeUvarint(uint64(x.Level))
		e.writeUvarint(uint64(x.Meta.FileNum))
		e.writeUvarint(x.Meta.Size)
		if !x.Meta.HasRangeKeys {
			// If we have no range keys, preserve the original format and write the
			// smallest and largest point keys.
			e.writeKey(x.Meta.SmallestPointKey)
			e.writeKey(x.Meta.LargestPointKey)
		} else {
			// When range keys are present, we first write a marker byte that
			// indicates if the table also contains point keys, in addition to how the
			// overall bounds for the table should be reconstructed. This byte is
			// followed by the keys themselves.
			b, err := x.Meta.boundsMarker()
			if err != nil {
				return err
			}
			if err = e.WriteByte(b); err != nil {
				return err
			}
			// Write point key bounds (if present).
			if x.Meta.HasPointKeys {
				e.writeKey(x.Meta.SmallestPointKey)
				e.writeKey(x.Meta.LargestPointKey)
			}
			// Write range key bounds.
			e.writeKey(x.Meta.SmallestRangeKey)
			e.writeKey(x.Meta.LargestRangeKey)
		}
		e.writeUvarint(x.Meta.SmallestSeqNum)
		e.writeUvarint(x.Meta.LargestSeqNum)
		if customFields {
			if x.Meta.CreationTime != 0 {
				e.writeUvarint(customTagCreationTime)
				var buf [binary.MaxVarintLen64]byte
				n := binary.PutUvarint(buf[:], uint64(x.Meta.CreationTime))
				e.writeBytes(buf[:n])
			}
			if x.Meta.MarkedForCompaction {
				e.writeUvarint(customTagNeedsCompaction)
				e.writeBytes([]byte{1})
			}
			if x.Meta.Virtual {
				e.writeUvarint(customTagVirtual)
				e.writeUvarint(uint64(x.Meta.FileBacking.DiskFileNum.FileNum()))
			}
			e.writeUvarint(customTagTerminate)
		}
	}
	_, err := w.Write(e.Bytes())
	return err
}
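
// A minimal round-trip sketch (not part of the original source), assuming ve
// is a fully-populated *VersionEdit:
//
//	var buf bytes.Buffer
//	if err := ve.Encode(&buf); err != nil {
//		return err
//	}
//	var decoded VersionEdit
//	if err := decoded.Decode(&buf); err != nil {
//		return err
//	}
//
// Per Decode's contract, decoded will carry NewFileEntry.BackingFileNum for
// virtual sstables, but the caller must reconstruct each FileBacking itself.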

// versionEditDecoder should be used to decode version edits.
type versionEditDecoder struct {
	byteReader
}

func (d versionEditDecoder) readBytes() ([]byte, error) {
	n, err := d.readUvarint()
	if err != nil {
		return nil, err
	}
	s := make([]byte, n)
	_, err = io.ReadFull(d, s)
	if err != nil {
		if err == io.ErrUnexpectedEOF {
			return nil, errCorruptManifest
		}
		return nil, err
	}
	return s, nil
}

func (d versionEditDecoder) readLevel() (int, error) {
	u, err := d.readUvarint()
	if err != nil {
		return 0, err
	}
	if u >= NumLevels {
		return 0, errCorruptManifest
	}
	return int(u), nil
}

func (d versionEditDecoder) readFileNum() (base.FileNum, error) {
	u, err := d.readUvarint()
	if err != nil {
		return 0, err
	}
	return base.FileNum(u), nil
}

func (d versionEditDecoder) readUvarint() (uint64, error) {
	u, err := binary.ReadUvarint(d)
	if err != nil {
		if err == io.EOF {
			return 0, errCorruptManifest
		}
		return 0, err
	}
	return u, nil
}

type versionEditEncoder struct {
	*bytes.Buffer
}

func (e versionEditEncoder) writeBytes(p []byte) {
	e.writeUvarint(uint64(len(p)))
	e.Write(p)
}

func (e versionEditEncoder) writeKey(k InternalKey) {
	e.writeUvarint(uint64(k.Size()))
	e.Write(k.UserKey)
	buf := k.EncodeTrailer()
	e.Write(buf[:])
}

func (e versionEditEncoder) writeString(s string) {
	e.writeUvarint(uint64(len(s)))
	e.WriteString(s)
}

func (e versionEditEncoder) writeUvarint(u uint64) {
	var buf [binary.MaxVarintLen64]byte
	n := binary.PutUvarint(buf[:], u)
	e.Write(buf[:n])
}

// BulkVersionEdit summarizes the files added and deleted from a set of version
// edits.
//
// INVARIANTS:
// No file can be added to a level more than once. This is true globally, and
// also true for all of the calls to Accumulate for a single bulk version edit.
//
// No file can be removed from a level more than once. This is true globally,
// and also true for all of the calls to Accumulate for a single bulk version
// edit.
//
// A file must not be added and removed from a given level in the same version
// edit.
//
// A file that is being removed from a level must have been added to that level
// before (in a prior version edit). Note that a given file can be deleted from
// a level and added to another level in a single version edit.
type BulkVersionEdit struct {
	Added   [NumLevels]map[base.FileNum]*FileMetadata
	Deleted [NumLevels]map[base.FileNum]*FileMetadata

	// AddedFileBacking is a map to support lookup so that we can populate the
	// FileBacking of virtual sstables during manifest replay.
	AddedFileBacking   map[base.DiskFileNum]*FileBacking
	RemovedFileBacking []base.DiskFileNum

	// AddedByFileNum maps file number to file metadata for all added files
	// from accumulated version edits. AddedByFileNum is only populated if set
	// to non-nil by a caller. It must be set to non-nil when replaying
	// version edits read from a MANIFEST (as opposed to VersionEdits
	// constructed in-memory). While replaying a MANIFEST file,
	// VersionEdit.DeletedFiles map entries have nil values, because the
	// on-disk deletion record encodes only the file number. Accumulate
	// uses AddedByFileNum to correctly populate the BulkVersionEdit's Deleted
	// field with non-nil *FileMetadata.
	AddedByFileNum map[base.FileNum]*FileMetadata

	// MarkedForCompactionCountDiff holds the aggregated count of files
	// marked for compaction added or removed.
	MarkedForCompactionCountDiff int
}

// Accumulate adds the file additions and deletions in the specified version
// edit to the bulk edit's internal state.
//
// INVARIANTS:
// If a file is added to a given level in a call to Accumulate and then removed
// from that level in a subsequent call, the file will not be present in the
// resulting BulkVersionEdit.Deleted for that level.
//
// After accumulation of version edits, the bulk version edit may have
// information about a file which has been deleted from a level, but it may
// not have information about the same file added to the same level. The add
// could've occurred as part of a previous bulk version edit. In this case,
// the deleted file must be present in BulkVersionEdit.Deleted, at the end
// of the accumulation, because we need to decrease the refcount of the
// deleted file in Apply.
func (b *BulkVersionEdit) Accumulate(ve *VersionEdit) error {
	for df, m := range ve.DeletedFiles {
		dmap := b.Deleted[df.Level]
		if dmap == nil {
			dmap = make(map[base.FileNum]*FileMetadata)
			b.Deleted[df.Level] = dmap
		}

		if m == nil {
			// m is nil only when replaying a MANIFEST.
			if b.AddedByFileNum == nil {
				return errors.Errorf("deleted file L%d.%s's metadata is absent and bve.AddedByFileNum is nil", df.Level, df.FileNum)
			}
			m = b.AddedByFileNum[df.FileNum]
			if m == nil {
				return base.CorruptionErrorf("pebble: file deleted L%d.%s before it was inserted", df.Level, df.FileNum)
			}
		}
		if m.MarkedForCompaction {
			b.MarkedForCompactionCountDiff--
		}
		if _, ok := b.Added[df.Level][df.FileNum]; !ok {
			dmap[df.FileNum] = m
		} else {
			// Present in b.Added for the same level.
			delete(b.Added[df.Level], df.FileNum)
		}
	}

	// Generate state for Added backing files. Note that these must be generated
	// before we loop through the NewFiles, because we need to populate the
	// FileBackings which might be used by the NewFiles loop.
	if b.AddedFileBacking == nil {
		b.AddedFileBacking = make(map[base.DiskFileNum]*FileBacking)
	}
	for _, fb := range ve.CreatedBackingTables {
		if _, ok := b.AddedFileBacking[fb.DiskFileNum]; ok {
			// There is already a FileBacking associated with fb.DiskFileNum.
			// This should never happen. There must always be only one FileBacking
			// associated with a backing sstable.
			panic(fmt.Sprintf("pebble: duplicate file backing %s", fb.DiskFileNum.String()))
		}
		b.AddedFileBacking[fb.DiskFileNum] = fb
	}

	for _, nf := range ve.NewFiles {
		// A new file should not have been deleted in this or a preceding
		// VersionEdit at the same level (though files can move across levels).
		if dmap := b.Deleted[nf.Level]; dmap != nil {
			if _, ok := dmap[nf.Meta.FileNum]; ok {
				return base.CorruptionErrorf("pebble: file deleted L%d.%s before it was inserted", nf.Level, nf.Meta.FileNum)
			}
		}
		if nf.Meta.Virtual && nf.Meta.FileBacking == nil {
			// FileBacking for a virtual sstable must only be nil if we're performing
			// manifest replay.
			nf.Meta.FileBacking = b.AddedFileBacking[nf.BackingFileNum]
			if nf.Meta.FileBacking == nil {
				return errors.Errorf("FileBacking for virtual sstable must not be nil")
			}
		} else if nf.Meta.FileBacking == nil {
			return errors.Errorf("Added file L%d.%s has no FileBacking", nf.Level, nf.Meta.FileNum)
		}

		if b.Added[nf.Level] == nil {
			b.Added[nf.Level] = make(map[base.FileNum]*FileMetadata)
		}
		b.Added[nf.Level][nf.Meta.FileNum] = nf.Meta
		if b.AddedByFileNum != nil {
			b.AddedByFileNum[nf.Meta.FileNum] = nf.Meta
		}
		if nf.Meta.MarkedForCompaction {
			b.MarkedForCompactionCountDiff++
		}
	}

	// Since a file can be removed from backing files in exactly one version
	// edit, it is safe to just append without any de-duplication.
	b.RemovedFileBacking = append(b.RemovedFileBacking, ve.RemovedBackingTables...)

	return nil
}
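
// A hedged replay sketch (not part of the original source): during MANIFEST
// replay, the caller decodes each version edit, accumulates all of them into
// a single BulkVersionEdit, and applies it once. The variables below (edits,
// cmp, formatKey, flushSplitBytes, readCompactionRate, orderingInvariants)
// are assumed to be in scope:
//
//	bve := BulkVersionEdit{
//		// Required during replay so deleted files can be resolved to metadata.
//		AddedByFileNum: make(map[base.FileNum]*FileMetadata),
//	}
//	for _, ve := range edits {
//		if err := bve.Accumulate(ve); err != nil {
//			return err
//		}
//	}
//	v, err := bve.Apply(nil /* curr */, cmp, formatKey, flushSplitBytes,
//		readCompactionRate, nil /* zombies */, orderingInvariants)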

// AccumulateIncompleteAndApplySingleVE should be called if a single version
// edit is to be applied to the provided curr Version and the caller needs to
// update the versionSet.zombieTables map. This function exists separately from
// BulkVersionEdit.Apply because it is easier to reason about properties of
// BulkVersionEdit.Accumulate/Apply and zombie table generation when we know
// that exactly one version edit is being accumulated.
//
// Note that the version edit passed into this function may be incomplete
// because compactions don't have the ref counting information necessary to
// populate VersionEdit.RemovedBackingTables. This function will complete such a
// version edit by populating RemovedBackingTables.
//
// Invariant: Any file being deleted through ve must belong to the curr Version.
// We can't have a delete for some arbitrary file which does not exist in curr.
func AccumulateIncompleteAndApplySingleVE(
	ve *VersionEdit,
	curr *Version,
	cmp Compare,
	formatKey base.FormatKey,
	flushSplitBytes int64,
	readCompactionRate int64,
	backingStateMap map[base.DiskFileNum]*FileBacking,
	addBackingFunc func(*FileBacking),
	removeBackingFunc func(base.DiskFileNum),
	orderingInvariants OrderingInvariants,
) (_ *Version, zombies map[base.DiskFileNum]uint64, _ error) {
	if len(ve.RemovedBackingTables) != 0 {
		panic("pebble: invalid incomplete version edit")
	}
	var b BulkVersionEdit
	err := b.Accumulate(ve)
	if err != nil {
		return nil, nil, err
	}
	zombies = make(map[base.DiskFileNum]uint64)
	v, err := b.Apply(
		curr, cmp, formatKey, flushSplitBytes, readCompactionRate, zombies, orderingInvariants,
	)
	if err != nil {
		return nil, nil, err
	}

	for _, s := range b.AddedFileBacking {
		addBackingFunc(s)
	}

	for fileNum := range zombies {
		if _, ok := backingStateMap[fileNum]; ok {
			// This table was backing some virtual sstable in the latest version,
			// but is now a zombie. We add RemovedBackingTables entries for
			// these, before the version edit is written to disk.
			ve.RemovedBackingTables = append(
				ve.RemovedBackingTables, fileNum,
			)
			removeBackingFunc(fileNum)
		}
	}
	return v, zombies, nil
}

// Apply applies the delta b to the current version to produce a new
// version. The new version is consistent with respect to the comparer cmp.
//
// curr may be nil, which is equivalent to a pointer to a zero version.
//
// On success, if a non-nil zombies map is provided to Apply, the map is updated
// with the file numbers and file sizes of deleted files. These files are
// considered zombies because they are no longer referenced by the returned
// Version, but cannot be deleted from disk as they are still in use by the
// incoming Version.
func (b *BulkVersionEdit) Apply(
	curr *Version,
	cmp Compare,
	formatKey base.FormatKey,
	flushSplitBytes int64,
	readCompactionRate int64,
	zombies map[base.DiskFileNum]uint64,
	orderingInvariants OrderingInvariants,
) (*Version, error) {
	addZombie := func(state *FileBacking) {
		if zombies != nil {
			zombies[state.DiskFileNum] = state.Size
		}
	}
	removeZombie := func(state *FileBacking) {
		if zombies != nil {
			delete(zombies, state.DiskFileNum)
		}
	}

	v := new(Version)

	// Adjust the count of files marked for compaction.
	if curr != nil {
		v.Stats.MarkedForCompaction = curr.Stats.MarkedForCompaction
	}
	v.Stats.MarkedForCompaction += b.MarkedForCompactionCountDiff
	if v.Stats.MarkedForCompaction < 0 {
		return nil, base.CorruptionErrorf("pebble: version marked for compaction count negative")
	}

	for level := range v.Levels {
		if curr == nil || curr.Levels[level].tree.root == nil {
			v.Levels[level] = makeLevelMetadata(cmp, level, nil /* files */)
		} else {
			v.Levels[level] = curr.Levels[level].clone()
		}
		if curr == nil || curr.RangeKeyLevels[level].tree.root == nil {
			v.RangeKeyLevels[level] = makeLevelMetadata(cmp, level, nil /* files */)
		} else {
			v.RangeKeyLevels[level] = curr.RangeKeyLevels[level].clone()
		}

		if len(b.Added[level]) == 0 && len(b.Deleted[level]) == 0 {
			// There are no edits on this level.
			if level == 0 {
				// Initialize L0Sublevels.
				if curr == nil || curr.L0Sublevels == nil {
					if err := v.InitL0Sublevels(cmp, formatKey, flushSplitBytes); err != nil {
						return nil, errors.Wrap(err, "pebble: internal error")
					}
				} else {
					v.L0Sublevels = curr.L0Sublevels
					v.L0SublevelFiles = v.L0Sublevels.Levels
				}
			}
			continue
		}

		// Some edits on this level.
		lm := &v.Levels[level]
		lmRange := &v.RangeKeyLevels[level]

		addedFilesMap := b.Added[level]
		deletedFilesMap := b.Deleted[level]
		if n := v.Levels[level].Len() + len(addedFilesMap); n == 0 {
			return nil, base.CorruptionErrorf(
				"pebble: internal error: No current or added files but have deleted files: %d",
				errors.Safe(len(deletedFilesMap)))
		}

		// NB: addedFilesMap may be empty. If a file is present in addedFilesMap
		// for a level, it won't be present in deletedFilesMap for the same
		// level.

		for _, f := range deletedFilesMap {
			if obsolete := v.Levels[level].remove(f); obsolete {
				// Deleting a file from the B-Tree may decrement its
				// reference count. However, because we cloned the
				// previous level's B-Tree, this should never result in a
				// file's reference count dropping to zero.
				err := errors.Errorf("pebble: internal error: file L%d.%s obsolete during B-Tree removal", level, f.FileNum)
				return nil, err
			}
			if f.HasRangeKeys {
				if obsolete := v.RangeKeyLevels[level].remove(f); obsolete {
					// Deleting a file from the B-Tree may decrement its
					// reference count. However, because we cloned the
					// previous level's B-Tree, this should never result in a
					// file's reference count dropping to zero.
					err := errors.Errorf("pebble: internal error: file L%d.%s obsolete during range-key B-Tree removal", level, f.FileNum)
					return nil, err
				}
			}

			// Note that a backing sst will only become a zombie if its
			// reference count in the latest version drops to 0. We will remove
			// the backing sst from the zombie list in the next loop if one of
			// the addedFiles in any of the levels is referencing the backing
			// sst. This is possible if a physical sstable is virtualized, or
			// if it is moved.
			latestRefCount := f.LatestRefs()
			if latestRefCount <= 0 {
				// If a file is present in deletedFilesMap for a level, then it
				// must have already been added to the level previously, which
				// means that its latest ref count cannot be 0.
				err := errors.Errorf("pebble: internal error: incorrect latestRefs reference counting for file %s", f.FileNum)
				return nil, err
			} else if f.LatestUnref() == 0 {
				addZombie(f.FileBacking)
			}
		}

		addedFiles := make([]*FileMetadata, 0, len(addedFilesMap))
		for _, f := range addedFilesMap {
			addedFiles = append(addedFiles, f)
		}
		// Sort addedFiles by file number. This isn't necessary, but tests which
		// replay invalid manifests check the error output, and the error output
		// depends on the order in which files are added to the btree.
		slices.SortFunc(addedFiles, func(a, b *FileMetadata) int {
			return stdcmp.Compare(a.FileNum, b.FileNum)
		})

		var sm, la *FileMetadata
		for _, f := range addedFiles {
			// NB: allowedSeeks is used for read triggered compactions. It is set using
			// Options.Experimental.ReadCompactionRate which defaults to 32KB.
			var allowedSeeks int64
			if readCompactionRate != 0 {
				allowedSeeks = int64(f.Size) / readCompactionRate
			}
			if allowedSeeks < 100 {
				allowedSeeks = 100
			}
			f.AllowedSeeks.Store(allowedSeeks)
			f.InitAllowedSeeks = allowedSeeks

			err := lm.insert(f)
			// We're adding this file to the new version, so increment the
			// latest refs count.
			f.LatestRef()
			if err != nil {
				return nil, errors.Wrap(err, "pebble")
			}
			if f.HasRangeKeys {
				err = lmRange.insert(f)
				if err != nil {
					return nil, errors.Wrap(err, "pebble")
				}
			}
			removeZombie(f.FileBacking)
			// Track the files with the smallest and largest keys, so that we
			// can check consistency of the modified span.
			if sm == nil || base.InternalCompare(cmp, sm.Smallest, f.Smallest) > 0 {
				sm = f
			}
			if la == nil || base.InternalCompare(cmp, la.Largest, f.Largest) < 0 {
				la = f
			}
		}

		if level == 0 {
			if curr != nil && curr.L0Sublevels != nil && len(deletedFilesMap) == 0 {
				// Flushes and ingestions that do not delete any L0 files do not require
				// a regeneration of L0Sublevels from scratch. We can instead generate
				// it incrementally.
				var err error
				// AddL0Files requires addedFiles to be sorted in seqnum order.
				SortBySeqNum(addedFiles)
				v.L0Sublevels, err = curr.L0Sublevels.AddL0Files(addedFiles, flushSplitBytes, &v.Levels[0])
				if errors.Is(err, errInvalidL0SublevelsOpt) {
					err = v.InitL0Sublevels(cmp, formatKey, flushSplitBytes)
				} else if invariants.Enabled && err == nil {
					copyOfSublevels, err := NewL0Sublevels(&v.Levels[0], cmp, formatKey, flushSplitBytes)
					if err != nil {
						panic(fmt.Sprintf("error when regenerating sublevels: %s", err))
					}
					s1 := describeSublevels(base.DefaultFormatter, false /* verbose */, copyOfSublevels.Levels)
					s2 := describeSublevels(base.DefaultFormatter, false /* verbose */, v.L0Sublevels.Levels)
					if s1 != s2 {
						panic(fmt.Sprintf("incremental L0 sublevel generation produced different output than regeneration: %s != %s", s1, s2))
					}
				}
				if err != nil {
					return nil, errors.Wrap(err, "pebble: internal error")
				}
				v.L0SublevelFiles = v.L0Sublevels.Levels
			} else if err := v.InitL0Sublevels(cmp, formatKey, flushSplitBytes); err != nil {
				return nil, errors.Wrap(err, "pebble: internal error")
			}
			if err := CheckOrdering(cmp, formatKey, Level(0), v.Levels[level].Iter(), orderingInvariants); err != nil {
				return nil, errors.Wrap(err, "pebble: internal error")
			}
			continue
		}

		// Check consistency of the level in the vicinity of our edits.
		if sm != nil && la != nil {
			overlap := overlaps(v.Levels[level].Iter(), cmp, sm.Smallest.UserKey,
				la.Largest.UserKey, la.Largest.IsExclusiveSentinel())
			// overlap contains all of the added files. We want to ensure that
			// the added files are consistent with neighboring existing files
			// too, so reslice the overlap to pull in a neighbor on each side.
			check := overlap.Reslice(func(start, end *LevelIterator) {
				if m := start.Prev(); m == nil {
					start.Next()
				}
				if m := end.Next(); m == nil {
					end.Prev()
				}
			})
			if err := CheckOrdering(cmp, formatKey, Level(level), check.Iter(), orderingInvariants); err != nil {
				return nil, errors.Wrap(err, "pebble: internal error")
			}
		}
	}
	return v, nil
}