github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/internal/manifest/version_edit.go (about)

     1  // Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package manifest
     6  
import (
	"bufio"
	"bytes"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"sort"
	"sync/atomic"

	"github.com/petermattis/pebble/internal/base"
)
    18  
    19  // TODO(peter): describe the MANIFEST file format, independently of the C++
    20  // project.
    21  
    22  var errCorruptManifest = errors.New("pebble: corrupt manifest")
    23  
    24  type byteReader interface {
    25  	io.ByteReader
    26  	io.Reader
    27  }
    28  
    29  // Tags for the versionEdit disk format.
    30  // Tag 8 is no longer used.
    31  const (
    32  	// LevelDB tags.
    33  	tagComparator     = 1
    34  	tagLogNumber      = 2
    35  	tagNextFileNumber = 3
    36  	tagLastSequence   = 4
    37  	tagCompactPointer = 5
    38  	tagDeletedFile    = 6
    39  	tagNewFile        = 7
    40  	tagPrevLogNumber  = 9
    41  
    42  	// RocksDB tags.
    43  	tagNewFile2         = 100
    44  	tagNewFile3         = 102
    45  	tagNewFile4         = 103
    46  	tagColumnFamily     = 200
    47  	tagColumnFamilyAdd  = 201
    48  	tagColumnFamilyDrop = 202
    49  	tagMaxColumnFamily  = 203
    50  
    51  	// The custom tags sub-format used by tagNewFile4.
    52  	customTagTerminate         = 1
    53  	customTagNeedsCompaction   = 2
    54  	customTagPathID            = 65
    55  	customTagNonSafeIgnoreMask = 1 << 6
    56  )
    57  
    58  // DeletedFileEntry holds the state for a file deletion from a level. The file
    59  // itself might still be referenced by another level.
    60  type DeletedFileEntry struct {
    61  	Level   int
    62  	FileNum uint64
    63  }
    64  
    65  // NewFileEntry holds the state for a new file or one moved from a different
    66  // level.
    67  type NewFileEntry struct {
    68  	Level int
    69  	Meta  FileMetadata
    70  }
    71  
    72  // VersionEdit holds the state for an edit to a Version along with other
    73  // on-disk state (log numbers, next file number, and the last sequence number).
    74  type VersionEdit struct {
    75  	ComparerName string
    76  	LogNum       uint64
    77  	PrevLogNum   uint64
    78  	NextFileNum  uint64
    79  	LastSeqNum   uint64
    80  	DeletedFiles map[DeletedFileEntry]bool // set of DeletedFileEntry values
    81  	NewFiles     []NewFileEntry
    82  }
    83  
    84  // Decode decodes an edit from the specified reader.
    85  func (v *VersionEdit) Decode(r io.Reader) error {
    86  	br, ok := r.(byteReader)
    87  	if !ok {
    88  		br = bufio.NewReader(r)
    89  	}
    90  	d := versionEditDecoder{br}
    91  	for {
    92  		tag, err := binary.ReadUvarint(br)
    93  		if err == io.EOF {
    94  			break
    95  		}
    96  		if err != nil {
    97  			return err
    98  		}
    99  		switch tag {
   100  		case tagComparator:
   101  			s, err := d.readBytes()
   102  			if err != nil {
   103  				return err
   104  			}
   105  			v.ComparerName = string(s)
   106  
   107  		case tagLogNumber:
   108  			n, err := d.readUvarint()
   109  			if err != nil {
   110  				return err
   111  			}
   112  			v.LogNum = n
   113  
   114  		case tagNextFileNumber:
   115  			n, err := d.readUvarint()
   116  			if err != nil {
   117  				return err
   118  			}
   119  			v.NextFileNum = n
   120  
   121  		case tagLastSequence:
   122  			n, err := d.readUvarint()
   123  			if err != nil {
   124  				return err
   125  			}
   126  			v.LastSeqNum = n
   127  
   128  		case tagCompactPointer:
   129  			if _, err := d.readLevel(); err != nil {
   130  				return err
   131  			}
   132  			if _, err := d.readBytes(); err != nil {
   133  				return err
   134  			}
   135  			// NB: RocksDB does not use compaction pointers anymore.
   136  
   137  		case tagDeletedFile:
   138  			level, err := d.readLevel()
   139  			if err != nil {
   140  				return err
   141  			}
   142  			fileNum, err := d.readUvarint()
   143  			if err != nil {
   144  				return err
   145  			}
   146  			if v.DeletedFiles == nil {
   147  				v.DeletedFiles = make(map[DeletedFileEntry]bool)
   148  			}
   149  			v.DeletedFiles[DeletedFileEntry{level, fileNum}] = true
   150  
   151  		case tagNewFile, tagNewFile2, tagNewFile3, tagNewFile4:
   152  			level, err := d.readLevel()
   153  			if err != nil {
   154  				return err
   155  			}
   156  			fileNum, err := d.readUvarint()
   157  			if err != nil {
   158  				return err
   159  			}
   160  			if tag == tagNewFile3 {
   161  				// The pathID field appears unused in RocksDB.
   162  				_ /* pathID */, err := d.readUvarint()
   163  				if err != nil {
   164  					return err
   165  				}
   166  			}
   167  			size, err := d.readUvarint()
   168  			if err != nil {
   169  				return err
   170  			}
   171  			smallest, err := d.readBytes()
   172  			if err != nil {
   173  				return err
   174  			}
   175  			largest, err := d.readBytes()
   176  			if err != nil {
   177  				return err
   178  			}
   179  			var smallestSeqNum uint64
   180  			var largestSeqNum uint64
   181  			if tag != tagNewFile {
   182  				smallestSeqNum, err = d.readUvarint()
   183  				if err != nil {
   184  					return err
   185  				}
   186  				largestSeqNum, err = d.readUvarint()
   187  				if err != nil {
   188  					return err
   189  				}
   190  			}
   191  			var markedForCompaction bool
   192  			if tag == tagNewFile4 {
   193  				for {
   194  					customTag, err := d.readUvarint()
   195  					if err != nil {
   196  						return err
   197  					}
   198  					if customTag == customTagTerminate {
   199  						break
   200  					}
   201  					field, err := d.readBytes()
   202  					if err != nil {
   203  						return err
   204  					}
   205  					switch customTag {
   206  					case customTagNeedsCompaction:
   207  						if len(field) != 1 {
   208  							return fmt.Errorf("new-file4: need-compaction field wrong size")
   209  						}
   210  						markedForCompaction = (field[0] == 1)
   211  
   212  					case customTagPathID:
   213  						return fmt.Errorf("new-file4: path-id field not supported")
   214  
   215  					default:
   216  						if (customTag & customTagNonSafeIgnoreMask) != 0 {
   217  							return fmt.Errorf("new-file4: custom field not supported: %d", customTag)
   218  						}
   219  					}
   220  				}
   221  			}
   222  			v.NewFiles = append(v.NewFiles, NewFileEntry{
   223  				Level: level,
   224  				Meta: FileMetadata{
   225  					FileNum:             fileNum,
   226  					Size:                size,
   227  					Smallest:            base.DecodeInternalKey(smallest),
   228  					Largest:             base.DecodeInternalKey(largest),
   229  					SmallestSeqNum:      smallestSeqNum,
   230  					LargestSeqNum:       largestSeqNum,
   231  					MarkedForCompaction: markedForCompaction,
   232  				},
   233  			})
   234  
   235  		case tagPrevLogNumber:
   236  			n, err := d.readUvarint()
   237  			if err != nil {
   238  				return err
   239  			}
   240  			v.PrevLogNum = n
   241  
   242  		case tagColumnFamily, tagColumnFamilyAdd, tagColumnFamilyDrop, tagMaxColumnFamily:
   243  			return fmt.Errorf("column families are not supported")
   244  
   245  		default:
   246  			return errCorruptManifest
   247  		}
   248  	}
   249  	return nil
   250  }
   251  
   252  // Encode encodes an edit to the specified writer.
   253  func (v *VersionEdit) Encode(w io.Writer) error {
   254  	e := versionEditEncoder{new(bytes.Buffer)}
   255  	if v.ComparerName != "" {
   256  		e.writeUvarint(tagComparator)
   257  		e.writeString(v.ComparerName)
   258  	}
   259  	if v.LogNum != 0 {
   260  		e.writeUvarint(tagLogNumber)
   261  		e.writeUvarint(v.LogNum)
   262  	}
   263  	if v.PrevLogNum != 0 {
   264  		e.writeUvarint(tagPrevLogNumber)
   265  		e.writeUvarint(v.PrevLogNum)
   266  	}
   267  	if v.NextFileNum != 0 {
   268  		e.writeUvarint(tagNextFileNumber)
   269  		e.writeUvarint(v.NextFileNum)
   270  	}
   271  	if v.LastSeqNum != 0 {
   272  		e.writeUvarint(tagLastSequence)
   273  		e.writeUvarint(v.LastSeqNum)
   274  	}
   275  	for x := range v.DeletedFiles {
   276  		e.writeUvarint(tagDeletedFile)
   277  		e.writeUvarint(uint64(x.Level))
   278  		e.writeUvarint(x.FileNum)
   279  	}
   280  	for _, x := range v.NewFiles {
   281  		var customFields bool
   282  		if x.Meta.MarkedForCompaction {
   283  			customFields = true
   284  			e.writeUvarint(tagNewFile4)
   285  		} else {
   286  			e.writeUvarint(tagNewFile2)
   287  		}
   288  		e.writeUvarint(uint64(x.Level))
   289  		e.writeUvarint(x.Meta.FileNum)
   290  		e.writeUvarint(x.Meta.Size)
   291  		e.writeKey(x.Meta.Smallest)
   292  		e.writeKey(x.Meta.Largest)
   293  		e.writeUvarint(x.Meta.SmallestSeqNum)
   294  		e.writeUvarint(x.Meta.LargestSeqNum)
   295  		if customFields {
   296  			if x.Meta.MarkedForCompaction {
   297  				e.writeUvarint(customTagNeedsCompaction)
   298  				e.writeBytes([]byte{1})
   299  			}
   300  			e.writeUvarint(customTagTerminate)
   301  		}
   302  	}
   303  	_, err := w.Write(e.Bytes())
   304  	return err
   305  }
   306  
   307  type versionEditDecoder struct {
   308  	byteReader
   309  }
   310  
   311  func (d versionEditDecoder) readBytes() ([]byte, error) {
   312  	n, err := d.readUvarint()
   313  	if err != nil {
   314  		return nil, err
   315  	}
   316  	s := make([]byte, n)
   317  	_, err = io.ReadFull(d, s)
   318  	if err != nil {
   319  		if err == io.ErrUnexpectedEOF {
   320  			return nil, errCorruptManifest
   321  		}
   322  		return nil, err
   323  	}
   324  	return s, nil
   325  }
   326  
   327  func (d versionEditDecoder) readLevel() (int, error) {
   328  	u, err := d.readUvarint()
   329  	if err != nil {
   330  		return 0, err
   331  	}
   332  	if u >= NumLevels {
   333  		return 0, errCorruptManifest
   334  	}
   335  	return int(u), nil
   336  }
   337  
   338  func (d versionEditDecoder) readUvarint() (uint64, error) {
   339  	u, err := binary.ReadUvarint(d)
   340  	if err != nil {
   341  		if err == io.EOF {
   342  			return 0, errCorruptManifest
   343  		}
   344  		return 0, err
   345  	}
   346  	return u, nil
   347  }
   348  
   349  type versionEditEncoder struct {
   350  	*bytes.Buffer
   351  }
   352  
   353  func (e versionEditEncoder) writeBytes(p []byte) {
   354  	e.writeUvarint(uint64(len(p)))
   355  	e.Write(p)
   356  }
   357  
   358  func (e versionEditEncoder) writeKey(k InternalKey) {
   359  	e.writeUvarint(uint64(k.Size()))
   360  	e.Write(k.UserKey)
   361  	buf := k.EncodeTrailer()
   362  	e.Write(buf[:])
   363  }
   364  
   365  func (e versionEditEncoder) writeString(s string) {
   366  	e.writeUvarint(uint64(len(s)))
   367  	e.WriteString(s)
   368  }
   369  
   370  func (e versionEditEncoder) writeUvarint(u uint64) {
   371  	var buf [binary.MaxVarintLen64]byte
   372  	n := binary.PutUvarint(buf[:], u)
   373  	e.Write(buf[:n])
   374  }
   375  
   376  // BulkVersionEdit summarizes the files added and deleted from a set of version
   377  // edits.
   378  type BulkVersionEdit struct {
   379  	Added   [NumLevels][]FileMetadata
   380  	Deleted [NumLevels]map[uint64]bool // map[uint64]bool is a set of fileNums
   381  }
   382  
   383  // Accumulate adds the file addition and deletions in the specified version
   384  // edit to the bulk edit's internal state.
   385  func (b *BulkVersionEdit) Accumulate(ve *VersionEdit) {
   386  	for df := range ve.DeletedFiles {
   387  		dmap := b.Deleted[df.Level]
   388  		if dmap == nil {
   389  			dmap = make(map[uint64]bool)
   390  			b.Deleted[df.Level] = dmap
   391  		}
   392  		dmap[df.FileNum] = true
   393  	}
   394  
   395  	for _, nf := range ve.NewFiles {
   396  		if dmap := b.Deleted[nf.Level]; dmap != nil {
   397  			delete(dmap, nf.Meta.FileNum)
   398  		}
   399  		b.Added[nf.Level] = append(b.Added[nf.Level], nf.Meta)
   400  	}
   401  }
   402  
   403  // Apply applies the delta b to a base version to produce a new version. The
   404  // new version is consistent with respect to the internal key comparer icmp.
   405  //
   406  // base may be nil, which is equivalent to a pointer to a zero version.
   407  func (b *BulkVersionEdit) Apply(
   408  	opts *Options, base *Version, cmp Compare,
   409  ) (*Version, error) {
   410  	v := new(Version)
   411  	for level := range v.Files {
   412  		if len(b.Added[level]) == 0 && len(b.Deleted[level]) == 0 {
   413  			// There are no edits on this level.
   414  			if base == nil {
   415  				continue
   416  			}
   417  			files := base.Files[level]
   418  			v.Files[level] = files
   419  			// We still have to bump the ref count for all files.
   420  			for i := range files {
   421  				atomic.AddInt32(files[i].refs, 1)
   422  			}
   423  			continue
   424  		}
   425  
   426  		combined := [2][]FileMetadata{
   427  			nil,
   428  			b.Added[level],
   429  		}
   430  		if base != nil {
   431  			combined[0] = base.Files[level]
   432  		}
   433  		n := len(combined[0]) + len(combined[1])
   434  		if n == 0 {
   435  			continue
   436  		}
   437  		v.Files[level] = make([]FileMetadata, 0, n)
   438  		dmap := b.Deleted[level]
   439  
   440  		for _, ff := range combined {
   441  			for _, f := range ff {
   442  				if dmap != nil && dmap[f.FileNum] {
   443  					continue
   444  				}
   445  				if f.refs == nil {
   446  					f.refs = new(int32)
   447  				}
   448  				atomic.AddInt32(f.refs, 1)
   449  				v.Files[level] = append(v.Files[level], f)
   450  			}
   451  		}
   452  
   453  		// TODO(peter): base.files[level] is already sorted. Instead of appending
   454  		// b.addFiles[level] to the end and sorting afterwards, it might be more
   455  		// efficient to sort b.addFiles[level] and then merge the two sorted
   456  		// slices.
   457  		if level == 0 {
   458  			SortBySeqNum(v.Files[level])
   459  		} else {
   460  			SortBySmallest(v.Files[level], cmp)
   461  		}
   462  	}
   463  	if err := v.CheckOrdering(cmp); err != nil {
   464  		return nil, fmt.Errorf("pebble: internal error: %v", err)
   465  	}
   466  	return v, nil
   467  }