github.com/pingcap/badger@v1.5.1-0.20230103063557-828f39b09b6d/manifest.go (about)

     1  /*
     2   * Copyright 2017 Dgraph Labs, Inc. and Contributors
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package badger
    18  
    19  import (
    20  	"bufio"
    21  	"bytes"
    22  	"encoding/binary"
    23  	"fmt"
    24  	"hash/crc32"
    25  	"io"
    26  	"os"
    27  	"path/filepath"
    28  	"sync"
    29  
    30  	"github.com/pingcap/badger/options"
    31  	"github.com/pingcap/badger/protos"
    32  	"github.com/pingcap/badger/y"
    33  	"github.com/pingcap/errors"
    34  )
    35  
// Manifest represents the contents of the MANIFEST file in a Badger store.
//
// The MANIFEST file describes the startup state of the db -- all LSM files and what level they're
// at.
//
// It consists of a sequence of ManifestChangeSet objects.  Each of these is treated atomically,
// and contains a sequence of ManifestChange's (file creations/deletions) which we use to
// reconstruct the manifest at startup.
type Manifest struct {
	// Levels is indexed by level number; each entry holds the set of table ids at that level.
	Levels []levelManifest
	// Tables maps the id of every table currently in the manifest to its per-table metadata.
	Tables map[uint64]tableManifest

	// Contains total number of creation and deletion changes in the manifest -- used to compute
	// whether it'd be useful to rewrite the manifest.
	Creations int
	Deletions int

	// Head is the HeadInfo of the most recently applied change set that carried a non-nil one.
	Head *protos.HeadInfo
}
    55  
    56  func createManifest() Manifest {
    57  	levels := make([]levelManifest, 0)
    58  	return Manifest{
    59  		Levels: levels,
    60  		Tables: make(map[uint64]tableManifest),
    61  	}
    62  }
    63  
// levelManifest contains information about LSM tree levels
// in the MANIFEST file.
type levelManifest struct {
	// Tables is used as a set: the keys are the ids of the tables at this level.
	Tables map[uint64]struct{} // Set of table id's
}
    69  
// tableManifest contains information about a specific table
// in the LSM tree.
type tableManifest struct {
	// Level is the LSM level the table currently sits at.
	Level uint8
	// Compression is the table's compression type.
	// NOTE(review): nothing in this file assigns it (addNewToManifest only sets Level) --
	// confirm whether it is populated elsewhere.
	Compression options.CompressionType
}
    76  
// manifestFile holds the file pointer (and other info) about the manifest file, which is a log
// file we append to.
type manifestFile struct {
	// fp is the open MANIFEST file.  addChanges appends to it and rewrite replaces it with a
	// handle on the freshly rewritten file.
	fp *os.File
	// directory is the directory containing the MANIFEST; rewrite passes it to helpRewrite.
	directory string
	// We make this configurable so that unit tests can hit rewrite() code quickly
	deletionsRewriteThreshold int

	// Guards appends, which includes access to the manifest field.
	appendLock sync.Mutex

	// Used to track the current state of the manifest, used when rewriting.
	manifest Manifest
}
    91  
const (
	// ManifestFilename is the filename for the manifest file.
	ManifestFilename = "MANIFEST"
	// manifestRewriteFilename is the temporary file helpRewrite writes before renaming it
	// over ManifestFilename.
	manifestRewriteFilename = "MANIFEST-REWRITE"
	// manifestDeletionsRewriteThreshold is the deletions threshold openOrCreateManifestFile
	// passes to helpOpenOrCreateManifestFile; addChanges only considers a rewrite once the
	// deletion count exceeds the configured threshold.
	manifestDeletionsRewriteThreshold = 10000
	// manifestDeletionsRatio is the factor used by addChanges: a rewrite also requires
	// Deletions > manifestDeletionsRatio * (Creations - Deletions).
	manifestDeletionsRatio = 10
)
    99  
   100  // asChanges returns a sequence of changes that could be used to recreate the Manifest in its
   101  // present state.
   102  func (m *Manifest) asChanges() []*protos.ManifestChange {
   103  	changes := make([]*protos.ManifestChange, 0, len(m.Tables))
   104  	for id, tm := range m.Tables {
   105  		changes = append(changes, newCreateChange(id, int(tm.Level)))
   106  	}
   107  	return changes
   108  }
   109  
   110  func (m *Manifest) clone() Manifest {
   111  	changeSet := protos.ManifestChangeSet{Changes: m.asChanges()}
   112  	ret := createManifest()
   113  	y.Check(applyChangeSet(&ret, &changeSet))
   114  	return ret
   115  }
   116  
   117  // openOrCreateManifestFile opens a Badger manifest file if it exists, or creates on if
   118  // one doesn’t.
   119  func openOrCreateManifestFile(dir string, readOnly bool) (ret *manifestFile, result Manifest, err error) {
   120  	return helpOpenOrCreateManifestFile(dir, readOnly, manifestDeletionsRewriteThreshold)
   121  }
   122  
   123  func helpOpenOrCreateManifestFile(dir string, readOnly bool, deletionsThreshold int) (ret *manifestFile, result Manifest, err error) {
   124  	path := filepath.Join(dir, ManifestFilename)
   125  	var flags uint32
   126  	if readOnly {
   127  		flags |= y.ReadOnly
   128  	}
   129  	fp, err := y.OpenExistingFile(path, flags) // We explicitly sync in addChanges, outside the lock.
   130  	if err != nil {
   131  		if !os.IsNotExist(err) {
   132  			return nil, Manifest{}, err
   133  		}
   134  		if readOnly {
   135  			return nil, Manifest{}, fmt.Errorf("no manifest found, required for read-only db")
   136  		}
   137  		m := createManifest()
   138  		fp, netCreations, err := helpRewrite(dir, &m)
   139  		if err != nil {
   140  			return nil, Manifest{}, err
   141  		}
   142  		y.Assert(netCreations == 0)
   143  		mf := &manifestFile{
   144  			fp:                        fp,
   145  			directory:                 dir,
   146  			manifest:                  m.clone(),
   147  			deletionsRewriteThreshold: deletionsThreshold,
   148  		}
   149  		return mf, m, nil
   150  	}
   151  
   152  	manifest, truncOffset, err := ReplayManifestFile(fp)
   153  	if err != nil {
   154  		_ = fp.Close()
   155  		return nil, Manifest{}, err
   156  	}
   157  
   158  	if !readOnly {
   159  		// Truncate file so we don't have a half-written entry at the end.
   160  		if err := fp.Truncate(truncOffset); err != nil {
   161  			_ = fp.Close()
   162  			return nil, Manifest{}, err
   163  		}
   164  	}
   165  	if _, err = fp.Seek(0, io.SeekEnd); err != nil {
   166  		_ = fp.Close()
   167  		return nil, Manifest{}, err
   168  	}
   169  
   170  	mf := &manifestFile{
   171  		fp:                        fp,
   172  		directory:                 dir,
   173  		manifest:                  manifest.clone(),
   174  		deletionsRewriteThreshold: deletionsThreshold,
   175  	}
   176  	return mf, manifest, nil
   177  }
   178  
   179  func (mf *manifestFile) close() error {
   180  	return mf.fp.Close()
   181  }
   182  
   183  // addChanges writes a batch of changes, atomically, to the file.  By "atomically" that means when
   184  // we replay the MANIFEST file, we'll either replay all the changes or none of them.  (The truth of
   185  // this depends on the filesystem -- some might append garbage data if a system crash happens at
   186  // the wrong time.)
   187  func (mf *manifestFile) addChanges(changesParam []*protos.ManifestChange, head *protos.HeadInfo) error {
   188  	changes := protos.ManifestChangeSet{Changes: changesParam, Head: head}
   189  	buf, err := changes.Marshal()
   190  	if err != nil {
   191  		return err
   192  	}
   193  
   194  	// Maybe we could use O_APPEND instead (on certain file systems)
   195  	mf.appendLock.Lock()
   196  	if err := applyChangeSet(&mf.manifest, &changes); err != nil {
   197  		mf.appendLock.Unlock()
   198  		return err
   199  	}
   200  	// Rewrite manifest if it'd shrink by 1/10 and it's big enough to care
   201  	if mf.manifest.Deletions > mf.deletionsRewriteThreshold &&
   202  		mf.manifest.Deletions > manifestDeletionsRatio*(mf.manifest.Creations-mf.manifest.Deletions) {
   203  		if err := mf.rewrite(); err != nil {
   204  			mf.appendLock.Unlock()
   205  			return err
   206  		}
   207  	} else {
   208  		var lenCrcBuf [8]byte
   209  		binary.BigEndian.PutUint32(lenCrcBuf[0:4], uint32(len(buf)))
   210  		binary.BigEndian.PutUint32(lenCrcBuf[4:8], crc32.Checksum(buf, y.CastagnoliCrcTable))
   211  		buf = append(lenCrcBuf[:], buf...)
   212  		if _, err := mf.fp.Write(buf); err != nil {
   213  			mf.appendLock.Unlock()
   214  			return err
   215  		}
   216  	}
   217  
   218  	mf.appendLock.Unlock()
   219  	return mf.fp.Sync()
   220  }
   221  
// magicText is written by helpRewrite as the first four bytes of the MANIFEST
// and checked by ReplayManifestFile.
// Has to be 4 bytes.  The value can never change, ever, anyway.
var magicText = [4]byte{'B', 'd', 'g', 'r'}

// The magic version number.  helpRewrite stores it big-endian in bytes 4-8 of
// the MANIFEST; ReplayManifestFile rejects any other value.
const magicVersion = 4
   227  
   228  func helpRewrite(dir string, m *Manifest) (*os.File, int, error) {
   229  	rewritePath := filepath.Join(dir, manifestRewriteFilename)
   230  	// We explicitly sync.
   231  	fp, err := y.OpenTruncFile(rewritePath, false)
   232  	if err != nil {
   233  		return nil, 0, err
   234  	}
   235  
   236  	buf := make([]byte, 8)
   237  	copy(buf[0:4], magicText[:])
   238  	binary.BigEndian.PutUint32(buf[4:8], magicVersion)
   239  
   240  	netCreations := len(m.Tables)
   241  	changes := m.asChanges()
   242  	set := protos.ManifestChangeSet{Changes: changes, Head: m.Head}
   243  
   244  	changeBuf, err := set.Marshal()
   245  	if err != nil {
   246  		fp.Close()
   247  		return nil, 0, err
   248  	}
   249  	var lenCrcBuf [8]byte
   250  	binary.BigEndian.PutUint32(lenCrcBuf[0:4], uint32(len(changeBuf)))
   251  	binary.BigEndian.PutUint32(lenCrcBuf[4:8], crc32.Checksum(changeBuf, y.CastagnoliCrcTable))
   252  	buf = append(buf, lenCrcBuf[:]...)
   253  	buf = append(buf, changeBuf...)
   254  	if _, err := fp.Write(buf); err != nil {
   255  		fp.Close()
   256  		return nil, 0, err
   257  	}
   258  	if err := fp.Sync(); err != nil {
   259  		fp.Close()
   260  		return nil, 0, err
   261  	}
   262  
   263  	// In Windows the files should be closed before doing a Rename.
   264  	if err = fp.Close(); err != nil {
   265  		return nil, 0, err
   266  	}
   267  	manifestPath := filepath.Join(dir, ManifestFilename)
   268  	if err := os.Rename(rewritePath, manifestPath); err != nil {
   269  		return nil, 0, err
   270  	}
   271  	fp, err = y.OpenExistingFile(manifestPath, 0)
   272  	if err != nil {
   273  		return nil, 0, err
   274  	}
   275  	if _, err := fp.Seek(0, io.SeekEnd); err != nil {
   276  		fp.Close()
   277  		return nil, 0, err
   278  	}
   279  	if err := syncDir(dir); err != nil {
   280  		fp.Close()
   281  		return nil, 0, err
   282  	}
   283  
   284  	return fp, netCreations, nil
   285  }
   286  
   287  // Must be called while appendLock is held.
   288  func (mf *manifestFile) rewrite() error {
   289  	// In Windows the files should be closed before doing a Rename.
   290  	if err := mf.fp.Close(); err != nil {
   291  		return err
   292  	}
   293  	fp, netCreations, err := helpRewrite(mf.directory, &mf.manifest)
   294  	if err != nil {
   295  		return err
   296  	}
   297  	mf.fp = fp
   298  	mf.manifest.Creations = netCreations
   299  	mf.manifest.Deletions = 0
   300  
   301  	return nil
   302  }
   303  
   304  type countingReader struct {
   305  	wrapped *bufio.Reader
   306  	count   int64
   307  }
   308  
   309  func (r *countingReader) Read(p []byte) (n int, err error) {
   310  	n, err = r.wrapped.Read(p)
   311  	r.count += int64(n)
   312  	return
   313  }
   314  
   315  func (r *countingReader) ReadByte() (b byte, err error) {
   316  	b, err = r.wrapped.ReadByte()
   317  	if err == nil {
   318  		r.count++
   319  	}
   320  	return
   321  }
   322  
var (
	// errBadMagic is returned by ReplayManifestFile when the 8-byte header cannot be read
	// or its first four bytes do not match magicText.
	errBadMagic = errors.New("manifest has bad magic")
)
   326  
   327  // ReplayManifestFile reads the manifest file and constructs two manifest objects.  (We need one
   328  // immutable copy and one mutable copy of the manifest.  Easiest way is to construct two of them.)
   329  // Also, returns the last offset after a completely read manifest entry -- the file must be
   330  // truncated at that point before further appends are made (if there is a partial entry after
   331  // that).  In normal conditions, truncOffset is the file size.
   332  func ReplayManifestFile(fp *os.File) (ret Manifest, truncOffset int64, err error) {
   333  	r := countingReader{wrapped: bufio.NewReader(fp)}
   334  
   335  	var magicBuf [8]byte
   336  	if _, err := io.ReadFull(&r, magicBuf[:]); err != nil {
   337  		return Manifest{}, 0, errBadMagic
   338  	}
   339  	if !bytes.Equal(magicBuf[0:4], magicText[:]) {
   340  		return Manifest{}, 0, errBadMagic
   341  	}
   342  	version := binary.BigEndian.Uint32(magicBuf[4:8])
   343  	if version != magicVersion {
   344  		return Manifest{}, 0,
   345  			fmt.Errorf("manifest has unsupported version: %d (we support %d)", version, magicVersion)
   346  	}
   347  
   348  	build := createManifest()
   349  	var offset int64
   350  	for {
   351  		offset = r.count
   352  		var lenCrcBuf [8]byte
   353  		_, err := io.ReadFull(&r, lenCrcBuf[:])
   354  		if err != nil {
   355  			if err == io.EOF || err == io.ErrUnexpectedEOF {
   356  				break
   357  			}
   358  			return Manifest{}, 0, err
   359  		}
   360  		length := binary.BigEndian.Uint32(lenCrcBuf[0:4])
   361  		var buf = make([]byte, length)
   362  		if _, err := io.ReadFull(&r, buf); err != nil {
   363  			if err == io.EOF || err == io.ErrUnexpectedEOF {
   364  				break
   365  			}
   366  			return Manifest{}, 0, err
   367  		}
   368  		if crc32.Checksum(buf, y.CastagnoliCrcTable) != binary.BigEndian.Uint32(lenCrcBuf[4:8]) {
   369  			break
   370  		}
   371  
   372  		var changeSet protos.ManifestChangeSet
   373  		if err := changeSet.Unmarshal(buf); err != nil {
   374  			return Manifest{}, 0, err
   375  		}
   376  
   377  		if err := applyChangeSet(&build, &changeSet); err != nil {
   378  			return Manifest{}, 0, err
   379  		}
   380  	}
   381  
   382  	return build, offset, err
   383  }
   384  
   385  func addNewToManifest(build *Manifest, tc *protos.ManifestChange) {
   386  	build.Tables[tc.Id] = tableManifest{
   387  		Level: uint8(tc.Level),
   388  	}
   389  	for len(build.Levels) <= int(tc.Level) {
   390  		build.Levels = append(build.Levels, levelManifest{make(map[uint64]struct{})})
   391  	}
   392  	build.Levels[tc.Level].Tables[tc.Id] = struct{}{}
   393  	build.Creations++
   394  }
   395  
   396  func applyManifestChange(build *Manifest, tc *protos.ManifestChange) error {
   397  	switch tc.Op {
   398  	case protos.ManifestChange_CREATE:
   399  		if _, ok := build.Tables[tc.Id]; ok {
   400  			return fmt.Errorf("MANIFEST invalid, table %d exists", tc.Id)
   401  		}
   402  		addNewToManifest(build, tc)
   403  	case protos.ManifestChange_DELETE:
   404  		tm, ok := build.Tables[tc.Id]
   405  		if !ok {
   406  			return fmt.Errorf("MANIFEST removes non-existing table %d", tc.Id)
   407  		}
   408  		delete(build.Levels[tm.Level].Tables, tc.Id)
   409  		delete(build.Tables, tc.Id)
   410  		build.Deletions++
   411  	case protos.ManifestChange_MOVE_DOWN:
   412  		tm, ok := build.Tables[tc.Id]
   413  		if !ok {
   414  			return fmt.Errorf("MANIFEST moves down non-exisitng table %d", tc.Id)
   415  		}
   416  		delete(build.Levels[tm.Level].Tables, tc.Id)
   417  		delete(build.Tables, tc.Id)
   418  		build.Deletions++
   419  		addNewToManifest(build, tc)
   420  	default:
   421  		return fmt.Errorf("MANIFEST file has invalid manifestChange op")
   422  	}
   423  	return nil
   424  }
   425  
   426  // This is not a "recoverable" error -- opening the KV store fails because the MANIFEST file is
   427  // just plain broken.
   428  func applyChangeSet(build *Manifest, changeSet *protos.ManifestChangeSet) error {
   429  	for _, change := range changeSet.Changes {
   430  		if err := applyManifestChange(build, change); err != nil {
   431  			return err
   432  		}
   433  	}
   434  	if changeSet.Head != nil {
   435  		build.Head = changeSet.Head
   436  	}
   437  	return nil
   438  }
   439  
   440  func newCreateChange(
   441  	id uint64, level int) *protos.ManifestChange {
   442  	return &protos.ManifestChange{
   443  		Id:    id,
   444  		Op:    protos.ManifestChange_CREATE,
   445  		Level: uint32(level),
   446  	}
   447  }
   448  
   449  func newDeleteChange(id uint64) *protos.ManifestChange {
   450  	return &protos.ManifestChange{
   451  		Id: id,
   452  		Op: protos.ManifestChange_DELETE,
   453  	}
   454  }
   455  
   456  func newMoveDownChange(id uint64, moveToLevel int) *protos.ManifestChange {
   457  	return &protos.ManifestChange{
   458  		Id:    id,
   459  		Op:    protos.ManifestChange_MOVE_DOWN,
   460  		Level: uint32(moveToLevel),
   461  	}
   462  }