github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/open.go

// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
	"bytes"
	"fmt"
	"io"
	"io/ioutil"
	"os"
	"path/filepath"
	"sort"
	"sync"

	"github.com/petermattis/pebble/internal/arenaskl"
	"github.com/petermattis/pebble/internal/base"
	"github.com/petermattis/pebble/internal/rate"
	"github.com/petermattis/pebble/internal/record"
	"github.com/petermattis/pebble/vfs"
)

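// dbNumAlloc hands out process-wide unique DB numbers. Open allocates one per
// DB instance and passes it to the table cache, distinguishing this
// instance's cached tables from those of other DBs in the same process.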
var dbNumAlloc = struct {
	sync.Mutex
	seq uint64
}{seq: 1}

func allocDBNum() uint64 {
	dbNumAlloc.Lock()
	num := dbNumAlloc.seq
	dbNumAlloc.seq++
	dbNumAlloc.Unlock()
	return num
}

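// createDB initializes a new database in dirname. It writes an initial
// MANIFEST containing a single version edit recording the comparer name and
// the next file number, then installs a CURRENT file pointing at that
// manifest. On failure the partially written manifest is removed.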
func createDB(dirname string, opts *Options) (retErr error) {
	const manifestFileNum = 1
	ve := versionEdit{
		ComparerName: opts.Comparer.Name,
		NextFileNum:  manifestFileNum + 1,
	}
	manifestFilename := base.MakeFilename(dirname, fileTypeManifest, manifestFileNum)
	f, err := opts.FS.Create(manifestFilename)
	if err != nil {
		return fmt.Errorf("pebble: could not create %q: %v", manifestFilename, err)
	}
	defer func() {
		if retErr != nil {
			opts.FS.Remove(manifestFilename)
		}
	}()
	defer f.Close()

	recWriter := record.NewWriter(f)
	w, err := recWriter.Next()
	if err != nil {
		return err
	}
	err = ve.Encode(w)
	if err != nil {
		return err
	}
	err = recWriter.Close()
	if err != nil {
		return err
	}
	return setCurrentFile(dirname, opts.FS, manifestFileNum)
}

// Open opens a DB whose files live in the given directory.
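//
// A minimal usage sketch (the path and error handling are illustrative):
//
//	db, err := pebble.Open("/tmp/pebble-demo", &pebble.Options{})
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer db.Close()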
func Open(dirname string, opts *Options) (*DB, error) {
	// Make a copy of the options so that we don't mutate the passed in options.
	opts = opts.Clone()
	opts = opts.EnsureDefaults()

	d := &DB{
		dbNum:          allocDBNum(),
		dirname:        dirname,
		walDirname:     opts.WALDir,
		opts:           opts,
		cmp:            opts.Comparer.Compare,
		equal:          opts.Comparer.Equal,
		merge:          opts.Merger.Merge,
		split:          opts.Comparer.Split,
		abbreviatedKey: opts.Comparer.AbbreviatedKey,
		logRecycler:    logRecycler{limit: opts.MemTableStopWritesThreshold + 1},
	}
	if d.equal == nil {
		d.equal = bytes.Equal
	}
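	// Size the table cache to the open-file budget that remains after
	// accounting for the non-table files the DB keeps open, but never below
	// the minimum.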
	tableCacheSize := opts.MaxOpenFiles - numNonTableCacheFiles
	if tableCacheSize < minTableCacheSize {
		tableCacheSize = minTableCacheSize
	}
	d.tableCache.init(d.dbNum, dirname, opts.FS, d.opts, tableCacheSize, defaultTableCacheHitBuffer)
	d.newIters = d.tableCache.newIters
	d.commit = newCommitPipeline(commitEnv{
		logSeqNum:     &d.mu.versions.logSeqNum,
		visibleSeqNum: &d.mu.versions.visibleSeqNum,
		apply:         d.commitApply,
		write:         d.commitWrite,
	})
	d.compactionLimiter = rate.NewLimiter(rate.Limit(d.opts.MinCompactionRate), d.opts.MinCompactionRate)
	d.flushLimiter = rate.NewLimiter(rate.Limit(d.opts.MinFlushRate), d.opts.MinFlushRate)
	d.mu.nextJobID = 1
	d.mu.mem.cond.L = &d.mu.Mutex
	d.mu.mem.mutable = newMemTable(d.opts)
	d.mu.mem.queue = append(d.mu.mem.queue, d.mu.mem.mutable)
	d.mu.cleaner.cond.L = &d.mu.Mutex
	d.mu.compact.cond.L = &d.mu.Mutex
	d.mu.compact.pendingOutputs = make(map[uint64]struct{})
	d.mu.snapshots.init()
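	// Batches at least half the size of an empty memtable are considered
	// large and receive special handling at commit time rather than being
	// copied into the mutable memtable.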
	d.largeBatchThreshold = (d.opts.MemTableSize - int(d.mu.mem.mutable.emptySize)) / 2

	d.mu.Lock()
	defer d.mu.Unlock()

	// Lock the database directory.
	if !d.opts.ReadOnly {
		err := opts.FS.MkdirAll(dirname, 0755)
		if err != nil {
			return nil, err
		}
	}
	fileLock, err := opts.FS.Lock(base.MakeFilename(dirname, fileTypeLock, 0))
	if err != nil {
		return nil, err
	}
	defer func() {
		if fileLock != nil {
			fileLock.Close()
		}
	}()

	d.dataDir, err = opts.FS.OpenDir(dirname)
	if err != nil {
		return nil, err
	}
	if d.walDirname == "" {
		d.walDirname = d.dirname
	}
	if d.walDirname == d.dirname {
		d.walDir = d.dataDir
	} else {
		if !d.opts.ReadOnly {
			err := opts.FS.MkdirAll(d.walDirname, 0755)
			if err != nil {
				return nil, err
			}
		}
		d.walDir, err = opts.FS.OpenDir(d.walDirname)
		if err != nil {
			return nil, err
		}
	}

	if _, err := opts.FS.Stat(base.MakeFilename(dirname, fileTypeCurrent, 0)); os.IsNotExist(err) && !d.opts.ReadOnly {
		// Create the DB if it did not already exist.
		if err := createDB(dirname, opts); err != nil {
			return nil, err
		}
		if err := d.dataDir.Sync(); err != nil {
			return nil, err
		}
	} else if err != nil {
		return nil, fmt.Errorf("pebble: database %q: %v", dirname, err)
	} else if opts.ErrorIfDBExists {
		return nil, fmt.Errorf("pebble: database %q already exists", dirname)
	}

	// Load the version set.
	err = d.mu.versions.load(dirname, opts, &d.mu.Mutex)
	if err != nil {
		return nil, err
	}

	ls, err := opts.FS.List(d.walDirname)
	if err != nil {
		return nil, err
	}
	if d.dirname != d.walDirname {
		ls2, err := opts.FS.List(d.dirname)
		if err != nil {
			return nil, err
		}
		ls = append(ls, ls2...)
	}

	// Replay any log files that are newer than the ones named in the manifest.
	type fileNumAndName struct {
		num  uint64
		name string
	}
	var logFiles []fileNumAndName
	for _, filename := range ls {
		ft, fn, ok := base.ParseFilename(filename)
		if !ok {
			continue
		}
		switch ft {
		case fileTypeLog:
			if fn >= d.mu.versions.logNum || fn == d.mu.versions.prevLogNum {
				logFiles = append(logFiles, fileNumAndName{fn, filename})
			}
		case fileTypeOptions:
			if err := checkOptions(opts, filepath.Join(dirname, filename)); err != nil {
				return nil, err
			}
		}
	}
	sort.Slice(logFiles, func(i, j int) bool {
		return logFiles[i].num < logFiles[j].num
	})

	jobID := d.mu.nextJobID
	d.mu.nextJobID++

	var ve versionEdit
	for _, lf := range logFiles {
		maxSeqNum, err := d.replayWAL(jobID, &ve, opts.FS, filepath.Join(d.walDirname, lf.name), lf.num)
		if err != nil {
			return nil, err
		}
		d.mu.versions.markFileNumUsed(lf.num)
		if d.mu.versions.logSeqNum < maxSeqNum {
			d.mu.versions.logSeqNum = maxSeqNum
		}
	}
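	// Make every sequence number recovered from the WALs visible to readers.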
	d.mu.versions.visibleSeqNum = d.mu.versions.logSeqNum

	if !d.opts.ReadOnly {
		// Create an empty .log file.
		ve.LogNum = d.mu.versions.getNextFileNum()
		d.mu.log.queue = append(d.mu.log.queue, ve.LogNum)
		logFile, err := opts.FS.Create(base.MakeFilename(d.walDirname, fileTypeLog, ve.LogNum))
		if err != nil {
			return nil, err
		}
		if err := d.walDir.Sync(); err != nil {
			return nil, err
		}
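		// Wrap the WAL file so that writes are synced incrementally
		// (BytesPerSync) and space is preallocated, smoothing write latency.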
		logFile = vfs.NewSyncingFile(logFile, vfs.SyncingFileOptions{
			BytesPerSync:    d.opts.BytesPerSync,
			PreallocateSize: d.walPreallocateSize(),
		})
		d.mu.log.LogWriter = record.NewLogWriter(logFile, ve.LogNum)
		d.mu.versions.metrics.WAL.Files++

		// Write a new manifest to disk.
		if err := d.mu.versions.logAndApply(0, &ve, nil, d.dataDir); err != nil {
			return nil, err
		}
	}
	d.updateReadStateLocked()

	if !d.opts.ReadOnly {
		// Write the current options to disk.
		d.optionsFileNum = d.mu.versions.getNextFileNum()
		optionsFile, err := opts.FS.Create(base.MakeFilename(dirname, fileTypeOptions, d.optionsFileNum))
		if err != nil {
			return nil, err
		}
		if _, err := optionsFile.Write([]byte(opts.String())); err != nil {
			return nil, err
		}
		optionsFile.Close()
		if err := d.dataDir.Sync(); err != nil {
			return nil, err
		}
	}

	if !d.opts.ReadOnly {
		d.scanObsoleteFiles(ls)
		d.deleteObsoleteFiles(jobID)
	}
	d.maybeScheduleFlush()
	d.maybeScheduleCompaction()

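	// Transfer ownership of the directory lock to the returned DB. Nil-ing
	// the local variable prevents the deferred cleanup above from releasing
	// the lock on this success path.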
	d.fileLock, fileLock = fileLock, nil
	return d, nil
}

// replayWAL replays the edits in the specified log file.
//
// d.mu must be held when calling this, but the mutex may be dropped and
// re-acquired during the course of this method.
func (d *DB) replayWAL(
	jobID int,
	ve *versionEdit,
	fs vfs.FS,
	filename string,
	logNum uint64,
) (maxSeqNum uint64, err error) {
	file, err := fs.Open(filename)
	if err != nil {
		return 0, err
	}
	defer file.Close()

	var (
		b   Batch
		buf bytes.Buffer
		mem *memTable
		rr  = record.NewReader(file, logNum)
	)

	// In read-only mode, we replay directly into the mutable memtable, which
	// will never be flushed.
	if d.opts.ReadOnly {
		mem = d.mu.mem.mutable
	}

	for {
		r, err := rr.Next()
		if err == nil {
			_, err = io.Copy(&buf, r)
		}
		if err != nil {
			// It is common to encounter a zeroed or invalid chunk due to WAL
			// preallocation and WAL recycling. We need to distinguish these errors
			// from EOF in order to recognize that the record was truncated, but want
			// to otherwise treat them like EOF.
			if err == io.EOF || err == record.ErrZeroedChunk || err == record.ErrInvalidChunk {
				break
			}
			return 0, err
		}

		if buf.Len() < batchHeaderLen {
			return 0, fmt.Errorf("pebble: corrupt log file %q", filename)
		}

		// TODO(peter): If the batch is too large to fit in the memtable, flush the
		// existing memtable and write the batch as a separate L0 table.
		b = Batch{}
		b.SetRepr(buf.Bytes())
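		// The batch occupies the sequence numbers [seqNum, seqNum+count), so
		// seqNum+count is the first sequence number not used by this batch.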
		seqNum := b.SeqNum()
		maxSeqNum = seqNum + uint64(b.Count())

		if mem == nil {
			mem = newMemTable(d.opts)
		}

		for {
			err := mem.prepare(&b)
			if err == arenaskl.ErrArenaFull {
				// TODO(peter): write the memtable to disk.
				panic(err)
			}
			if err != nil {
				return 0, err
			}
			break
		}

		if err := mem.apply(&b, seqNum); err != nil {
			return 0, err
		}
		mem.unref()

		buf.Reset()
	}

	if d.opts.ReadOnly {
		// In read-only mode, each WAL file is replayed into its own memtable. This
		// is done so that the WAL metrics can be accurately provided.
		mem.logSize = uint64(rr.Offset())
		d.mu.mem.mutable = newMemTable(d.opts)
		d.mu.mem.queue = append(d.mu.mem.queue, d.mu.mem.mutable)
		d.mu.versions.metrics.WAL.Files++
	} else if mem != nil && !mem.empty() {
		c := newFlush(d.opts, d.mu.versions.currentVersion(),
			1 /* base level */, []flushable{mem}, &d.bytesFlushed)
		newVE, pendingOutputs, err := d.runCompaction(jobID, c, nilPacer)
		if err != nil {
			return 0, err
		}
		ve.NewFiles = append(ve.NewFiles, newVE.NewFiles...)
		// Strictly speaking, it's too early to delete from d.pendingOutputs, but
		// we are replaying the log file, which happens before Open returns, so
		// there is no possibility of deleteObsoleteFiles being called concurrently
		// here.
		for _, fileNum := range pendingOutputs {
			delete(d.mu.compact.pendingOutputs, fileNum)
		}
	}

	return maxSeqNum, nil
}

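// checkOptions verifies that the OPTIONS file at path, written by a previous
// incarnation of the DB, is compatible with the Options currently in use.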
func checkOptions(opts *Options, path string) error {
	f, err := opts.FS.Open(path)
	if err != nil {
		return err
	}
	defer f.Close()

	data, err := ioutil.ReadAll(f)
	if err != nil {
		return err
	}
	return opts.Check(string(data))
}