github.com/df-mc/goleveldb@v1.1.9/leveldb/db.go (about)

     1  // Copyright (c) 2012, Suryandaru Triandana <syndtr@gmail.com>
     2  // All rights reserved.
     3  //
     4  // Use of this source code is governed by a BSD-style license that can be
     5  // found in the LICENSE file.
     6  
     7  package leveldb
     8  
     9  import (
    10  	"container/list"
    11  	"fmt"
    12  	"io"
    13  	"os"
    14  	"runtime"
    15  	"strings"
    16  	"sync"
    17  	"sync/atomic"
    18  	"time"
    19  
    20  	"github.com/df-mc/goleveldb/leveldb/errors"
    21  	"github.com/df-mc/goleveldb/leveldb/iterator"
    22  	"github.com/df-mc/goleveldb/leveldb/journal"
    23  	"github.com/df-mc/goleveldb/leveldb/memdb"
    24  	"github.com/df-mc/goleveldb/leveldb/opt"
    25  	"github.com/df-mc/goleveldb/leveldb/storage"
    26  	"github.com/df-mc/goleveldb/leveldb/table"
    27  	"github.com/df-mc/goleveldb/leveldb/util"
    28  )
    29  
// DB is a LevelDB database.
type DB struct {
	// Need 64-bit alignment.
	// seq is the current upper sequence number of the DB; it is read and
	// written atomically, so it must stay first in the struct to keep
	// 64-bit alignment on 32-bit platforms.
	seq uint64

	// Stats. Need 64-bit alignment.
	cWriteDelay            int64 // The cumulative duration of write delays
	cWriteDelayN           int32 // The cumulative number of write delays
	inWritePaused          int32 // The indicator whether write operation is paused by compaction
	aliveSnaps, aliveIters int32 // Counters of live snapshots and iterators (see GetProperty).

	// Compaction statistic
	memComp       uint32 // The cumulative number of memory compaction
	level0Comp    uint32 // The cumulative number of level0 compaction
	nonLevel0Comp uint32 // The cumulative number of non-level0 compaction
	seekComp      uint32 // The cumulative number of seek compaction

	// Session.
	s *session

	// MemDB.
	// memMu guards mem/frozenMem swaps; memPool recycles memdb instances.
	memMu           sync.RWMutex
	memPool         chan *memdb.DB
	mem, frozenMem  *memDB
	journal         *journal.Writer
	journalWriter   storage.Writer
	journalFd       storage.FileDesc
	frozenJournalFd storage.FileDesc
	frozenSeq       uint64

	// Snapshot.
	snapsMu   sync.Mutex
	snapsList *list.List

	// Write.
	batchPool    sync.Pool
	writeMergeC  chan writeMerge
	writeMergedC chan bool
	writeLockC   chan struct{} // Capacity 1; acts as the write mutex.
	writeAckC    chan error
	writeDelay   time.Duration
	writeDelayN  int
	tr           *Transaction

	// Compaction.
	compCommitLk     sync.Mutex
	tcompCmdC        chan cCmd
	tcompPauseC      chan chan<- struct{}
	mcompCmdC        chan cCmd
	compErrC         chan error
	compPerErrC      chan error
	compErrSetC      chan error
	compWriteLocking bool
	compStats        cStats
	memdbMaxLevel    int // For testing.

	// Close.
	closeW sync.WaitGroup
	closeC chan struct{}
	closed uint32
	closer io.Closer
}
    92  
// openDB finishes opening a DB on top of an already-recovered session:
// it replays journals, cleans up obsolete files (read-write mode only),
// and starts the background compaction goroutines.
func openDB(s *session) (*DB, error) {
	s.log("db@open opening")
	start := time.Now()
	db := &DB{
		s: s,
		// Initial sequence
		seq: s.stSeqNum,
		// MemDB
		memPool: make(chan *memdb.DB, 1),
		// Snapshot
		snapsList: list.New(),
		// Write
		batchPool:    sync.Pool{New: newBatch},
		writeMergeC:  make(chan writeMerge),
		writeMergedC: make(chan bool),
		writeLockC:   make(chan struct{}, 1),
		writeAckC:    make(chan error),
		// Compaction
		tcompCmdC:   make(chan cCmd),
		tcompPauseC: make(chan chan<- struct{}),
		mcompCmdC:   make(chan cCmd),
		compErrC:    make(chan error),
		compPerErrC: make(chan error),
		compErrSetC: make(chan error),
		// Close
		closeC: make(chan struct{}),
	}

	// Read-only mode.
	readOnly := s.o.GetReadOnly()

	if readOnly {
		// Recover journals (read-only mode).
		if err := db.recoverJournalRO(); err != nil {
			return nil, err
		}
	} else {
		// Recover journals.
		if err := db.recoverJournal(); err != nil {
			return nil, err
		}

		// Remove any obsolete files.
		if err := db.checkAndCleanFiles(); err != nil {
			// Close journal.
			// recoverJournal may have created a fresh journal; close it so
			// the file handle isn't leaked on the error path.
			if db.journal != nil {
				db.journal.Close()
				db.journalWriter.Close()
			}
			return nil, err
		}

	}

	// Doesn't need to be included in the wait group.
	go db.compactionError()
	go db.mpoolDrain()

	if readOnly {
		db.SetReadOnly()
	} else {
		// These two are waited on via closeW during Close.
		db.closeW.Add(2)
		go db.tCompaction()
		go db.mCompaction()
		// go db.jWriter()
	}

	s.logf("db@open done T·%v", time.Since(start))

	// Ensure resources are released even if the caller forgets Close.
	runtime.SetFinalizer(db, (*DB).Close)
	return db, nil
}
   165  
   166  // Open opens or creates a DB for the given storage.
   167  // The DB will be created if not exist, unless ErrorIfMissing is true.
   168  // Also, if ErrorIfExist is true and the DB exist Open will returns
   169  // os.ErrExist error.
   170  //
   171  // Open will return an error with type of ErrCorrupted if corruption
   172  // detected in the DB. Use errors.IsCorrupted to test whether an error is
   173  // due to corruption. Corrupted DB can be recovered with Recover function.
   174  //
   175  // The returned DB instance is safe for concurrent use.
   176  // The DB must be closed after use, by calling Close method.
   177  func Open(stor storage.Storage, o *opt.Options) (db *DB, err error) {
   178  	s, err := newSession(stor, o)
   179  	if err != nil {
   180  		return
   181  	}
   182  	defer func() {
   183  		if err != nil {
   184  			s.close()
   185  			s.release()
   186  		}
   187  	}()
   188  
   189  	err = s.recover()
   190  	if err != nil {
   191  		if !os.IsNotExist(err) || s.o.GetErrorIfMissing() || s.o.GetReadOnly() {
   192  			return
   193  		}
   194  		err = s.create()
   195  		if err != nil {
   196  			return
   197  		}
   198  	} else if s.o.GetErrorIfExist() {
   199  		err = os.ErrExist
   200  		return
   201  	}
   202  
   203  	return openDB(s)
   204  }
   205  
   206  // OpenFile opens or creates a DB for the given path.
   207  // The DB will be created if not exist, unless ErrorIfMissing is true.
   208  // Also, if ErrorIfExist is true and the DB exist OpenFile will returns
   209  // os.ErrExist error.
   210  //
   211  // OpenFile uses standard file-system backed storage implementation as
   212  // described in the leveldb/storage package.
   213  //
   214  // OpenFile will return an error with type of ErrCorrupted if corruption
   215  // detected in the DB. Use errors.IsCorrupted to test whether an error is
   216  // due to corruption. Corrupted DB can be recovered with Recover function.
   217  //
   218  // The returned DB instance is safe for concurrent use.
   219  // The DB must be closed after use, by calling Close method.
   220  func OpenFile(path string, o *opt.Options) (db *DB, err error) {
   221  	stor, err := storage.OpenFile(path, o.GetReadOnly())
   222  	if err != nil {
   223  		return
   224  	}
   225  	db, err = Open(stor, o)
   226  	if err != nil {
   227  		stor.Close()
   228  	} else {
   229  		db.closer = stor
   230  	}
   231  	return
   232  }
   233  
   234  // Recover recovers and opens a DB with missing or corrupted manifest files
   235  // for the given storage. It will ignore any manifest files, valid or not.
   236  // The DB must already exist or it will returns an error.
   237  // Also, Recover will ignore ErrorIfMissing and ErrorIfExist options.
   238  //
   239  // The returned DB instance is safe for concurrent use.
   240  // The DB must be closed after use, by calling Close method.
   241  func Recover(stor storage.Storage, o *opt.Options) (db *DB, err error) {
   242  	s, err := newSession(stor, o)
   243  	if err != nil {
   244  		return
   245  	}
   246  	defer func() {
   247  		if err != nil {
   248  			s.close()
   249  			s.release()
   250  		}
   251  	}()
   252  
   253  	err = recoverTable(s, o)
   254  	if err != nil {
   255  		return
   256  	}
   257  	return openDB(s)
   258  }
   259  
   260  // RecoverFile recovers and opens a DB with missing or corrupted manifest files
   261  // for the given path. It will ignore any manifest files, valid or not.
   262  // The DB must already exist or it will returns an error.
   263  // Also, Recover will ignore ErrorIfMissing and ErrorIfExist options.
   264  //
   265  // RecoverFile uses standard file-system backed storage implementation as described
   266  // in the leveldb/storage package.
   267  //
   268  // The returned DB instance is safe for concurrent use.
   269  // The DB must be closed after use, by calling Close method.
   270  func RecoverFile(path string, o *opt.Options) (db *DB, err error) {
   271  	stor, err := storage.OpenFile(path, false)
   272  	if err != nil {
   273  		return
   274  	}
   275  	db, err = Recover(stor, o)
   276  	if err != nil {
   277  		stor.Close()
   278  	} else {
   279  		db.closer = stor
   280  	}
   281  	return
   282  }
   283  
// recoverTable rebuilds a session manifest purely from the table files
// present in storage: every readable table is scanned, corrupted tables
// are rebuilt or dropped (depending on StrictRecovery), and all surviving
// tables are registered at level 0 with the maximum sequence number seen.
func recoverTable(s *session, o *opt.Options) error {
	o = dupOptions(o)
	// Mask StrictReader, lets StrictRecovery doing its job.
	o.Strict &= ^opt.StrictReader

	// Get all tables and sort it by file number.
	fds, err := s.stor.List(storage.TypeTable)
	if err != nil {
		return err
	}
	sortFds(fds)

	var (
		maxSeq                                                            uint64
		recoveredKey, goodKey, corruptedKey, corruptedBlock, droppedTable int

		// We will drop corrupted table.
		strict = o.GetStrict(opt.StrictRecovery)
		noSync = o.GetNoSync()

		rec   = &sessionRecord{}
		bpool = util.NewBufferPool(o.GetBlockSize() + 5)
	)
	// buildTable writes the valid entries of iter into a fresh temporary
	// table file and returns its descriptor and size. On error the temp
	// file is removed and a zero FileDesc is returned.
	buildTable := func(iter iterator.Iterator) (tmpFd storage.FileDesc, size int64, err error) {
		tmpFd = s.newTemp()
		writer, err := s.stor.Create(tmpFd)
		if err != nil {
			return
		}
		defer func() {
			writer.Close()
			if err != nil {
				s.stor.Remove(tmpFd)
				tmpFd = storage.FileDesc{}
			}
		}()

		// Copy entries.
		tw := table.NewWriter(writer, o)
		for iter.Next() {
			key := iter.Key()
			// Skip keys that don't parse as internal keys.
			if validInternalKey(key) {
				err = tw.Append(key, iter.Value())
				if err != nil {
					return
				}
			}
		}
		err = iter.Error()
		// Corruption errors are tolerated here: we keep whatever was copied.
		if err != nil && !errors.IsCorrupted(err) {
			return
		}
		err = tw.Close()
		if err != nil {
			return
		}
		if !noSync {
			err = writer.Sync()
			if err != nil {
				return
			}
		}
		size = int64(tw.BytesLen())
		return
	}
	// recoverTable scans a single table file, counting good/corrupted keys,
	// rebuilding the table in place if it holds good keys alongside
	// corruption, and recording it into rec when recoverable.
	recoverTable := func(fd storage.FileDesc) error {
		s.logf("table@recovery recovering @%d", fd.Num)
		reader, err := s.stor.Open(fd)
		if err != nil {
			return err
		}
		var closed bool
		defer func() {
			if !closed {
				reader.Close()
			}
		}()

		// Get file size.
		size, err := reader.Seek(0, 2)
		if err != nil {
			return err
		}

		var (
			tSeq                                     uint64
			tgoodKey, tcorruptedKey, tcorruptedBlock int
			imin, imax                               []byte
		)
		tr, err := table.NewReader(reader, size, fd, nil, bpool, o)
		if err != nil {
			return err
		}
		iter := tr.NewIterator(nil, nil)
		if itererr, ok := iter.(iterator.ErrorCallbackSetter); ok {
			// Count corrupted blocks as they are encountered during the scan.
			itererr.SetErrorCallback(func(err error) {
				if errors.IsCorrupted(err) {
					s.logf("table@recovery block corruption @%d %q", fd.Num, err)
					tcorruptedBlock++
				}
			})
		}

		// Scan the table.
		for iter.Next() {
			key := iter.Key()
			_, seq, _, kerr := parseInternalKey(key)
			if kerr != nil {
				tcorruptedKey++
				continue
			}
			tgoodKey++
			if seq > tSeq {
				tSeq = seq
			}
			// Track the key range; keys arrive in sorted order so the first
			// good key is the minimum and the last one seen is the maximum.
			if imin == nil {
				imin = append([]byte{}, key...)
			}
			imax = append(imax[:0], key...)
		}
		if err := iter.Error(); err != nil && !errors.IsCorrupted(err) {
			iter.Release()
			return err
		}
		iter.Release()

		goodKey += tgoodKey
		corruptedKey += tcorruptedKey
		corruptedBlock += tcorruptedBlock

		// In strict recovery any corruption drops the whole table.
		if strict && (tcorruptedKey > 0 || tcorruptedBlock > 0) {
			droppedTable++
			s.logf("table@recovery dropped @%d Gk·%d Ck·%d Cb·%d S·%d Q·%d", fd.Num, tgoodKey, tcorruptedKey, tcorruptedBlock, size, tSeq)
			return nil
		}

		if tgoodKey > 0 {
			if tcorruptedKey > 0 || tcorruptedBlock > 0 {
				// Rebuild the table.
				s.logf("table@recovery rebuilding @%d", fd.Num)
				iter := tr.NewIterator(nil, nil)
				tmpFd, newSize, err := buildTable(iter)
				iter.Release()
				if err != nil {
					return err
				}
				// Close the old reader before renaming over its file.
				closed = true
				reader.Close()
				if err := s.stor.Rename(tmpFd, fd); err != nil {
					return err
				}
				size = newSize
			}
			if tSeq > maxSeq {
				maxSeq = tSeq
			}
			recoveredKey += tgoodKey
			// Add table to level 0.
			rec.addTable(0, fd.Num, size, imin, imax)
			s.logf("table@recovery recovered @%d Gk·%d Ck·%d Cb·%d S·%d Q·%d", fd.Num, tgoodKey, tcorruptedKey, tcorruptedBlock, size, tSeq)
		} else {
			droppedTable++
			s.logf("table@recovery unrecoverable @%d Ck·%d Cb·%d S·%d", fd.Num, tcorruptedKey, tcorruptedBlock, size)
		}

		return nil
	}

	// Recover all tables.
	if len(fds) > 0 {
		s.logf("table@recovery F·%d", len(fds))

		// Mark file number as used.
		s.markFileNum(fds[len(fds)-1].Num)

		for _, fd := range fds {
			if err := recoverTable(fd); err != nil {
				return err
			}
		}

		s.logf("table@recovery recovered F·%d N·%d Gk·%d Ck·%d Q·%d", len(fds), recoveredKey, goodKey, corruptedKey, maxSeq)
	}

	// Set sequence number.
	rec.setSeqNum(maxSeq)

	// Create new manifest.
	if err := s.create(); err != nil {
		return err
	}

	// Commit.
	return s.commit(rec, false)
}
   479  
// recoverJournal replays every journal file at or after the session's
// current journal number into a memdb, flushing the memdb to level-0
// tables as it fills, committing progress between journals, and finally
// creating a fresh journal for subsequent writes. db.seq is advanced to
// the last sequence number replayed.
func (db *DB) recoverJournal() error {
	// Get all journals and sort it by file number.
	rawFds, err := db.s.stor.List(storage.TypeJournal)
	if err != nil {
		return err
	}
	sortFds(rawFds)

	// Journals that will be recovered.
	var fds []storage.FileDesc
	for _, fd := range rawFds {
		if fd.Num >= db.s.stJournalNum || fd.Num == db.s.stPrevJournalNum {
			fds = append(fds, fd)
		}
	}

	var (
		ofd storage.FileDesc // Obsolete file.
		rec = &sessionRecord{}
	)

	// Recover journals.
	if len(fds) > 0 {
		db.logf("journal@recovery F·%d", len(fds))

		// Mark file number as used.
		db.s.markFileNum(fds[len(fds)-1].Num)

		var (
			// Options.
			strict      = db.s.o.GetStrict(opt.StrictJournal)
			checksum    = db.s.o.GetStrict(opt.StrictJournalChecksum)
			writeBuffer = db.s.o.GetWriteBuffer()

			jr       *journal.Reader
			mdb      = memdb.New(db.s.icmp, writeBuffer)
			buf      = &util.Buffer{}
			batchSeq uint64
			batchLen int
		)

		for _, fd := range fds {
			db.logf("journal@recovery recovering @%d", fd.Num)

			fr, err := db.s.stor.Open(fd)
			if err != nil {
				return err
			}

			// Create or reset journal reader instance.
			if jr == nil {
				jr = journal.NewReader(fr, dropper{db.s, fd}, strict, checksum)
			} else {
				jr.Reset(fr, dropper{db.s, fd}, strict, checksum)
			}

			// Flush memdb and remove obsolete journal file.
			// ofd is the previous iteration's journal: its contents are
			// only safe to delete after they have been flushed and the
			// progress has been committed to the manifest.
			if !ofd.Zero() {
				if mdb.Len() > 0 {
					if _, err := db.s.flushMemdb(rec, mdb, 0); err != nil {
						fr.Close()
						return err
					}
				}

				rec.setJournalNum(fd.Num)
				rec.setSeqNum(db.seq)
				if err := db.s.commit(rec, false); err != nil {
					fr.Close()
					return err
				}
				rec.resetAddedTables()

				db.s.stor.Remove(ofd)
				ofd = storage.FileDesc{}
			}

			// Replay journal to memdb.
			mdb.Reset()
			for {
				r, err := jr.Next()
				if err != nil {
					if err == io.EOF {
						break
					}

					fr.Close()
					return errors.SetFd(err, fd)
				}

				buf.Reset()
				if _, err := buf.ReadFrom(r); err != nil {
					if err == io.ErrUnexpectedEOF {
						// This is error returned due to corruption, with strict == false.
						continue
					}

					fr.Close()
					return errors.SetFd(err, fd)
				}
				batchSeq, batchLen, err = decodeBatchToMem(buf.Bytes(), db.seq, mdb)
				if err != nil {
					if !strict && errors.IsCorrupted(err) {
						db.s.logf("journal error: %v (skipped)", err)
						// We won't apply sequence number as it might be corrupted.
						continue
					}

					fr.Close()
					return errors.SetFd(err, fd)
				}

				// Save sequence number.
				db.seq = batchSeq + uint64(batchLen)

				// Flush it if large enough.
				if mdb.Size() >= writeBuffer {
					if _, err := db.s.flushMemdb(rec, mdb, 0); err != nil {
						fr.Close()
						return err
					}

					mdb.Reset()
				}
			}

			fr.Close()
			ofd = fd
		}

		// Flush the last memdb.
		if mdb.Len() > 0 {
			if _, err := db.s.flushMemdb(rec, mdb, 0); err != nil {
				return err
			}
		}
	}

	// Create a new journal.
	if _, err := db.newMem(0); err != nil {
		return err
	}

	// Commit.
	rec.setJournalNum(db.journalFd.Num)
	rec.setSeqNum(db.seq)
	if err := db.s.commit(rec, false); err != nil {
		// Close journal on error.
		if db.journal != nil {
			db.journal.Close()
			db.journalWriter.Close()
		}
		return err
	}

	// Remove the last obsolete journal file.
	if !ofd.Zero() {
		db.s.stor.Remove(ofd)
	}

	return nil
}
   642  
// recoverJournalRO is the read-only variant of recoverJournal: journals
// are replayed into a single in-memory memdb that becomes db.mem, and
// nothing is flushed, committed, or deleted from storage.
func (db *DB) recoverJournalRO() error {
	// Get all journals and sort it by file number.
	rawFds, err := db.s.stor.List(storage.TypeJournal)
	if err != nil {
		return err
	}
	sortFds(rawFds)

	// Journals that will be recovered.
	var fds []storage.FileDesc
	for _, fd := range rawFds {
		if fd.Num >= db.s.stJournalNum || fd.Num == db.s.stPrevJournalNum {
			fds = append(fds, fd)
		}
	}

	var (
		// Options.
		strict      = db.s.o.GetStrict(opt.StrictJournal)
		checksum    = db.s.o.GetStrict(opt.StrictJournalChecksum)
		writeBuffer = db.s.o.GetWriteBuffer()

		mdb = memdb.New(db.s.icmp, writeBuffer)
	)

	// Recover journals.
	if len(fds) > 0 {
		db.logf("journal@recovery RO·Mode F·%d", len(fds))

		var (
			jr       *journal.Reader
			buf      = &util.Buffer{}
			batchSeq uint64
			batchLen int
		)

		for _, fd := range fds {
			db.logf("journal@recovery recovering @%d", fd.Num)

			fr, err := db.s.stor.Open(fd)
			if err != nil {
				return err
			}

			// Create or reset journal reader instance.
			if jr == nil {
				jr = journal.NewReader(fr, dropper{db.s, fd}, strict, checksum)
			} else {
				jr.Reset(fr, dropper{db.s, fd}, strict, checksum)
			}

			// Replay journal to memdb.
			for {
				r, err := jr.Next()
				if err != nil {
					if err == io.EOF {
						break
					}

					fr.Close()
					return errors.SetFd(err, fd)
				}

				buf.Reset()
				if _, err := buf.ReadFrom(r); err != nil {
					if err == io.ErrUnexpectedEOF {
						// This is error returned due to corruption, with strict == false.
						continue
					}

					fr.Close()
					return errors.SetFd(err, fd)
				}
				batchSeq, batchLen, err = decodeBatchToMem(buf.Bytes(), db.seq, mdb)
				if err != nil {
					if !strict && errors.IsCorrupted(err) {
						db.s.logf("journal error: %v (skipped)", err)
						// We won't apply sequence number as it might be corrupted.
						continue
					}

					fr.Close()
					return errors.SetFd(err, fd)
				}

				// Save sequence number.
				db.seq = batchSeq + uint64(batchLen)
			}

			fr.Close()
		}
	}

	// Set memDB.
	// ref starts at 1: the DB itself holds the only reference.
	db.mem = &memDB{db: db, DB: mdb, ref: 1}

	return nil
}
   741  
   742  func memGet(mdb *memdb.DB, ikey internalKey, icmp *iComparer) (ok bool, mv []byte, err error) {
   743  	mk, mv, err := mdb.Find(ikey)
   744  	if err == nil {
   745  		ukey, _, kt, kerr := parseInternalKey(mk)
   746  		if kerr != nil {
   747  			// Shouldn't have had happen.
   748  			panic(kerr)
   749  		}
   750  		if icmp.uCompare(ukey, ikey.ukey()) == 0 {
   751  			if kt == keyTypeDel {
   752  				return true, nil, ErrNotFound
   753  			}
   754  			return true, mv, nil
   755  
   756  		}
   757  	} else if err != ErrNotFound {
   758  		return true, nil, err
   759  	}
   760  	return
   761  }
   762  
// get resolves key at snapshot sequence seq, consulting (in order) the
// auxiliary memdb, the effective and frozen memdbs, and finally the
// table files (optionally augmented with auxt). The returned value is
// an owned copy.
func (db *DB) get(auxm *memdb.DB, auxt tFiles, key []byte, seq uint64, ro *opt.ReadOptions) (value []byte, err error) {
	ikey := makeInternalKey(nil, key, seq, keyTypeSeek)

	if auxm != nil {
		if ok, mv, me := memGet(auxm, ikey, db.s.icmp); ok {
			// Copy so the caller never aliases memdb-owned bytes.
			return append([]byte{}, mv...), me
		}
	}

	em, fm := db.getMems()
	for _, m := range [...]*memDB{em, fm} {
		if m == nil {
			continue
		}
		// Deliberate defer-in-loop: the array has at most two entries and
		// both refs must be held until the function returns.
		defer m.decref()

		if ok, mv, me := memGet(m.DB, ikey, db.s.icmp); ok {
			return append([]byte{}, mv...), me
		}
	}

	v := db.s.version()
	value, cSched, err := v.get(auxt, ikey, ro, false)
	v.release()
	if cSched {
		// Trigger table compaction.
		db.compTrigger(db.tcompCmdC)
	}
	return
}
   793  
   794  func nilIfNotFound(err error) error {
   795  	if err == ErrNotFound {
   796  		return nil
   797  	}
   798  	return err
   799  }
   800  
// has reports whether key exists at snapshot sequence seq, using the
// same lookup order as get but without materializing the value
// (v.get is called with noValue=true).
func (db *DB) has(auxm *memdb.DB, auxt tFiles, key []byte, seq uint64, ro *opt.ReadOptions) (ret bool, err error) {
	ikey := makeInternalKey(nil, key, seq, keyTypeSeek)

	if auxm != nil {
		if ok, _, me := memGet(auxm, ikey, db.s.icmp); ok {
			// A deletion marker (ErrNotFound) means "definitely absent".
			return me == nil, nilIfNotFound(me)
		}
	}

	em, fm := db.getMems()
	for _, m := range [...]*memDB{em, fm} {
		if m == nil {
			continue
		}
		// Deliberate defer-in-loop: at most two refs, both held to return.
		defer m.decref()

		if ok, _, me := memGet(m.DB, ikey, db.s.icmp); ok {
			return me == nil, nilIfNotFound(me)
		}
	}

	v := db.s.version()
	_, cSched, err := v.get(auxt, ikey, ro, true)
	v.release()
	if cSched {
		// Trigger table compaction.
		db.compTrigger(db.tcompCmdC)
	}
	if err == nil {
		ret = true
	} else if err == ErrNotFound {
		err = nil
	}
	return
}
   836  
   837  // Get gets the value for the given key. It returns ErrNotFound if the
   838  // DB does not contains the key.
   839  //
   840  // The returned slice is its own copy, it is safe to modify the contents
   841  // of the returned slice.
   842  // It is safe to modify the contents of the argument after Get returns.
   843  func (db *DB) Get(key []byte, ro *opt.ReadOptions) (value []byte, err error) {
   844  	err = db.ok()
   845  	if err != nil {
   846  		return
   847  	}
   848  
   849  	se := db.acquireSnapshot()
   850  	defer db.releaseSnapshot(se)
   851  	return db.get(nil, nil, key, se.seq, ro)
   852  }
   853  
   854  // Has returns true if the DB does contains the given key.
   855  //
   856  // It is safe to modify the contents of the argument after Has returns.
   857  func (db *DB) Has(key []byte, ro *opt.ReadOptions) (ret bool, err error) {
   858  	err = db.ok()
   859  	if err != nil {
   860  		return
   861  	}
   862  
   863  	se := db.acquireSnapshot()
   864  	defer db.releaseSnapshot(se)
   865  	return db.has(nil, nil, key, se.seq, ro)
   866  }
   867  
   868  // NewIterator returns an iterator for the latest snapshot of the
   869  // underlying DB.
   870  // The returned iterator is not safe for concurrent use, but it is safe to use
   871  // multiple iterators concurrently, with each in a dedicated goroutine.
   872  // It is also safe to use an iterator concurrently with modifying its
   873  // underlying DB. The resultant key/value pairs are guaranteed to be
   874  // consistent.
   875  //
   876  // Slice allows slicing the iterator to only contains keys in the given
   877  // range. A nil Range.Start is treated as a key before all keys in the
   878  // DB. And a nil Range.Limit is treated as a key after all keys in
   879  // the DB.
   880  //
   881  // WARNING: Any slice returned by interator (e.g. slice returned by calling
   882  // Iterator.Key() or Iterator.Key() methods), its content should not be modified
   883  // unless noted otherwise.
   884  //
   885  // The iterator must be released after use, by calling Release method.
   886  //
   887  // Also read Iterator documentation of the leveldb/iterator package.
   888  func (db *DB) NewIterator(slice *util.Range, ro *opt.ReadOptions) iterator.Iterator {
   889  	if err := db.ok(); err != nil {
   890  		return iterator.NewEmptyIterator(err)
   891  	}
   892  
   893  	se := db.acquireSnapshot()
   894  	defer db.releaseSnapshot(se)
   895  	// Iterator holds 'version' lock, 'version' is immutable so snapshot
   896  	// can be released after iterator created.
   897  	return db.newIterator(nil, nil, se.seq, slice, ro)
   898  }
   899  
   900  // GetSnapshot returns a latest snapshot of the underlying DB. A snapshot
   901  // is a frozen snapshot of a DB state at a particular point in time. The
   902  // content of snapshot are guaranteed to be consistent.
   903  //
   904  // The snapshot must be released after use, by calling Release method.
   905  func (db *DB) GetSnapshot() (*Snapshot, error) {
   906  	if err := db.ok(); err != nil {
   907  		return nil, err
   908  	}
   909  
   910  	return db.newSnapshot(), nil
   911  }
   912  
// GetProperty returns value of the given property name.
//
// Property names:
//	leveldb.num-files-at-level{n}
//		Returns the number of files at level 'n'.
//	leveldb.stats
//		Returns statistics of the underlying DB.
//	leveldb.iostats
//		Returns statistics of effective disk read and write.
//	leveldb.writedelay
//		Returns cumulative write delay caused by compaction.
//	leveldb.sstables
//		Returns sstables list for each level.
//	leveldb.blockpool
//		Returns block pool stats.
//	leveldb.cachedblock
//		Returns size of cached block.
//	leveldb.openedtables
//		Returns number of opened tables.
//	leveldb.alivesnaps
//		Returns number of alive snapshots.
//	leveldb.aliveiters
//		Returns number of alive iterators.
func (db *DB) GetProperty(name string) (value string, err error) {
	err = db.ok()
	if err != nil {
		return
	}

	// All property names must carry the "leveldb." prefix.
	const prefix = "leveldb."
	if !strings.HasPrefix(name, prefix) {
		return "", ErrNotFound
	}
	p := name[len(prefix):]

	// Hold a version reference while reading per-level data.
	v := db.s.version()
	defer v.release()

	numFilesPrefix := "num-files-at-level"
	switch {
	case strings.HasPrefix(p, numFilesPrefix):
		var level uint
		var rest string
		// n == 1 means the suffix was exactly one integer with no trailing text.
		n, _ := fmt.Sscanf(p[len(numFilesPrefix):], "%d%s", &level, &rest)
		if n != 1 {
			err = ErrNotFound
		} else {
			value = fmt.Sprint(v.tLen(int(level)))
		}
	case p == "stats":
		value = "Compactions\n" +
			" Level |   Tables   |    Size(MB)   |    Time(sec)  |    Read(MB)   |   Write(MB)\n" +
			"-------+------------+---------------+---------------+---------------+---------------\n"
		var totalTables int
		var totalSize, totalRead, totalWrite int64
		var totalDuration time.Duration
		for level, tables := range v.levels {
			duration, read, write := db.compStats.getStat(level)
			// Skip levels with neither tables nor recorded compaction time.
			if len(tables) == 0 && duration == 0 {
				continue
			}
			totalTables += len(tables)
			totalSize += tables.size()
			totalRead += read
			totalWrite += write
			totalDuration += duration
			value += fmt.Sprintf(" %3d   | %10d | %13.5f | %13.5f | %13.5f | %13.5f\n",
				level, len(tables), float64(tables.size())/1048576.0, duration.Seconds(),
				float64(read)/1048576.0, float64(write)/1048576.0)
		}
		value += "-------+------------+---------------+---------------+---------------+---------------\n"
		value += fmt.Sprintf(" Total | %10d | %13.5f | %13.5f | %13.5f | %13.5f\n",
			totalTables, float64(totalSize)/1048576.0, totalDuration.Seconds(),
			float64(totalRead)/1048576.0, float64(totalWrite)/1048576.0)
	case p == "compcount":
		value = fmt.Sprintf("MemComp:%d Level0Comp:%d NonLevel0Comp:%d SeekComp:%d", atomic.LoadUint32(&db.memComp), atomic.LoadUint32(&db.level0Comp), atomic.LoadUint32(&db.nonLevel0Comp), atomic.LoadUint32(&db.seekComp))
	case p == "iostats":
		value = fmt.Sprintf("Read(MB):%.5f Write(MB):%.5f",
			float64(db.s.stor.reads())/1048576.0,
			float64(db.s.stor.writes())/1048576.0)
	case p == "writedelay":
		writeDelayN, writeDelay := atomic.LoadInt32(&db.cWriteDelayN), time.Duration(atomic.LoadInt64(&db.cWriteDelay))
		paused := atomic.LoadInt32(&db.inWritePaused) == 1
		value = fmt.Sprintf("DelayN:%d Delay:%s Paused:%t", writeDelayN, writeDelay, paused)
	case p == "sstables":
		for level, tables := range v.levels {
			value += fmt.Sprintf("--- level %d ---\n", level)
			for _, t := range tables {
				value += fmt.Sprintf("%d:%d[%q .. %q]\n", t.fd.Num, t.size, t.imin, t.imax)
			}
		}
	case p == "blockpool":
		value = fmt.Sprintf("%v", db.s.tops.bpool)
	case p == "cachedblock":
		if db.s.tops.bcache != nil {
			value = fmt.Sprintf("%d", db.s.tops.bcache.Size())
		} else {
			value = "<nil>"
		}
	case p == "openedtables":
		value = fmt.Sprintf("%d", db.s.tops.cache.Size())
	case p == "alivesnaps":
		value = fmt.Sprintf("%d", atomic.LoadInt32(&db.aliveSnaps))
	case p == "aliveiters":
		value = fmt.Sprintf("%d", atomic.LoadInt32(&db.aliveIters))
	default:
		err = ErrNotFound
	}

	return
}
  1024  
// DBStats is database statistics.
type DBStats struct {
	// WriteDelayCount is the cumulative number of write delays and
	// WriteDelayDuration the cumulative time spent delayed; WritePaused
	// reports whether writes are currently paused by compaction.
	WriteDelayCount    int32
	WriteDelayDuration time.Duration
	WritePaused        bool

	// Counts of snapshots and iterators not yet released.
	AliveSnapshots int32
	AliveIterators int32

	// Cumulative bytes written to and read from storage.
	IOWrite uint64
	IORead  uint64

	// Sizes of the block cache and the opened-tables cache.
	BlockCacheSize    int
	OpenedTablesCount int

	// Per-level statistics, indexed by level: total table size, table
	// count, compaction read/write volume and compaction duration.
	LevelSizes        Sizes
	LevelTablesCounts []int
	LevelRead         Sizes
	LevelWrite        Sizes
	LevelDurations    []time.Duration

	// Cumulative compaction counts by kind: memtable, level-0,
	// non-level-0 and seek-triggered compactions.
	MemComp       uint32
	Level0Comp    uint32
	NonLevel0Comp uint32
	SeekComp      uint32
}
  1051  
  1052  // Stats populates s with database statistics.
  1053  func (db *DB) Stats(s *DBStats) error {
  1054  	err := db.ok()
  1055  	if err != nil {
  1056  		return err
  1057  	}
  1058  
  1059  	s.IORead = db.s.stor.reads()
  1060  	s.IOWrite = db.s.stor.writes()
  1061  	s.WriteDelayCount = atomic.LoadInt32(&db.cWriteDelayN)
  1062  	s.WriteDelayDuration = time.Duration(atomic.LoadInt64(&db.cWriteDelay))
  1063  	s.WritePaused = atomic.LoadInt32(&db.inWritePaused) == 1
  1064  
  1065  	s.OpenedTablesCount = db.s.tops.cache.Size()
  1066  	if db.s.tops.bcache != nil {
  1067  		s.BlockCacheSize = db.s.tops.bcache.Size()
  1068  	} else {
  1069  		s.BlockCacheSize = 0
  1070  	}
  1071  
  1072  	s.AliveIterators = atomic.LoadInt32(&db.aliveIters)
  1073  	s.AliveSnapshots = atomic.LoadInt32(&db.aliveSnaps)
  1074  
  1075  	s.LevelDurations = s.LevelDurations[:0]
  1076  	s.LevelRead = s.LevelRead[:0]
  1077  	s.LevelWrite = s.LevelWrite[:0]
  1078  	s.LevelSizes = s.LevelSizes[:0]
  1079  	s.LevelTablesCounts = s.LevelTablesCounts[:0]
  1080  
  1081  	v := db.s.version()
  1082  	defer v.release()
  1083  
  1084  	for level, tables := range v.levels {
  1085  		duration, read, write := db.compStats.getStat(level)
  1086  
  1087  		s.LevelDurations = append(s.LevelDurations, duration)
  1088  		s.LevelRead = append(s.LevelRead, read)
  1089  		s.LevelWrite = append(s.LevelWrite, write)
  1090  		s.LevelSizes = append(s.LevelSizes, tables.size())
  1091  		s.LevelTablesCounts = append(s.LevelTablesCounts, len(tables))
  1092  	}
  1093  	s.MemComp = atomic.LoadUint32(&db.memComp)
  1094  	s.Level0Comp = atomic.LoadUint32(&db.level0Comp)
  1095  	s.NonLevel0Comp = atomic.LoadUint32(&db.nonLevel0Comp)
  1096  	s.SeekComp = atomic.LoadUint32(&db.seekComp)
  1097  	return nil
  1098  }
  1099  
  1100  // SizeOf calculates approximate sizes of the given key ranges.
  1101  // The length of the returned sizes are equal with the length of the given
  1102  // ranges. The returned sizes measure storage space usage, so if the user
  1103  // data compresses by a factor of ten, the returned sizes will be one-tenth
  1104  // the size of the corresponding user data size.
  1105  // The results may not include the sizes of recently written data.
  1106  func (db *DB) SizeOf(ranges []util.Range) (Sizes, error) {
  1107  	if err := db.ok(); err != nil {
  1108  		return nil, err
  1109  	}
  1110  
  1111  	v := db.s.version()
  1112  	defer v.release()
  1113  
  1114  	sizes := make(Sizes, 0, len(ranges))
  1115  	for _, r := range ranges {
  1116  		imin := makeInternalKey(nil, r.Start, keyMaxSeq, keyTypeSeek)
  1117  		imax := makeInternalKey(nil, r.Limit, keyMaxSeq, keyTypeSeek)
  1118  		start, err := v.offsetOf(imin)
  1119  		if err != nil {
  1120  			return nil, err
  1121  		}
  1122  		limit, err := v.offsetOf(imax)
  1123  		if err != nil {
  1124  			return nil, err
  1125  		}
  1126  		var size int64
  1127  		if limit >= start {
  1128  			size = limit - start
  1129  		}
  1130  		sizes = append(sizes, size)
  1131  	}
  1132  
  1133  	return sizes, nil
  1134  }
  1135  
// Close closes the DB. This will also release any outstanding snapshot,
// abort any in-flight compaction and discard open transaction.
//
// It is not safe to close a DB until all outstanding iterators are released.
// It is valid to call Close multiple times. Other methods should not be
// called after the DB has been closed.
func (db *DB) Close() error {
	// setClosed flips the closed flag exactly once; subsequent Close
	// calls fail fast with ErrClosed instead of tearing down twice.
	if !db.setClosed() {
		return ErrClosed
	}

	start := time.Now()
	db.log("db@close closing")

	// Clear the finalizer.
	runtime.SetFinalizer(db, nil)

	// Get compaction error, if any. A read-only DB reports ErrReadOnly
	// on this channel, which is not a real failure.
	var err error
	select {
	case err = <-db.compErrC:
		if err == ErrReadOnly {
			err = nil
		}
	default:
	}

	// Signal all goroutines.
	close(db.closeC)

	// Discard open transaction.
	if db.tr != nil {
		db.tr.Discard()
	}

	// Acquire writer lock so no write is in flight while shutting down.
	db.writeLockC <- struct{}{}

	// Wait for all goroutines to exit.
	db.closeW.Wait()

	// Closes journal.
	if db.journal != nil {
		db.journal.Close()
		db.journalWriter.Close()
		db.journal = nil
		db.journalWriter = nil
	}

	if db.writeDelayN > 0 {
		db.logf("db@write was delayed N·%d T·%v", db.writeDelayN, db.writeDelay)
	}

	// Close session.
	db.s.close()
	db.logf("db@close done T·%v", time.Since(start))
	db.s.release()

	// Close the user-supplied closer (e.g. the storage lock); its error
	// is reported only when no earlier error occurred.
	if db.closer != nil {
		if err1 := db.closer.Close(); err == nil {
			err = err1
		}
		db.closer = nil
	}

	// Clear memdbs.
	db.clearMems()

	return err
}