github.com/bhojpur/cache@v0.0.4/pkg/memory/db.go (about)

     1  package memory
     2  
     3  // Copyright (c) 2018 Bhojpur Consulting Private Limited, India. All rights reserved.
     4  
     5  // Permission is hereby granted, free of charge, to any person obtaining a copy
     6  // of this software and associated documentation files (the "Software"), to deal
     7  // in the Software without restriction, including without limitation the rights
     8  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     9  // copies of the Software, and to permit persons to whom the Software is
    10  // furnished to do so, subject to the following conditions:
    11  
    12  // The above copyright notice and this permission notice shall be included in
    13  // all copies or substantial portions of the Software.
    14  
    15  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    16  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    17  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    18  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    19  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    20  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    21  // THE SOFTWARE.
    22  
    23  import (
    24  	"errors"
    25  	"fmt"
    26  	"hash/fnv"
    27  	"log"
    28  	"os"
    29  	"runtime"
    30  	"sort"
    31  	"sync"
    32  	"time"
    33  	"unsafe"
    34  )
    35  
// The largest step that can be taken when remapping the mmap.
// mmapSize rounds sizes above 1GB up to a multiple of this value.
const maxMmapStep = 1 << 30 // 1GB

// The data file format version. It is written into every meta page
// when a new database file is initialized.
const version = 2

// Represents a marker value to indicate that a file is a Bhojpur Cache
// in-memory database DB. It is written into every meta page when a new
// database file is initialized.
const magic uint32 = 0xED0CDAED

// pgidNoFreelist is the sentinel stored in a meta page's freelist field
// when the freelist has not been synced to disk; hasSyncedFreelist
// compares against it.
const pgidNoFreelist pgid = 0xffffffffffffffff

// IgnoreNoSync specifies whether the NoSync field of a DB is ignored when
// syncing changes to a file. This is required as some operating systems,
// such as OpenBSD, do not have a unified buffer cache (UBC) and writes
// must be synchronized using the msync(2) syscall.
const IgnoreNoSync = runtime.GOOS == "openbsd"

// Default values if not set in a DB instance. Open copies them into the
// MaxBatchSize, MaxBatchDelay and AllocSize fields of every new DB.
const (
	DefaultMaxBatchSize  int = 1000
	DefaultMaxBatchDelay     = 10 * time.Millisecond
	DefaultAllocSize         = 16 * 1024 * 1024
)

// defaultPageSize is the page size used by Open when Options.PageSize is
// zero; it is set to the OS page size.
var defaultPageSize = os.Getpagesize()

// The time elapsed between consecutive file locking attempts.
const flockRetryTimeout = 50 * time.Millisecond
    66  
// FreelistType is the type of the freelist backend. The value is passed
// to newFreelist when the freelist is loaded.
type FreelistType string

const (
	// FreelistArrayType indicates backend freelist type is array.
	FreelistArrayType = FreelistType("array")
	// FreelistMapType indicates backend freelist type is hashmap.
	FreelistMapType = FreelistType("hashmap")
)
    76  
// DB represents a collection of buckets persisted to a file on disk.
// All data access is performed through transactions which can be obtained through the DB.
// All the functions on DB will return an ErrDatabaseNotOpen if accessed before Open() is called.
type DB struct {
	// When enabled, the database will perform a Check() after every commit.
	// A panic is issued if the database is in an inconsistent state. This
	// flag has a large performance impact so it should only be used for
	// debugging purposes.
	StrictMode bool

	// Setting the NoSync flag will cause the database to skip fsync()
	// calls after each commit. This can be useful when bulk loading data
	// into a database and you can restart the bulk load in the event of
	// a system failure or database corruption. Do not set this flag for
	// normal use.
	//
	// If the package global IgnoreNoSync constant is true, this value is
	// ignored.  See the comment on that constant for more details.
	//
	// THIS IS UNSAFE. PLEASE USE WITH CAUTION.
	NoSync bool

	// When true, skips syncing freelist to disk. This improves the database
	// write performance under normal operation, but requires a full database
	// re-sync during recovery.
	NoFreelistSync bool

	// FreelistType sets the backend freelist type. There are two options. Array, which is simple but endures
	// dramatic performance degradation if the database is large and fragmentation in the freelist is common.
	// The alternative is hashmap, which is faster in almost all circumstances
	// but doesn't guarantee that it offers the smallest page id available. In the normal case it is safe.
	// The default type is array.
	FreelistType FreelistType

	// When true, skips the truncate call when growing the database.
	// Setting this to true is only safe on non-ext3/ext4 systems.
	// Skipping truncation avoids preallocation of hard drive space and
	// bypasses a truncate() and fsync() syscall on remapping.
	NoGrowSync bool

	// If you want to read the entire database fast, you can set MmapFlags to
	// syscall.MAP_POPULATE on Linux 2.6.23+ for sequential read-ahead.
	MmapFlags int

	// MaxBatchSize is the maximum size of a batch. Default value is
	// copied from DefaultMaxBatchSize in Open.
	//
	// If <=0, disables batching.
	//
	// Do not change concurrently with calls to Batch.
	MaxBatchSize int

	// MaxBatchDelay is the maximum delay before a batch starts.
	// Default value is copied from DefaultMaxBatchDelay in Open.
	//
	// If <=0, effectively disables batching.
	//
	// Do not change concurrently with calls to Batch.
	MaxBatchDelay time.Duration

	// AllocSize is the amount of space allocated when the database
	// needs to create new pages. This is done to amortize the cost
	// of truncate() and fsync() when growing the data file.
	AllocSize int

	// Mlock locks database file in memory when set to true.
	// It prevents major page faults, however used memory can't be reclaimed.
	//
	// Supported only on Unix via mlock/munlock syscalls.
	Mlock bool

	// Internal state.
	//
	// path is taken from the opened file's name in Open and cleared on close.
	// openFile is the function used to open the data file; Open falls back
	// to os.OpenFile when Options.OpenFile is nil.
	// meta0 and meta1 reference the meta pages 0 and 1 of the current mmap.
	// opened is set by Open and cleared by close.
	// rwtx is the write transaction most recently started by beginRWTx.
	// txs holds the read-only transactions started by beginTx that have not
	// yet been removed through removeTx.
	path     string
	openFile func(string, int, os.FileMode) (*os.File, error)
	file     *os.File
	dataref  []byte // mmap'ed readonly, write throws SEGV
	data     *[maxMapSize]byte
	datasz   int
	filesz   int // current on disk file size
	meta0    *meta
	meta1    *meta
	pageSize int
	opened   bool
	rwtx     *Tx
	txs      []*Tx
	stats    Stats

	// freelist is populated once by loadFreelist; freelistLoad guards that
	// one-time initialization.
	freelist     *freelist
	freelistLoad sync.Once

	// pagePool hands out byte slices of exactly pageSize bytes.
	pagePool sync.Pool

	// batchMu protects batch, the batch currently accepting calls from Batch.
	batchMu sync.Mutex
	batch   *batch

	rwlock   sync.Mutex   // Allows only one writer at a time.
	metalock sync.Mutex   // Protects meta page access.
	mmaplock sync.RWMutex // Protects mmap access during remapping.
	statlock sync.RWMutex // Protects stats access.

	// ops holds hooks that tests may replace; Open points writeAt at the
	// data file's WriteAt method.
	ops struct {
		writeAt func(b []byte, off int64) (n int, err error)
	}

	// Read only mode.
	// When true, Update() and Begin(true) return ErrDatabaseReadOnly immediately.
	readOnly bool
}
   184  
   185  // Path returns the path to currently open database file.
   186  func (db *DB) Path() string {
   187  	return db.path
   188  }
   189  
   190  // GoString returns the Go string representation of the database.
   191  func (db *DB) GoString() string {
   192  	return fmt.Sprintf("memcache.DB{path:%q}", db.path)
   193  }
   194  
   195  // String returns the string representation of the database.
   196  func (db *DB) String() string {
   197  	return fmt.Sprintf("DB<%q>", db.path)
   198  }
   199  
   200  // Open creates and opens an In-Memory database at the given path.
   201  // If the file does not exist then it will be created automatically.
   202  // Passing in nil options will cause Bhojpur Cache to open the database
   203  // with the default options.
   204  func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
   205  	db := &DB{
   206  		opened: true,
   207  	}
   208  	// Set default options if no options are provided.
   209  	if options == nil {
   210  		options = DefaultOptions
   211  	}
   212  	db.NoSync = options.NoSync
   213  	db.NoGrowSync = options.NoGrowSync
   214  	db.MmapFlags = options.MmapFlags
   215  	db.NoFreelistSync = options.NoFreelistSync
   216  	db.FreelistType = options.FreelistType
   217  	db.Mlock = options.Mlock
   218  
   219  	// Set default values for later DB operations.
   220  	db.MaxBatchSize = DefaultMaxBatchSize
   221  	db.MaxBatchDelay = DefaultMaxBatchDelay
   222  	db.AllocSize = DefaultAllocSize
   223  
   224  	flag := os.O_RDWR
   225  	if options.ReadOnly {
   226  		flag = os.O_RDONLY
   227  		db.readOnly = true
   228  	}
   229  
   230  	db.openFile = options.OpenFile
   231  	if db.openFile == nil {
   232  		db.openFile = os.OpenFile
   233  	}
   234  
   235  	// Open data file and separate sync handler for metadata writes.
   236  	var err error
   237  	if db.file, err = db.openFile(path, flag|os.O_CREATE, mode); err != nil {
   238  		_ = db.close()
   239  		return nil, err
   240  	}
   241  	db.path = db.file.Name()
   242  
   243  	// Lock file so that other processes using Bhojpur Cache in read-write mode cannot
   244  	// use the database  at the same time. This would cause corruption since
   245  	// the two processes would write meta pages and free pages separately.
   246  	// The database file is locked exclusively (only one process can grab the lock)
   247  	// if !options.ReadOnly.
   248  	// The database file is locked using the shared lock (more than one process may
   249  	// hold a lock at the same time) otherwise (options.ReadOnly is set).
   250  	if err := flock(db, !db.readOnly, options.Timeout); err != nil {
   251  		_ = db.close()
   252  		return nil, err
   253  	}
   254  
   255  	// Default values for test hooks
   256  	db.ops.writeAt = db.file.WriteAt
   257  
   258  	if db.pageSize = options.PageSize; db.pageSize == 0 {
   259  		// Set the default page size to the OS page size.
   260  		db.pageSize = defaultPageSize
   261  	}
   262  
   263  	// Initialize the database if it doesn't exist.
   264  	if info, err := db.file.Stat(); err != nil {
   265  		_ = db.close()
   266  		return nil, err
   267  	} else if info.Size() == 0 {
   268  		// Initialize new files with meta pages.
   269  		if err := db.init(); err != nil {
   270  			// clean up file descriptor on initialization fail
   271  			_ = db.close()
   272  			return nil, err
   273  		}
   274  	} else {
   275  		// Read the first meta page to determine the page size.
   276  		var buf [0x1000]byte
   277  		// If we can't read the page size, but can read a page, assume
   278  		// it's the same as the OS or one given -- since that's how the
   279  		// page size was chosen in the first place.
   280  		//
   281  		// If the first page is invalid and this OS uses a different
   282  		// page size than what the database was created with then we
   283  		// are out of luck and cannot access the database.
   284  		//
   285  		// TODO: scan for next page
   286  		if bw, err := db.file.ReadAt(buf[:], 0); err == nil && bw == len(buf) {
   287  			if m := db.pageInBuffer(buf[:], 0).meta(); m.validate() == nil {
   288  				db.pageSize = int(m.pageSize)
   289  			}
   290  		} else {
   291  			_ = db.close()
   292  			return nil, ErrInvalid
   293  		}
   294  	}
   295  
   296  	// Initialize page pool.
   297  	db.pagePool = sync.Pool{
   298  		New: func() interface{} {
   299  			return make([]byte, db.pageSize)
   300  		},
   301  	}
   302  
   303  	// Memory map the data file.
   304  	if err := db.mmap(options.InitialMmapSize); err != nil {
   305  		_ = db.close()
   306  		return nil, err
   307  	}
   308  
   309  	if db.readOnly {
   310  		return db, nil
   311  	}
   312  
   313  	db.loadFreelist()
   314  
   315  	// Flush freelist when transitioning from no sync to sync so
   316  	// NoFreelistSync unaware In-Memory db can open the db later.
   317  	if !db.NoFreelistSync && !db.hasSyncedFreelist() {
   318  		tx, err := db.Begin(true)
   319  		if tx != nil {
   320  			err = tx.Commit()
   321  		}
   322  		if err != nil {
   323  			_ = db.close()
   324  			return nil, err
   325  		}
   326  	}
   327  
   328  	// Mark the database as opened and return.
   329  	return db, nil
   330  }
   331  
   332  // loadFreelist reads the freelist if it is synced, or reconstructs it
   333  // by scanning the DB if it is not synced. It assumes there are no
   334  // concurrent accesses being made to the freelist.
   335  func (db *DB) loadFreelist() {
   336  	db.freelistLoad.Do(func() {
   337  		db.freelist = newFreelist(db.FreelistType)
   338  		if !db.hasSyncedFreelist() {
   339  			// Reconstruct free list by scanning the DB.
   340  			db.freelist.readIDs(db.freepages())
   341  		} else {
   342  			// Read free list from freelist page.
   343  			db.freelist.read(db.page(db.meta().freelist))
   344  		}
   345  		db.stats.FreePageN = db.freelist.free_count()
   346  	})
   347  }
   348  
   349  func (db *DB) hasSyncedFreelist() bool {
   350  	return db.meta().freelist != pgidNoFreelist
   351  }
   352  
   353  // mmap opens the underlying memory-mapped file and initializes the meta references.
   354  // minsz is the minimum size that the new mmap can be.
   355  func (db *DB) mmap(minsz int) error {
   356  	db.mmaplock.Lock()
   357  	defer db.mmaplock.Unlock()
   358  
   359  	info, err := db.file.Stat()
   360  	if err != nil {
   361  		return fmt.Errorf("mmap stat error: %s", err)
   362  	} else if int(info.Size()) < db.pageSize*2 {
   363  		return fmt.Errorf("file size too small")
   364  	}
   365  
   366  	// Ensure the size is at least the minimum size.
   367  	fileSize := int(info.Size())
   368  	var size = fileSize
   369  	if size < minsz {
   370  		size = minsz
   371  	}
   372  	size, err = db.mmapSize(size)
   373  	if err != nil {
   374  		return err
   375  	}
   376  
   377  	if db.Mlock {
   378  		// Unlock db memory
   379  		if err := db.munlock(fileSize); err != nil {
   380  			return err
   381  		}
   382  	}
   383  
   384  	// Dereference all mmap references before unmapping.
   385  	if db.rwtx != nil {
   386  		db.rwtx.root.dereference()
   387  	}
   388  
   389  	// Unmap existing data before continuing.
   390  	if err := db.munmap(); err != nil {
   391  		return err
   392  	}
   393  
   394  	// Memory-map the data file as a byte slice.
   395  	if err := mmap(db, size); err != nil {
   396  		return err
   397  	}
   398  
   399  	if db.Mlock {
   400  		// Don't allow swapping of data file
   401  		if err := db.mlock(fileSize); err != nil {
   402  			return err
   403  		}
   404  	}
   405  
   406  	// Save references to the meta pages.
   407  	db.meta0 = db.page(0).meta()
   408  	db.meta1 = db.page(1).meta()
   409  
   410  	// Validate the meta pages. We only return an error if both meta pages fail
   411  	// validation, since meta0 failing validation means that it wasn't saved
   412  	// properly -- but we can recover using meta1. And vice-versa.
   413  	err0 := db.meta0.validate()
   414  	err1 := db.meta1.validate()
   415  	if err0 != nil && err1 != nil {
   416  		return err0
   417  	}
   418  
   419  	return nil
   420  }
   421  
   422  // munmap unmaps the data file from memory.
   423  func (db *DB) munmap() error {
   424  	if err := munmap(db); err != nil {
   425  		return fmt.Errorf("unmap error: " + err.Error())
   426  	}
   427  	return nil
   428  }
   429  
   430  // mmapSize determines the appropriate size for the mmap given the current size
   431  // of the database. The minimum size is 32KB and doubles until it reaches 1GB.
   432  // Returns an error if the new mmap size is greater than the max allowed.
   433  func (db *DB) mmapSize(size int) (int, error) {
   434  	// Double the size from 32KB until 1GB.
   435  	for i := uint(15); i <= 30; i++ {
   436  		if size <= 1<<i {
   437  			return 1 << i, nil
   438  		}
   439  	}
   440  
   441  	// Verify the requested size is not above the maximum allowed.
   442  	if size > maxMapSize {
   443  		return 0, fmt.Errorf("mmap too large")
   444  	}
   445  
   446  	// If larger than 1GB then grow by 1GB at a time.
   447  	sz := int64(size)
   448  	if remainder := sz % int64(maxMmapStep); remainder > 0 {
   449  		sz += int64(maxMmapStep) - remainder
   450  	}
   451  
   452  	// Ensure that the mmap size is a multiple of the page size.
   453  	// This should always be true since we're incrementing in MBs.
   454  	pageSize := int64(db.pageSize)
   455  	if (sz % pageSize) != 0 {
   456  		sz = ((sz / pageSize) + 1) * pageSize
   457  	}
   458  
   459  	// If we've exceeded the max size then only grow up to the max size.
   460  	if sz > maxMapSize {
   461  		sz = maxMapSize
   462  	}
   463  
   464  	return int(sz), nil
   465  }
   466  
   467  func (db *DB) munlock(fileSize int) error {
   468  	if err := munlock(db, fileSize); err != nil {
   469  		return fmt.Errorf("munlock error: " + err.Error())
   470  	}
   471  	return nil
   472  }
   473  
   474  func (db *DB) mlock(fileSize int) error {
   475  	if err := mlock(db, fileSize); err != nil {
   476  		return fmt.Errorf("mlock error: " + err.Error())
   477  	}
   478  	return nil
   479  }
   480  
   481  func (db *DB) mrelock(fileSizeFrom, fileSizeTo int) error {
   482  	if err := db.munlock(fileSizeFrom); err != nil {
   483  		return err
   484  	}
   485  	if err := db.mlock(fileSizeTo); err != nil {
   486  		return err
   487  	}
   488  	return nil
   489  }
   490  
   491  // init creates a new database file and initializes its meta pages.
   492  func (db *DB) init() error {
   493  	// Create two meta pages on a buffer.
   494  	buf := make([]byte, db.pageSize*4)
   495  	for i := 0; i < 2; i++ {
   496  		p := db.pageInBuffer(buf, pgid(i))
   497  		p.id = pgid(i)
   498  		p.flags = metaPageFlag
   499  
   500  		// Initialize the meta page.
   501  		m := p.meta()
   502  		m.magic = magic
   503  		m.version = version
   504  		m.pageSize = uint32(db.pageSize)
   505  		m.freelist = 2
   506  		m.root = bucket{root: 3}
   507  		m.pgid = 4
   508  		m.txid = txid(i)
   509  		m.checksum = m.sum64()
   510  	}
   511  
   512  	// Write an empty freelist at page 3.
   513  	p := db.pageInBuffer(buf, pgid(2))
   514  	p.id = pgid(2)
   515  	p.flags = freelistPageFlag
   516  	p.count = 0
   517  
   518  	// Write an empty leaf page at page 4.
   519  	p = db.pageInBuffer(buf, pgid(3))
   520  	p.id = pgid(3)
   521  	p.flags = leafPageFlag
   522  	p.count = 0
   523  
   524  	// Write the buffer to our data file.
   525  	if _, err := db.ops.writeAt(buf, 0); err != nil {
   526  		return err
   527  	}
   528  	if err := fdatasync(db); err != nil {
   529  		return err
   530  	}
   531  	db.filesz = len(buf)
   532  
   533  	return nil
   534  }
   535  
// Close releases all database resources.
// It will block waiting for any open transactions to finish
// before closing the database and returning.
//
// The writer lock, the meta lock and the exclusive mmap lock are acquired
// in that order and held until close() has returned; the deferred unlocks
// release them in reverse order.
func (db *DB) Close() error {
	// Writer lock first: beginRWTx takes it for the lifetime of a write
	// transaction.
	db.rwlock.Lock()
	defer db.rwlock.Unlock()

	db.metalock.Lock()
	defer db.metalock.Unlock()

	// Exclusive mmap lock: read transactions hold the read side of this
	// lock until removeTx releases it.
	db.mmaplock.Lock()
	defer db.mmaplock.Unlock()

	return db.close()
}
   551  
   552  func (db *DB) close() error {
   553  	if !db.opened {
   554  		return nil
   555  	}
   556  
   557  	db.opened = false
   558  
   559  	db.freelist = nil
   560  
   561  	// Clear ops.
   562  	db.ops.writeAt = nil
   563  
   564  	// Close the mmap.
   565  	if err := db.munmap(); err != nil {
   566  		return err
   567  	}
   568  
   569  	// Close file handles.
   570  	if db.file != nil {
   571  		// No need to unlock read-only file.
   572  		if !db.readOnly {
   573  			// Unlock the file.
   574  			if err := funlock(db); err != nil {
   575  				log.Printf("memcache.Close(): funlock error: %s", err)
   576  			}
   577  		}
   578  
   579  		// Close the file descriptor.
   580  		if err := db.file.Close(); err != nil {
   581  			return fmt.Errorf("db file close: %s", err)
   582  		}
   583  		db.file = nil
   584  	}
   585  
   586  	db.path = ""
   587  	return nil
   588  }
   589  
   590  // Begin starts a new transaction.
   591  // Multiple read-only transactions can be used concurrently but only one
   592  // write transaction can be used at a time. Starting multiple write transactions
   593  // will cause the calls to block and be serialized until the current write
   594  // transaction finishes.
   595  //
   596  // Transactions should not be dependent on one another. Opening a read
   597  // transaction and a write transaction in the same goroutine can cause the
   598  // writer to deadlock because the database periodically needs to re-mmap itself
   599  // as it grows and it cannot do that while a read transaction is open.
   600  //
   601  // If a long running read transaction (for example, a snapshot transaction) is
   602  // needed, you might want to set DB.InitialMmapSize to a large enough value
   603  // to avoid potential blocking of write transaction.
   604  //
   605  // IMPORTANT: You must close read-only transactions after you are finished or
   606  // else the database will not reclaim old pages.
   607  func (db *DB) Begin(writable bool) (*Tx, error) {
   608  	if writable {
   609  		return db.beginRWTx()
   610  	}
   611  	return db.beginTx()
   612  }
   613  
   614  func (db *DB) beginTx() (*Tx, error) {
   615  	// Lock the meta pages while we initialize the transaction. We obtain
   616  	// the meta lock before the mmap lock because that's the order that the
   617  	// write transaction will obtain them.
   618  	db.metalock.Lock()
   619  
   620  	// Obtain a read-only lock on the mmap. When the mmap is remapped it will
   621  	// obtain a write lock so all transactions must finish before it can be
   622  	// remapped.
   623  	db.mmaplock.RLock()
   624  
   625  	// Exit if the database is not open yet.
   626  	if !db.opened {
   627  		db.mmaplock.RUnlock()
   628  		db.metalock.Unlock()
   629  		return nil, ErrDatabaseNotOpen
   630  	}
   631  
   632  	// Create a transaction associated with the database.
   633  	t := &Tx{}
   634  	t.init(db)
   635  
   636  	// Keep track of transaction until it closes.
   637  	db.txs = append(db.txs, t)
   638  	n := len(db.txs)
   639  
   640  	// Unlock the meta pages.
   641  	db.metalock.Unlock()
   642  
   643  	// Update the transaction stats.
   644  	db.statlock.Lock()
   645  	db.stats.TxN++
   646  	db.stats.OpenTxN = n
   647  	db.statlock.Unlock()
   648  
   649  	return t, nil
   650  }
   651  
   652  func (db *DB) beginRWTx() (*Tx, error) {
   653  	// If the database was opened with Options.ReadOnly, return an error.
   654  	if db.readOnly {
   655  		return nil, ErrDatabaseReadOnly
   656  	}
   657  
   658  	// Obtain writer lock. This is released by the transaction when it closes.
   659  	// This enforces only one writer transaction at a time.
   660  	db.rwlock.Lock()
   661  
   662  	// Once we have the writer lock then we can lock the meta pages so that
   663  	// we can set up the transaction.
   664  	db.metalock.Lock()
   665  	defer db.metalock.Unlock()
   666  
   667  	// Exit if the database is not open yet.
   668  	if !db.opened {
   669  		db.rwlock.Unlock()
   670  		return nil, ErrDatabaseNotOpen
   671  	}
   672  
   673  	// Create a transaction associated with the database.
   674  	t := &Tx{writable: true}
   675  	t.init(db)
   676  	db.rwtx = t
   677  	db.freePages()
   678  	return t, nil
   679  }
   680  
   681  // freePages releases any pages associated with closed read-only transactions.
   682  func (db *DB) freePages() {
   683  	// Free all pending pages prior to earliest open transaction.
   684  	sort.Sort(txsById(db.txs))
   685  	minid := txid(0xFFFFFFFFFFFFFFFF)
   686  	if len(db.txs) > 0 {
   687  		minid = db.txs[0].meta.txid
   688  	}
   689  	if minid > 0 {
   690  		db.freelist.release(minid - 1)
   691  	}
   692  	// Release unused txid extents.
   693  	for _, t := range db.txs {
   694  		db.freelist.releaseRange(minid, t.meta.txid-1)
   695  		minid = t.meta.txid + 1
   696  	}
   697  	db.freelist.releaseRange(minid, txid(0xFFFFFFFFFFFFFFFF))
   698  	// Any page both allocated and freed in an extent is safe to release.
   699  }
   700  
   701  type txsById []*Tx
   702  
   703  func (t txsById) Len() int           { return len(t) }
   704  func (t txsById) Swap(i, j int)      { t[i], t[j] = t[j], t[i] }
   705  func (t txsById) Less(i, j int) bool { return t[i].meta.txid < t[j].meta.txid }
   706  
   707  // removeTx removes a transaction from the database.
   708  func (db *DB) removeTx(tx *Tx) {
   709  	// Release the read lock on the mmap.
   710  	db.mmaplock.RUnlock()
   711  
   712  	// Use the meta lock to restrict access to the DB object.
   713  	db.metalock.Lock()
   714  
   715  	// Remove the transaction.
   716  	for i, t := range db.txs {
   717  		if t == tx {
   718  			last := len(db.txs) - 1
   719  			db.txs[i] = db.txs[last]
   720  			db.txs[last] = nil
   721  			db.txs = db.txs[:last]
   722  			break
   723  		}
   724  	}
   725  	n := len(db.txs)
   726  
   727  	// Unlock the meta pages.
   728  	db.metalock.Unlock()
   729  
   730  	// Merge statistics.
   731  	db.statlock.Lock()
   732  	db.stats.OpenTxN = n
   733  	db.stats.TxStats.add(&tx.stats)
   734  	db.statlock.Unlock()
   735  }
   736  
   737  // Update executes a function within the context of a read-write managed transaction.
   738  // If no error is returned from the function then the transaction is committed.
   739  // If an error is returned then the entire transaction is rolled back.
   740  // Any error that is returned from the function or returned from the commit is
   741  // returned from the Update() method.
   742  //
   743  // Attempting to manually commit or rollback within the function will cause a panic.
   744  func (db *DB) Update(fn func(*Tx) error) error {
   745  	t, err := db.Begin(true)
   746  	if err != nil {
   747  		return err
   748  	}
   749  
   750  	// Make sure the transaction rolls back in the event of a panic.
   751  	defer func() {
   752  		if t.db != nil {
   753  			t.rollback()
   754  		}
   755  	}()
   756  
   757  	// Mark as a managed tx so that the inner function cannot manually commit.
   758  	t.managed = true
   759  
   760  	// If an error is returned from the function then rollback and return error.
   761  	err = fn(t)
   762  	t.managed = false
   763  	if err != nil {
   764  		_ = t.Rollback()
   765  		return err
   766  	}
   767  
   768  	return t.Commit()
   769  }
   770  
   771  // View executes a function within the context of a managed read-only transaction.
   772  // Any error that is returned from the function is returned from the View() method.
   773  //
   774  // Attempting to manually rollback within the function will cause a panic.
   775  func (db *DB) View(fn func(*Tx) error) error {
   776  	t, err := db.Begin(false)
   777  	if err != nil {
   778  		return err
   779  	}
   780  
   781  	// Make sure the transaction rolls back in the event of a panic.
   782  	defer func() {
   783  		if t.db != nil {
   784  			t.rollback()
   785  		}
   786  	}()
   787  
   788  	// Mark as a managed tx so that the inner function cannot manually rollback.
   789  	t.managed = true
   790  
   791  	// If an error is returned from the function then pass it through.
   792  	err = fn(t)
   793  	t.managed = false
   794  	if err != nil {
   795  		_ = t.Rollback()
   796  		return err
   797  	}
   798  
   799  	return t.Rollback()
   800  }
   801  
   802  // Batch calls fn as part of a batch. It behaves similar to Update,
   803  // except:
   804  //
   805  // 1. concurrent Batch calls can be combined into a single Bhojpur Cache
   806  // transaction.
   807  //
   808  // 2. the function passed to Batch may be called multiple times,
   809  // regardless of whether it returns error or not.
   810  //
   811  // This means that Batch function side effects must be idempotent and
   812  // take permanent effect only after a successful return is seen in
   813  // caller.
   814  //
   815  // The maximum batch size and delay can be adjusted with DB.MaxBatchSize
   816  // and DB.MaxBatchDelay, respectively.
   817  //
   818  // Batch is only useful when there are multiple goroutines calling it.
   819  func (db *DB) Batch(fn func(*Tx) error) error {
   820  	errCh := make(chan error, 1)
   821  
   822  	db.batchMu.Lock()
   823  	if (db.batch == nil) || (db.batch != nil && len(db.batch.calls) >= db.MaxBatchSize) {
   824  		// There is no existing batch, or the existing batch is full; start a new one.
   825  		db.batch = &batch{
   826  			db: db,
   827  		}
   828  		db.batch.timer = time.AfterFunc(db.MaxBatchDelay, db.batch.trigger)
   829  	}
   830  	db.batch.calls = append(db.batch.calls, call{fn: fn, err: errCh})
   831  	if len(db.batch.calls) >= db.MaxBatchSize {
   832  		// wake up batch, it's ready to run
   833  		go db.batch.trigger()
   834  	}
   835  	db.batchMu.Unlock()
   836  
   837  	err := <-errCh
   838  	if err == trySolo {
   839  		err = db.Update(fn)
   840  	}
   841  	return err
   842  }
   843  
// call is one function submitted through DB.Batch together with the
// channel on which its result is delivered.
type call struct {
	fn  func(*Tx) error // function to run inside the batch transaction
	err chan<- error    // receives the outcome for the submitting Batch call
}
   848  
// batch collects the calls submitted through DB.Batch so that they can be
// executed together in a single Update transaction.
type batch struct {
	db    *DB         // database the batch runs against
	timer *time.Timer // fires trigger after DB.MaxBatchDelay
	start sync.Once   // ensures run executes at most once
	calls []call      // functions queued in this batch
}
   855  
   856  // trigger runs the batch if it hasn't already been run.
   857  func (b *batch) trigger() {
   858  	b.start.Do(b.run)
   859  }
   860  
   861  // run performs the transactions in the batch and communicates results
   862  // back to DB.Batch.
   863  func (b *batch) run() {
   864  	b.db.batchMu.Lock()
   865  	b.timer.Stop()
   866  	// Make sure no new work is added to this batch, but don't break
   867  	// other batches.
   868  	if b.db.batch == b {
   869  		b.db.batch = nil
   870  	}
   871  	b.db.batchMu.Unlock()
   872  
   873  retry:
   874  	for len(b.calls) > 0 {
   875  		var failIdx = -1
   876  		err := b.db.Update(func(tx *Tx) error {
   877  			for i, c := range b.calls {
   878  				if err := safelyCall(c.fn, tx); err != nil {
   879  					failIdx = i
   880  					return err
   881  				}
   882  			}
   883  			return nil
   884  		})
   885  
   886  		if failIdx >= 0 {
   887  			// take the failing transaction out of the batch. it's
   888  			// safe to shorten b.calls here because db.batch no longer
   889  			// points to us, and we hold the mutex anyway.
   890  			c := b.calls[failIdx]
   891  			b.calls[failIdx], b.calls = b.calls[len(b.calls)-1], b.calls[:len(b.calls)-1]
   892  			// tell the submitter re-run it solo, continue with the rest of the batch
   893  			c.err <- trySolo
   894  			continue retry
   895  		}
   896  
   897  		// pass success, or Bhojpur Cache in-memory storage engine internal
   898  		// errors, to all callers
   899  		for _, c := range b.calls {
   900  			c.err <- err
   901  		}
   902  		break retry
   903  	}
   904  }
   905  
// trySolo is a special sentinel error value used for signaling that a
// transaction function should be re-run. A batch sends it to the submitter
// of a function that failed inside the batch. It should never be seen by
// callers.
var trySolo = errors.New("batch function returned an error and should be re-run solo")
   910  
   911  type panicked struct {
   912  	reason interface{}
   913  }
   914  
   915  func (p panicked) Error() string {
   916  	if err, ok := p.reason.(error); ok {
   917  		return err.Error()
   918  	}
   919  	return fmt.Sprintf("panic: %v", p.reason)
   920  }
   921  
   922  func safelyCall(fn func(*Tx) error, tx *Tx) (err error) {
   923  	defer func() {
   924  		if p := recover(); p != nil {
   925  			err = panicked{p}
   926  		}
   927  	}()
   928  	return fn(tx)
   929  }
   930  
// Sync executes fdatasync() against the database file handle and returns
// whatever error that call reports.
//
// This is not necessary under normal operation; however, if you use NoSync
// then it allows you to force the database file to sync against the disk.
func (db *DB) Sync() error { return fdatasync(db) }
   936  
   937  // Stats retrieves ongoing performance stats for the database.
   938  // This is only updated when a transaction closes.
   939  func (db *DB) Stats() Stats {
   940  	db.statlock.RLock()
   941  	defer db.statlock.RUnlock()
   942  	return db.stats
   943  }
   944  
   945  // This is for internal access to the raw data bytes from the C cursor, use
   946  // carefully, or not at all.
   947  func (db *DB) Info() *Info {
   948  	return &Info{uintptr(unsafe.Pointer(&db.data[0])), db.pageSize}
   949  }
   950  
   951  // page retrieves a page reference from the mmap based on the current page size.
   952  func (db *DB) page(id pgid) *page {
   953  	pos := id * pgid(db.pageSize)
   954  	return (*page)(unsafe.Pointer(&db.data[pos]))
   955  }
   956  
   957  // pageInBuffer retrieves a page reference from a given byte array based on the current page size.
   958  func (db *DB) pageInBuffer(b []byte, id pgid) *page {
   959  	return (*page)(unsafe.Pointer(&b[id*pgid(db.pageSize)]))
   960  }
   961  
   962  // meta retrieves the current meta page reference.
   963  func (db *DB) meta() *meta {
   964  	// We have to return the meta with the highest txid which doesn't fail
   965  	// validation. Otherwise, we can cause errors when in fact the database is
   966  	// in a consistent state. metaA is the one with the higher txid.
   967  	metaA := db.meta0
   968  	metaB := db.meta1
   969  	if db.meta1.txid > db.meta0.txid {
   970  		metaA = db.meta1
   971  		metaB = db.meta0
   972  	}
   973  
   974  	// Use higher meta page if valid. Otherwise fallback to previous, if valid.
   975  	if err := metaA.validate(); err == nil {
   976  		return metaA
   977  	} else if err := metaB.validate(); err == nil {
   978  		return metaB
   979  	}
   980  
   981  	// This should never be reached, because both meta1 and meta0 were validated
   982  	// on mmap() and we do fsync() on every write.
   983  	panic("memcache.DB.meta(): invalid meta pages")
   984  }
   985  
   986  // allocate returns a contiguous block of memory starting at a given page.
   987  func (db *DB) allocate(txid txid, count int) (*page, error) {
   988  	// Allocate a temporary buffer for the page.
   989  	var buf []byte
   990  	if count == 1 {
   991  		buf = db.pagePool.Get().([]byte)
   992  	} else {
   993  		buf = make([]byte, count*db.pageSize)
   994  	}
   995  	p := (*page)(unsafe.Pointer(&buf[0]))
   996  	p.overflow = uint32(count - 1)
   997  
   998  	// Use pages from the freelist if they are available.
   999  	if p.id = db.freelist.allocate(txid, count); p.id != 0 {
  1000  		return p, nil
  1001  	}
  1002  
  1003  	// Resize mmap() if we're at the end.
  1004  	p.id = db.rwtx.meta.pgid
  1005  	var minsz = int((p.id+pgid(count))+1) * db.pageSize
  1006  	if minsz >= db.datasz {
  1007  		if err := db.mmap(minsz); err != nil {
  1008  			return nil, fmt.Errorf("mmap allocate error: %s", err)
  1009  		}
  1010  	}
  1011  
  1012  	// Move the page id high water mark.
  1013  	db.rwtx.meta.pgid += pgid(count)
  1014  
  1015  	return p, nil
  1016  }
  1017  
  1018  // grow grows the size of the database to the given sz.
  1019  func (db *DB) grow(sz int) error {
  1020  	// Ignore if the new size is less than available file size.
  1021  	if sz <= db.filesz {
  1022  		return nil
  1023  	}
  1024  
  1025  	// If the data is smaller than the alloc size then only allocate what's needed.
  1026  	// Once it goes over the allocation size then allocate in chunks.
  1027  	if db.datasz < db.AllocSize {
  1028  		sz = db.datasz
  1029  	} else {
  1030  		sz += db.AllocSize
  1031  	}
  1032  
  1033  	// Truncate and fsync to ensure file size metadata is flushed.
  1034  	if !db.NoGrowSync && !db.readOnly {
  1035  		if runtime.GOOS != "windows" {
  1036  			if err := db.file.Truncate(int64(sz)); err != nil {
  1037  				return fmt.Errorf("file resize error: %s", err)
  1038  			}
  1039  		}
  1040  		if err := db.file.Sync(); err != nil {
  1041  			return fmt.Errorf("file sync error: %s", err)
  1042  		}
  1043  		if db.Mlock {
  1044  			// unlock old file and lock new one
  1045  			if err := db.mrelock(db.filesz, sz); err != nil {
  1046  				return fmt.Errorf("mlock/munlock error: %s", err)
  1047  			}
  1048  		}
  1049  	}
  1050  
  1051  	db.filesz = sz
  1052  	return nil
  1053  }
  1054  
// IsReadOnly reports the value of the database's readOnly flag.
func (db *DB) IsReadOnly() bool {
	return db.readOnly
}
  1058  
  1059  func (db *DB) freepages() []pgid {
  1060  	tx, err := db.beginTx()
  1061  	defer func() {
  1062  		err = tx.Rollback()
  1063  		if err != nil {
  1064  			panic("freepages: failed to rollback tx")
  1065  		}
  1066  	}()
  1067  	if err != nil {
  1068  		panic("freepages: failed to open read only tx")
  1069  	}
  1070  
  1071  	reachable := make(map[pgid]*page)
  1072  	nofreed := make(map[pgid]bool)
  1073  	ech := make(chan error)
  1074  	go func() {
  1075  		for e := range ech {
  1076  			panic(fmt.Sprintf("freepages: failed to get all reachable pages (%v)", e))
  1077  		}
  1078  	}()
  1079  	tx.checkBucket(&tx.root, reachable, nofreed, ech)
  1080  	close(ech)
  1081  
  1082  	var fids []pgid
  1083  	for i := pgid(2); i < db.meta().pgid; i++ {
  1084  		if _, ok := reachable[i]; !ok {
  1085  			fids = append(fids, i)
  1086  		}
  1087  	}
  1088  	return fids
  1089  }
  1090  
// Options represents the options that can be set when opening a database.
type Options struct {
	// Timeout is the amount of time to wait to obtain a file lock.
	// When set to zero it will wait indefinitely. This option is only
	// available on Darwin and Linux.
	Timeout time.Duration

	// Sets the DB.NoGrowSync flag before memory mapping the file.
	NoGrowSync bool

	// Do not sync freelist to disk. This improves the database write performance
	// under normal operation, but requires a full database re-sync during recovery.
	NoFreelistSync bool

	// FreelistType sets the backend freelist type. There are two options.
	// Array is simple, but suffers dramatic performance degradation if the
	// database is large and fragmentation in the freelist is common.
	// The alternative is hashmap, which is faster in almost all circumstances
	// but does not guarantee that it offers the smallest page id available.
	// In the normal case it is safe.
	// The default type is array.
	FreelistType FreelistType

	// Open database in read-only mode. Uses flock(..., LOCK_SH |LOCK_NB) to
	// grab a shared lock (UNIX).
	ReadOnly bool

	// Sets the DB.MmapFlags flag before memory mapping the file.
	MmapFlags int

	// InitialMmapSize is the initial mmap size of the database
	// in bytes. Read transactions won't block write transactions
	// if the InitialMmapSize is large enough to hold the database mmap
	// size. (See DB.Begin for more information)
	//
	// If <=0, the initial map size is 0.
	// If InitialMmapSize is smaller than the previous database size,
	// it has no effect.
	InitialMmapSize int

	// PageSize overrides the default OS page size.
	PageSize int

	// NoSync sets the initial value of DB.NoSync. Normally this can just be
	// set directly on the DB itself when returned from Open(), but this option
	// is useful in APIs which expose Options but not the underlying DB.
	NoSync bool

	// OpenFile is used to open files. It defaults to os.OpenFile. This option
	// is useful for writing hermetic tests.
	OpenFile func(string, int, os.FileMode) (*os.File, error)

	// Mlock locks the database file in memory when set to true.
	// It prevents potential page faults; however, the
	// used memory can't be reclaimed. (UNIX only)
	Mlock bool
}
  1146  
// DefaultOptions represents the options used if nil options are passed into Open().
// No timeout is used, which will cause Bhojpur Cache to wait indefinitely for a lock.
// Fields not listed here keep their zero values.
var DefaultOptions = &Options{
	Timeout:      0,
	NoGrowSync:   false,
	FreelistType: FreelistArrayType,
}
  1154  
// Stats represents statistics about the database.
// The freelist fields and OpenTxN describe the state at the time the stats
// were taken, while TxN and TxStats are running totals (see Sub).
type Stats struct {
	// Freelist stats
	FreePageN     int // total number of free pages on the freelist
	PendingPageN  int // total number of pending pages on the freelist
	FreeAlloc     int // total bytes allocated in free pages
	FreelistInuse int // total bytes used by the freelist

	// Transaction stats
	TxN     int // total number of started read transactions
	OpenTxN int // number of currently open read transactions

	TxStats TxStats // global, ongoing stats.
}
  1169  
  1170  // Sub calculates and returns the difference between two sets of database stats.
  1171  // This is useful when obtaining stats at two different points and time and
  1172  // you need the performance counters that occurred within that time span.
  1173  func (s *Stats) Sub(other *Stats) Stats {
  1174  	if other == nil {
  1175  		return *s
  1176  	}
  1177  	var diff Stats
  1178  	diff.FreePageN = s.FreePageN
  1179  	diff.PendingPageN = s.PendingPageN
  1180  	diff.FreeAlloc = s.FreeAlloc
  1181  	diff.FreelistInuse = s.FreelistInuse
  1182  	diff.TxN = s.TxN - other.TxN
  1183  	diff.TxStats = s.TxStats.Sub(&other.TxStats)
  1184  	return diff
  1185  }
  1186  
// Info describes the raw data region returned by DB.Info: Data is the
// address of the first byte of the database's data and PageSize is the
// database page size.
type Info struct {
	Data     uintptr
	PageSize int
}
  1191  
// meta holds the contents of a meta page.
//
// magic and version are compared against the package constants by validate.
// root.root and freelist must stay below pgid (the high water mark) when the
// meta is written, except that freelist may equal pgidNoFreelist. checksum
// is computed by sum64 over every byte of the struct that precedes the
// checksum field, so the field order here is significant.
type meta struct {
	magic    uint32
	version  uint32
	pageSize uint32
	flags    uint32
	root     bucket
	freelist pgid
	pgid     pgid
	txid     txid
	checksum uint64
}
  1203  
  1204  // validate checks the marker bytes and version of the meta page to ensure it matches this binary.
  1205  func (m *meta) validate() error {
  1206  	if m.magic != magic {
  1207  		return ErrInvalid
  1208  	} else if m.version != version {
  1209  		return ErrVersionMismatch
  1210  	} else if m.checksum != 0 && m.checksum != m.sum64() {
  1211  		return ErrChecksum
  1212  	}
  1213  	return nil
  1214  }
  1215  
// copy copies one meta object to another by assigning every field of m to
// dest.
func (m *meta) copy(dest *meta) {
	*dest = *m
}
  1220  
  1221  // write writes the meta onto a page.
  1222  func (m *meta) write(p *page) {
  1223  	if m.root.root >= m.pgid {
  1224  		panic(fmt.Sprintf("root bucket pgid (%d) above high water mark (%d)", m.root.root, m.pgid))
  1225  	} else if m.freelist >= m.pgid && m.freelist != pgidNoFreelist {
  1226  		// TODO: reject pgidNoFreeList if !NoFreelistSync
  1227  		panic(fmt.Sprintf("freelist pgid (%d) above high water mark (%d)", m.freelist, m.pgid))
  1228  	}
  1229  
  1230  	// Page id is either going to be 0 or 1 which we can determine by the transaction ID.
  1231  	p.id = pgid(m.txid % 2)
  1232  	p.flags |= metaPageFlag
  1233  
  1234  	// Calculate the checksum.
  1235  	m.checksum = m.sum64()
  1236  
  1237  	m.copy(p.meta())
  1238  }
  1239  
  1240  // generates the checksum for the meta.
  1241  func (m *meta) sum64() uint64 {
  1242  	var h = fnv.New64a()
  1243  	_, _ = h.Write((*[unsafe.Offsetof(meta{}.checksum)]byte)(unsafe.Pointer(m))[:])
  1244  	return h.Sum64()
  1245  }
  1246  
  1247  // _assert will panic with a given formatted message if the given condition is false.
  1248  func _assert(condition bool, msg string, v ...interface{}) {
  1249  	if !condition {
  1250  		panic(fmt.Sprintf("assertion failed: "+msg, v...))
  1251  	}
  1252  }